Merge branch 'hfi1-2' into k.o/for-4.7
author Doug Ledford <dledford@redhat.com>
Thu, 26 May 2016 16:50:05 +0000 (12:50 -0400)
committer Doug Ledford <dledford@redhat.com>
Thu, 26 May 2016 16:50:05 +0000 (12:50 -0400)
138 files changed:
MAINTAINERS
drivers/infiniband/Kconfig
drivers/infiniband/hw/Makefile
drivers/infiniband/hw/hfi1/Kconfig [new file with mode: 0644]
drivers/infiniband/hw/hfi1/Makefile [new file with mode: 0644]
drivers/infiniband/hw/hfi1/affinity.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/affinity.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/aspm.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/chip.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/chip.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/chip_registers.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/common.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/debugfs.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/debugfs.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/device.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/device.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/dma.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/driver.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/efivar.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/efivar.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/eprom.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/eprom.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/file_ops.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/firmware.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/hfi.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/init.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/intr.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/iowait.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/mad.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/mad.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/mmu_rb.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/mmu_rb.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/opa_compat.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/pcie.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/pio.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/pio.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/pio_copy.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/platform.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/platform.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/qp.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/qp.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/qsfp.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/qsfp.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/rc.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/ruc.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/sdma.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/sdma.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/sdma_txreq.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/sysfs.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/trace.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/trace.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/twsi.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/twsi.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/uc.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/ud.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/user_exp_rcv.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/user_exp_rcv.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/user_pages.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/user_sdma.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/user_sdma.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/verbs.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/verbs.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/verbs_txreq.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/verbs_txreq.h [new file with mode: 0644]
drivers/infiniband/hw/qib/qib_iba7322.c
drivers/infiniband/hw/qib/qib_verbs.h
drivers/infiniband/sw/rdmavt/cq.c
drivers/infiniband/sw/rdmavt/mr.c
drivers/infiniband/sw/rdmavt/qp.c
drivers/staging/rdma/Kconfig
drivers/staging/rdma/Makefile
drivers/staging/rdma/hfi1/Kconfig [deleted file]
drivers/staging/rdma/hfi1/Makefile [deleted file]
drivers/staging/rdma/hfi1/TODO [deleted file]
drivers/staging/rdma/hfi1/affinity.c [deleted file]
drivers/staging/rdma/hfi1/affinity.h [deleted file]
drivers/staging/rdma/hfi1/aspm.h [deleted file]
drivers/staging/rdma/hfi1/chip.c [deleted file]
drivers/staging/rdma/hfi1/chip.h [deleted file]
drivers/staging/rdma/hfi1/chip_registers.h [deleted file]
drivers/staging/rdma/hfi1/common.h [deleted file]
drivers/staging/rdma/hfi1/debugfs.c [deleted file]
drivers/staging/rdma/hfi1/debugfs.h [deleted file]
drivers/staging/rdma/hfi1/device.c [deleted file]
drivers/staging/rdma/hfi1/device.h [deleted file]
drivers/staging/rdma/hfi1/diag.c [deleted file]
drivers/staging/rdma/hfi1/dma.c [deleted file]
drivers/staging/rdma/hfi1/driver.c [deleted file]
drivers/staging/rdma/hfi1/efivar.c [deleted file]
drivers/staging/rdma/hfi1/efivar.h [deleted file]
drivers/staging/rdma/hfi1/eprom.c [deleted file]
drivers/staging/rdma/hfi1/eprom.h [deleted file]
drivers/staging/rdma/hfi1/file_ops.c [deleted file]
drivers/staging/rdma/hfi1/firmware.c [deleted file]
drivers/staging/rdma/hfi1/hfi.h [deleted file]
drivers/staging/rdma/hfi1/init.c [deleted file]
drivers/staging/rdma/hfi1/intr.c [deleted file]
drivers/staging/rdma/hfi1/iowait.h [deleted file]
drivers/staging/rdma/hfi1/mad.c [deleted file]
drivers/staging/rdma/hfi1/mad.h [deleted file]
drivers/staging/rdma/hfi1/mmu_rb.c [deleted file]
drivers/staging/rdma/hfi1/mmu_rb.h [deleted file]
drivers/staging/rdma/hfi1/opa_compat.h [deleted file]
drivers/staging/rdma/hfi1/pcie.c [deleted file]
drivers/staging/rdma/hfi1/pio.c [deleted file]
drivers/staging/rdma/hfi1/pio.h [deleted file]
drivers/staging/rdma/hfi1/pio_copy.c [deleted file]
drivers/staging/rdma/hfi1/platform.c [deleted file]
drivers/staging/rdma/hfi1/platform.h [deleted file]
drivers/staging/rdma/hfi1/qp.c [deleted file]
drivers/staging/rdma/hfi1/qp.h [deleted file]
drivers/staging/rdma/hfi1/qsfp.c [deleted file]
drivers/staging/rdma/hfi1/qsfp.h [deleted file]
drivers/staging/rdma/hfi1/rc.c [deleted file]
drivers/staging/rdma/hfi1/ruc.c [deleted file]
drivers/staging/rdma/hfi1/sdma.c [deleted file]
drivers/staging/rdma/hfi1/sdma.h [deleted file]
drivers/staging/rdma/hfi1/sdma_txreq.h [deleted file]
drivers/staging/rdma/hfi1/sysfs.c [deleted file]
drivers/staging/rdma/hfi1/trace.c [deleted file]
drivers/staging/rdma/hfi1/trace.h [deleted file]
drivers/staging/rdma/hfi1/twsi.c [deleted file]
drivers/staging/rdma/hfi1/twsi.h [deleted file]
drivers/staging/rdma/hfi1/uc.c [deleted file]
drivers/staging/rdma/hfi1/ud.c [deleted file]
drivers/staging/rdma/hfi1/user_exp_rcv.c [deleted file]
drivers/staging/rdma/hfi1/user_exp_rcv.h [deleted file]
drivers/staging/rdma/hfi1/user_pages.c [deleted file]
drivers/staging/rdma/hfi1/user_sdma.c [deleted file]
drivers/staging/rdma/hfi1/user_sdma.h [deleted file]
drivers/staging/rdma/hfi1/verbs.c [deleted file]
drivers/staging/rdma/hfi1/verbs.h [deleted file]
drivers/staging/rdma/hfi1/verbs_txreq.c [deleted file]
drivers/staging/rdma/hfi1/verbs_txreq.h [deleted file]
include/rdma/ib_pack.h
include/rdma/rdma_vt.h
include/rdma/rdmavt_qp.h
include/uapi/rdma/hfi/hfi1_user.h

index c802594..9823456 100644 (file)
@@ -5086,6 +5086,13 @@ F:       drivers/block/cciss*
 F:     include/linux/cciss_ioctl.h
 F:     include/uapi/linux/cciss_ioctl.h
 
+HFI1 DRIVER
+M:     Mike Marciniszyn <mike.marciniszyn@intel.com>
+M:     Dennis Dalessandro <dennis.dalessandro@intel.com>
+L:     linux-rdma@vger.kernel.org
+S:     Supported
+F:     drivers/infiniband/hw/hfi1
+
 HFS FILESYSTEM
 L:     linux-fsdevel@vger.kernel.org
 S:     Orphan
@@ -10661,12 +10668,6 @@ M:     Arnaud Patard <arnaud.patard@rtp-net.org>
 S:     Odd Fixes
 F:     drivers/staging/xgifb/
 
-HFI1 DRIVER
-M:     Mike Marciniszyn <infinipath@intel.com>
-L:     linux-rdma@vger.kernel.org
-S:     Supported
-F:     drivers/staging/rdma/hfi1
-
 STARFIRE/DURALAN NETWORK DRIVER
 M:     Ion Badulescu <ionut@badula.org>
 S:     Odd Fixes
index 6425c0e..2137adf 100644 (file)
@@ -85,4 +85,6 @@ source "drivers/infiniband/ulp/isert/Kconfig"
 
 source "drivers/infiniband/sw/rdmavt/Kconfig"
 
+source "drivers/infiniband/hw/hfi1/Kconfig"
+
 endif # INFINIBAND
index c7ad0a4..c0c7cf8 100644 (file)
@@ -8,3 +8,4 @@ obj-$(CONFIG_MLX5_INFINIBAND)           += mlx5/
 obj-$(CONFIG_INFINIBAND_NES)           += nes/
 obj-$(CONFIG_INFINIBAND_OCRDMA)                += ocrdma/
 obj-$(CONFIG_INFINIBAND_USNIC)         += usnic/
+obj-$(CONFIG_INFINIBAND_HFI1)          += hfi1/
diff --git a/drivers/infiniband/hw/hfi1/Kconfig b/drivers/infiniband/hw/hfi1/Kconfig
new file mode 100644 (file)
index 0000000..a925fb0
--- /dev/null
@@ -0,0 +1,29 @@
+config INFINIBAND_HFI1
+       tristate "Intel OPA Gen1 support"
+       depends on X86_64 && INFINIBAND_RDMAVT
+       select MMU_NOTIFIER
+       select CRC32
+       default m
+       ---help---
+       This is a low-level driver for Intel OPA Gen1 adapter.
+config HFI1_DEBUG_SDMA_ORDER
+       bool "HFI1 SDMA Order debug"
+       depends on INFINIBAND_HFI1
+       default n
+       ---help---
+       This is a debug flag to test for out of order
+       sdma completions for unit testing
+config HFI1_VERBS_31BIT_PSN
+       bool "HFI1 enable 31 bit PSN"
+       depends on INFINIBAND_HFI1
+       default y
+       ---help---
+       Setting this enables 31 BIT PSN
+       For verbs RC/UC
+config SDMA_VERBOSITY
+       bool "Config SDMA Verbosity"
+       depends on INFINIBAND_HFI1
+       default n
+       ---help---
+       This is a configuration flag to enable verbose
+       SDMA debug
diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile
new file mode 100644 (file)
index 0000000..9b5382c
--- /dev/null
@@ -0,0 +1,21 @@
+#
+# HFI driver
+#
+#
+#
+# Called from the kernel module build system.
+#
+obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o
+
+hfi1-y := affinity.o chip.o device.o driver.o efivar.o \
+       eprom.o file_ops.o firmware.o \
+       init.o intr.o mad.o mmu_rb.o pcie.o pio.o pio_copy.o platform.o \
+       qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o twsi.o \
+       uc.o ud.o user_exp_rcv.o user_pages.o user_sdma.o verbs.o \
+       verbs_txreq.o
+hfi1-$(CONFIG_DEBUG_FS) += debugfs.o
+
+CFLAGS_trace.o = -I$(src)
+ifdef MVERSION
+CFLAGS_driver.o = -DHFI_DRIVER_VERSION_BASE=\"$(MVERSION)\"
+endif
diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c
new file mode 100644 (file)
index 0000000..6e7050a
--- /dev/null
@@ -0,0 +1,431 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/topology.h>
+#include <linux/cpumask.h>
+#include <linux/module.h>
+
+#include "hfi.h"
+#include "affinity.h"
+#include "sdma.h"
+#include "trace.h"
+
+/* Printable label for each enum irq_type value, used in the IRQ log line */
+static const char * const irq_type_names[] = {
+       [IRQ_SDMA]    = "SDMA",
+       [IRQ_RCVCTXT] = "RCVCTXT",
+       [IRQ_GENERAL] = "GENERAL",
+       [IRQ_OTHER]   = "OTHER",
+};
+
+/* Reset a CPU mask set: generation back to zero, both cpumasks emptied */
+static inline void init_cpu_mask_set(struct cpu_mask_set *set)
+{
+       set->gen = 0;
+       cpumask_clear(&set->used);
+       cpumask_clear(&set->mask);
+}
+
+/*
+ * Initialize non-HT cpu cores mask
+ *
+ * Allocates the hfi1_affinity structure for this device, fills in
+ * real_cpu_mask and stores the structure in dd->affinity.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails.
+ */
+int init_real_cpu_mask(struct hfi1_devdata *dd)
+{
+       struct hfi1_affinity *info;
+       int possible, curr_cpu, i, ht;
+
+       info = kzalloc(sizeof(*info), GFP_KERNEL);
+       if (!info)
+               return -ENOMEM;
+
+       cpumask_clear(&info->real_cpu_mask);
+
+       /* Start with cpu online mask as the real cpu mask */
+       cpumask_copy(&info->real_cpu_mask, cpu_online_mask);
+
+       /*
+        * Remove HT cores from the real cpu mask.  Do this in two steps below.
+        */
+       /* possible: online CPU count; ht: weight of first CPU's sibling mask */
+       possible = cpumask_weight(&info->real_cpu_mask);
+       ht = cpumask_weight(topology_sibling_cpumask(
+                                       cpumask_first(&info->real_cpu_mask)));
+       /*
+        * Step 1.  Skip over the first N HT siblings and use them as the
+        * "real" cores.  Assumes that HT cores are not enumerated in
+        * succession (except in the single core case).
+        */
+       curr_cpu = cpumask_first(&info->real_cpu_mask);
+       for (i = 0; i < possible / ht; i++)
+               curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask);
+       /*
+        * Step 2.  Remove the remaining HT siblings.  Use cpumask_next() to
+        * skip any gaps.
+        */
+       /* i carries on from step 1, so only the CPUs not kept are cleared */
+       for (; i < possible; i++) {
+               cpumask_clear_cpu(curr_cpu, &info->real_cpu_mask);
+               curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask);
+       }
+
+       dd->affinity = info;
+       return 0;
+}
+
+/*
+ * Interrupt affinity.
+ *
+ * Set up the per-device CPU sets used to place interrupts and processes.
+ * dd->affinity is dereferenced here, so it must already be allocated.
+ *
+ * def_intr (non-receive interrupts) starts as the "real" CPUs local to
+ * the device's NUMA node, less every CPU that is moved to rcv_intr.
+ *
+ * rcv_intr (receive available interrupts) takes up to n_krcv_queues - 1
+ * CPUs from def_intr, starting at the second one; the first stays in
+ * def_intr for the control context.  With a single CPU both sets share it.
+ *
+ * proc (user processes) is a copy of the cpu online mask.
+ */
+void hfi1_dev_affinity_init(struct hfi1_devdata *dd)
+{
+       int node = pcibus_to_node(dd->pcidev->bus);
+       struct hfi1_affinity *info = dd->affinity;
+       const struct cpumask *local_mask;
+       int curr_cpu, possible, i;
+
+       /* no node reported for the PCI bus: fall back to numa_node_id() */
+       if (node < 0)
+               node = numa_node_id();
+       dd->node = node;
+
+       spin_lock_init(&info->lock);
+
+       init_cpu_mask_set(&info->def_intr);
+       init_cpu_mask_set(&info->rcv_intr);
+       init_cpu_mask_set(&info->proc);
+
+       local_mask = cpumask_of_node(dd->node);
+       /* the node's mask is empty: use topology_core_cpumask(0) instead */
+       if (cpumask_first(local_mask) >= nr_cpu_ids)
+               local_mask = topology_core_cpumask(0);
+       /* Use the "real" cpu mask of this node as the default */
+       cpumask_and(&info->def_intr.mask, &info->real_cpu_mask, local_mask);
+
+       /*  fill in the receive list */
+       possible = cpumask_weight(&info->def_intr.mask);
+       curr_cpu = cpumask_first(&info->def_intr.mask);
+       if (possible == 1) {
+               /*  only one CPU, everyone will use it */
+               cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask);
+       } else {
+               /*
+                * Retain the first CPU in the default list for the control
+                * context.
+                */
+               curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask);
+               /*
+                * Remove the remaining kernel receive queues from
+                * the default list and add them to the receive list.
+                */
+               for (i = 0; i < dd->n_krcv_queues - 1; i++) {
+                       cpumask_clear_cpu(curr_cpu, &info->def_intr.mask);
+                       cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask);
+                       curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask);
+                       /* ran out of CPUs before running out of queues */
+                       if (curr_cpu >= nr_cpu_ids)
+                               break;
+               }
+       }
+
+       cpumask_copy(&info->proc.mask, cpu_online_mask);
+}
+
+/*
+ * Free the affinity data that init_real_cpu_mask() stored in dd->affinity.
+ * The pointer itself is left as is.
+ */
+void hfi1_dev_affinity_free(struct hfi1_devdata *dd)
+{
+       kfree(dd->affinity);
+}
+
+/*
+ * Pick a CPU for an MSI-X interrupt and publish it as the affinity hint.
+ *
+ * SDMA and general interrupts draw from the default interrupt set.
+ * Receive context interrupts draw from the receive set, except for the
+ * control context (HFI1_CTRL_CTXT), which is placed on the first CPU of
+ * the default set and is not recorded in that set's 'used' map.
+ *
+ * For SDMA interrupts the chosen CPU is also stored in the engine.
+ *
+ * Returns 0 on success, -ENOMEM if the scratch cpumask cannot be
+ * allocated, or -EINVAL for any type other than IRQ_SDMA, IRQ_GENERAL
+ * and IRQ_RCVCTXT.
+ */
+int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
+{
+       int ret;
+       cpumask_var_t diff;
+       struct cpu_mask_set *set;
+       struct sdma_engine *sde = NULL;
+       struct hfi1_ctxtdata *rcd = NULL;
+       char extra[64];
+       int cpu = -1;
+
+       extra[0] = '\0';
+       cpumask_clear(&msix->mask);
+
+       /* scratch mask for "CPUs of the set that are not yet used" */
+       ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
+       if (!ret)
+               return -ENOMEM;
+
+       switch (msix->type) {
+       case IRQ_SDMA:
+               sde = (struct sdma_engine *)msix->arg;
+               scnprintf(extra, sizeof(extra), "engine %u", sde->this_idx);
+               /* fall through */
+       case IRQ_GENERAL:
+               set = &dd->affinity->def_intr;
+               break;
+       case IRQ_RCVCTXT:
+               rcd = (struct hfi1_ctxtdata *)msix->arg;
+               if (rcd->ctxt == HFI1_CTRL_CTXT) {
+                       set = &dd->affinity->def_intr;
+                       cpu = cpumask_first(&set->mask);
+               } else {
+                       set = &dd->affinity->rcv_intr;
+               }
+               scnprintf(extra, sizeof(extra), "ctxt %u", rcd->ctxt);
+               break;
+       default:
+               dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type);
+               /* do not leak the scratch mask on the error path */
+               free_cpumask_var(diff);
+               return -EINVAL;
+       }
+
+       /*
+        * The control receive context is placed on a particular CPU, which
+        * is set above.  Skip accounting for it.  Everything else finds its
+        * CPU here.
+        */
+       if (cpu == -1) {
+               spin_lock(&dd->affinity->lock);
+               if (cpumask_equal(&set->mask, &set->used)) {
+                       /*
+                        * We've used up all the CPUs, bump up the generation
+                        * and reset the 'used' map
+                        */
+                       set->gen++;
+                       cpumask_clear(&set->used);
+               }
+               /* take the first CPU of the set not used in this generation */
+               cpumask_andnot(diff, &set->mask, &set->used);
+               cpu = cpumask_first(diff);
+               cpumask_set_cpu(cpu, &set->used);
+               spin_unlock(&dd->affinity->lock);
+       }
+
+       switch (msix->type) {
+       case IRQ_SDMA:
+               sde->cpu = cpu;
+               break;
+       case IRQ_GENERAL:
+       case IRQ_RCVCTXT:
+       case IRQ_OTHER:
+               break;
+       }
+
+       cpumask_set_cpu(cpu, &msix->mask);
+       dd_dev_info(dd, "IRQ vector: %u, type %s %s -> cpu: %d\n",
+                   msix->msix.vector, irq_type_names[msix->type],
+                   extra, cpu);
+       irq_set_affinity_hint(msix->msix.vector, &msix->mask);
+
+       free_cpumask_var(diff);
+       return 0;
+}
+
+/*
+ * Release the CPU held by one MSI-X entry: drop it from the owning set's
+ * 'used' map, clear the affinity hint and empty msix->mask.  Types other
+ * than IRQ_SDMA, IRQ_GENERAL and IRQ_RCVCTXT are ignored.
+ */
+void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
+                          struct hfi1_msix_entry *msix)
+{
+       struct cpu_mask_set *set;
+
+       if (msix->type == IRQ_SDMA || msix->type == IRQ_GENERAL) {
+               set = &dd->affinity->def_intr;
+       } else if (msix->type == IRQ_RCVCTXT) {
+               struct hfi1_ctxtdata *rcd = (struct hfi1_ctxtdata *)msix->arg;
+
+               /* only do accounting for non control contexts */
+               if (rcd->ctxt == HFI1_CTRL_CTXT)
+                       set = NULL;
+               else
+                       set = &dd->affinity->rcv_intr;
+       } else {
+               return;
+       }
+
+       if (set) {
+               spin_lock(&dd->affinity->lock);
+               cpumask_andnot(&set->used, &set->used, &msix->mask);
+               /* map drained while overloaded: step back one generation */
+               if (set->gen && cpumask_empty(&set->used)) {
+                       cpumask_copy(&set->used, &set->mask);
+                       set->gen--;
+               }
+               spin_unlock(&dd->affinity->lock);
+       }
+
+       irq_set_affinity_hint(msix->msix.vector, NULL);
+       cpumask_clear(&msix->mask);
+}
+
+/*
+ * Recommend a CPU for the current process.
+ *
+ * dd:   device whose affinity bookkeeping is consulted and updated
+ * node: preferred NUMA node, or -1 to prefer the device's node (dd->node)
+ *
+ * If the task is already restricted to a single CPU, that CPU is marked
+ * used and returned.  If it is restricted to fewer CPUs than the proc set
+ * holds, no recommendation is made.  Otherwise an unused CPU is chosen,
+ * preferring the requested node and, within it, CPUs that are not used by
+ * interrupt handlers; if the node has no unused CPU, one from outside the
+ * node is taken.
+ *
+ * Returns the chosen CPU, or -1 if none is recommended (this includes
+ * failure to allocate a scratch cpumask).
+ */
+int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node)
+{
+       int cpu = -1, ret;
+       cpumask_var_t diff, mask, intrs;
+       const struct cpumask *node_mask,
+               *proc_mask = tsk_cpus_allowed(current);
+       struct cpu_mask_set *set = &dd->affinity->proc;
+       char buf[1024];
+
+       /*
+        * check whether process/context affinity has already
+        * been set
+        */
+       if (cpumask_weight(proc_mask) == 1) {
+               scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(proc_mask));
+               hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %s",
+                         current->pid, current->comm, buf);
+               /*
+                * Mark the pre-set CPU as used. This is atomic so we don't
+                * need the lock
+                */
+               cpu = cpumask_first(proc_mask);
+               cpumask_set_cpu(cpu, &set->used);
+               goto done;
+       } else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) {
+               /* task is already confined to a subset: leave cpu at -1 */
+               scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(proc_mask));
+               hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %s",
+                         current->pid, current->comm, buf);
+               goto done;
+       }
+
+       /*
+        * The process does not have a preset CPU affinity so find one to
+        * recommend. We prefer CPUs on the same NUMA as the device.
+        */
+
+       ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
+       if (!ret)
+               goto done;
+       ret = zalloc_cpumask_var(&mask, GFP_KERNEL);
+       if (!ret)
+               goto free_diff;
+       ret = zalloc_cpumask_var(&intrs, GFP_KERNEL);
+       if (!ret)
+               goto free_mask;
+
+       spin_lock(&dd->affinity->lock);
+       /*
+        * If we've used all available CPUs, clear the mask and start
+        * overloading.
+        */
+       if (cpumask_equal(&set->mask, &set->used)) {
+               set->gen++;
+               cpumask_clear(&set->used);
+       }
+
+       /* CPUs used by interrupt handlers */
+       /* a non-zero gen means the set has wrapped, so count its whole mask */
+       cpumask_copy(intrs, (dd->affinity->def_intr.gen ?
+                            &dd->affinity->def_intr.mask :
+                            &dd->affinity->def_intr.used));
+       cpumask_or(intrs, intrs, (dd->affinity->rcv_intr.gen ?
+                                 &dd->affinity->rcv_intr.mask :
+                                 &dd->affinity->rcv_intr.used));
+       scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(intrs));
+       hfi1_cdbg(PROC, "CPUs used by interrupts: %s", buf);
+
+       /*
+        * If we don't have a NUMA node requested, preference is towards
+        * device NUMA node
+        */
+       if (node == -1)
+               node = dd->node;
+       node_mask = cpumask_of_node(node);
+       scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(node_mask));
+       hfi1_cdbg(PROC, "device on NUMA %u, CPUs %s", node, buf);
+
+       /* diff will hold all unused cpus */
+       cpumask_andnot(diff, &set->mask, &set->used);
+       scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(diff));
+       hfi1_cdbg(PROC, "unused CPUs (all) %s", buf);
+
+       /* get cpumask of available CPUs on preferred NUMA */
+       cpumask_and(mask, diff, node_mask);
+       scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(mask));
+       hfi1_cdbg(PROC, "available cpus on NUMA %s", buf);
+
+       /*
+        * At first, we don't want to place processes on the same
+        * CPUs as interrupt handlers.
+        */
+       cpumask_andnot(diff, mask, intrs);
+       if (!cpumask_empty(diff))
+               cpumask_copy(mask, diff);
+
+       /*
+        * if we don't have a cpu on the preferred NUMA, get
+        * the list of the remaining available CPUs
+        */
+       if (cpumask_empty(mask)) {
+               cpumask_andnot(diff, &set->mask, &set->used);
+               cpumask_andnot(mask, diff, node_mask);
+       }
+       scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(mask));
+       hfi1_cdbg(PROC, "possible CPUs for process %s", buf);
+
+       cpu = cpumask_first(mask);
+       if (cpu >= nr_cpu_ids) /* empty */
+               cpu = -1;
+       else
+               cpumask_set_cpu(cpu, &set->used);
+       spin_unlock(&dd->affinity->lock);
+
+       /* release the scratch masks in reverse order of allocation */
+       free_cpumask_var(intrs);
+free_mask:
+       free_cpumask_var(mask);
+free_diff:
+       free_cpumask_var(diff);
+done:
+       return cpu;
+}
+
+/*
+ * Give a CPU back to the process set.  A negative cpu, meaning that no
+ * CPU was handed out, is ignored.
+ */
+void hfi1_put_proc_affinity(struct hfi1_devdata *dd, int cpu)
+{
+       struct hfi1_affinity *aff = dd->affinity;
+       struct cpu_mask_set *procs = &aff->proc;
+
+       if (cpu < 0)
+               return;
+
+       spin_lock(&aff->lock);
+       cpumask_clear_cpu(cpu, &procs->used);
+       /* map drained while overloaded: step back one generation */
+       if (procs->gen && cpumask_empty(&procs->used)) {
+               cpumask_copy(&procs->used, &procs->mask);
+               procs->gen--;
+       }
+       spin_unlock(&aff->lock);
+}
+
diff --git a/drivers/infiniband/hw/hfi1/affinity.h b/drivers/infiniband/hw/hfi1/affinity.h
new file mode 100644 (file)
index 0000000..20f52fe
--- /dev/null
@@ -0,0 +1,108 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef _HFI1_AFFINITY_H
+#define _HFI1_AFFINITY_H
+
+#include "hfi.h"
+
+/*
+ * Interrupt source types.  The values are used as an index into
+ * irq_type_names[] in affinity.c, so the two must be kept in step.
+ */
+enum irq_type {
+       IRQ_SDMA,
+       IRQ_RCVCTXT,
+       IRQ_GENERAL,
+       IRQ_OTHER
+};
+
+/*
+ * Can be used for both memory and cpu
+ *
+ * NOTE(review): none of these values is referenced in affinity.c;
+ * confirm the users and the exact meaning of each flag elsewhere.
+ */
+enum affinity_flags {
+       AFF_AUTO,
+       AFF_NUMA_LOCAL,
+       AFF_DEV_LOCAL,
+       AFF_IRQ_LOCAL
+};
+
+/*
+ * A pool of CPUs plus the bookkeeping for handing them out.
+ *
+ * mask: CPUs that may be handed out from this set
+ * used: CPUs already handed out in the current generation
+ * gen:  bumped each time 'used' fills up and is reset, and dropped again
+ *       when 'used' drains while gen is non-zero
+ */
+struct cpu_mask_set {
+       struct cpumask mask;
+       struct cpumask used;
+       uint gen;
+};
+
+/*
+ * Per-device affinity state.
+ *
+ * def_intr:      CPUs for SDMA and general interrupts; its first CPU also
+ *                serves the control receive context
+ * rcv_intr:      CPUs for the other receive context interrupts
+ * proc:          CPUs recommended to user processes
+ * real_cpu_mask: online CPUs with the HT siblings removed
+ */
+struct hfi1_affinity {
+       struct cpu_mask_set def_intr;
+       struct cpu_mask_set rcv_intr;
+       struct cpu_mask_set proc;
+       struct cpumask real_cpu_mask;
+       /* spin lock to protect affinity struct */
+       spinlock_t lock;
+};
+
+struct hfi1_msix_entry;
+
+/* Initialize non-HT cpu cores mask */
+int init_real_cpu_mask(struct hfi1_devdata *);
+/* Initialize driver affinity data */
+void hfi1_dev_affinity_init(struct hfi1_devdata *);
+/* Free driver affinity data */
+void hfi1_dev_affinity_free(struct hfi1_devdata *);
+/*
+ * Set IRQ affinity to a CPU. The function will determine the
+ * CPU and set the affinity to it.
+ */
+int hfi1_get_irq_affinity(struct hfi1_devdata *, struct hfi1_msix_entry *);
+/*
+ * Remove the IRQ's CPU affinity. This function also updates
+ * any internal CPU tracking data
+ */
+void hfi1_put_irq_affinity(struct hfi1_devdata *, struct hfi1_msix_entry *);
+/*
+ * Determine a CPU affinity for a user process, if the process does not
+ * have an affinity set yet.
+ */
+int hfi1_get_proc_affinity(struct hfi1_devdata *, int);
+/* Release a CPU used by a user process. */
+void hfi1_put_proc_affinity(struct hfi1_devdata *, int);
+
+#endif /* _HFI1_AFFINITY_H */
diff --git a/drivers/infiniband/hw/hfi1/aspm.h b/drivers/infiniband/hw/hfi1/aspm.h
new file mode 100644 (file)
index 0000000..0d58fe3
--- /dev/null
@@ -0,0 +1,309 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef _ASPM_H
+#define _ASPM_H
+
+#include "hfi.h"
+
+/*
+ * Selected ASPM policy, defined outside this header. The inline helpers
+ * below compare it against the enum aspm_mode values. The variable and the
+ * enum tag share a name, which is legal because tags have their own
+ * namespace in C.
+ */
+extern uint aspm_mode;
+
+enum aspm_mode {
+       ASPM_MODE_DISABLED = 0, /* ASPM always disabled, performance mode */
+       ASPM_MODE_ENABLED = 1,  /* ASPM always enabled, power saving mode */
+       ASPM_MODE_DYNAMIC = 2,  /* ASPM enabled/disabled dynamically */
+};
+
+/* Time after which the timer interrupt will re-enable ASPM */
+#define ASPM_TIMER_MS 1000
+/* Time for which interrupts are ignored after a timer has been scheduled */
+#define ASPM_RESCHED_TIMER_MS (ASPM_TIMER_MS / 2)
+/* Two interrupts within this time trigger ASPM disable */
+#define ASPM_TRIGGER_MS 1
+/* The same interval in nanoseconds, as an unsigned long long constant */
+#define ASPM_TRIGGER_NS (ASPM_TRIGGER_MS * 1000 * 1000ull)
+/*
+ * Isolate the ASPM support field of a PCIe Link Capabilities value, shift
+ * it down to bit 0 and keep bit 1 of the result; a non-zero result is
+ * treated by the callers as "L1 supported". The argument is parenthesized
+ * so that an arbitrary expression can be passed safely.
+ */
+#define ASPM_L1_SUPPORTED(reg) \
+       ((((reg) & PCI_EXP_LNKCAP_ASPMS) >> 10) & 0x2)
+
+/*
+ * Report whether ASPM L1 may be used on the link of this device.
+ *
+ * Reads the PCIe Link Capabilities of the device and of the upstream
+ * component (dd->pcidev->bus->self). Returns false when there is no
+ * upstream component, when the upstream side does not report L1, or when
+ * the device neither reports L1 nor is an A-step part.
+ */
+static inline bool aspm_hw_l1_supported(struct hfi1_devdata *dd)
+{
+       struct pci_dev *bridge = dd->pcidev->bus->self;
+       u32 dev_l1, bridge_l1;
+
+       /*
+        * Without access to the upstream component the driver cannot
+        * support ASPM L1 at all.
+        */
+       if (!bridge)
+               return false;
+
+       pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &dev_l1);
+       dev_l1 = ASPM_L1_SUPPORTED(dev_l1);
+
+       pcie_capability_read_dword(bridge, PCI_EXP_LNKCAP, &bridge_l1);
+       bridge_l1 = ASPM_L1_SUPPORTED(bridge_l1);
+
+       /* ASPM works on A-step but is reported as not supported */
+       if (!dev_l1 && !is_ax(dd))
+               return false;
+
+       return bridge_l1 != 0;
+}
+
+/*
+ * Set L1 entrance latency for slower entry to L1: read-modify-write of
+ * the L1 entrance latency field in the PCIE_CFG_REG_PL3 config register.
+ */
+static inline void aspm_hw_set_l1_ent_latency(struct hfi1_devdata *dd)
+{
+       u32 latency = 0x4u;
+       u32 pl3;
+
+       pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, &pl3);
+       /* Replace the old field contents with the new latency value */
+       pl3 = (pl3 & ~PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SMASK) |
+             (latency << PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SHIFT);
+       pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, pl3);
+}
+
+/*
+ * Select ASPM L1 in the Link Control register of both link partners.
+ * Nothing is written when the upstream component is not accessible.
+ */
+static inline void aspm_hw_enable_l1(struct hfi1_devdata *dd)
+{
+       struct pci_dev *bridge = dd->pcidev->bus->self;
+
+       /*
+        * ASPM L1 cannot be supported at all without access to the
+        * upstream component. When it is available, enable L1 there
+        * first and only then in the downstream component.
+        */
+       if (bridge) {
+               pcie_capability_clear_and_set_word(bridge, PCI_EXP_LNKCTL,
+                                                  PCI_EXP_LNKCTL_ASPMC,
+                                                  PCI_EXP_LNKCTL_ASPM_L1);
+               pcie_capability_clear_and_set_word(dd->pcidev,
+                                                  PCI_EXP_LNKCTL,
+                                                  PCI_EXP_LNKCTL_ASPMC,
+                                                  PCI_EXP_LNKCTL_ASPM_L1);
+       }
+}
+
+/*
+ * Clear the ASPM control bits in the Link Control register of the device
+ * and, when it is accessible, of the upstream component.
+ */
+static inline void aspm_hw_disable_l1(struct hfi1_devdata *dd)
+{
+       struct pci_dev *bridge = dd->pcidev->bus->self;
+
+       /* Disable ASPM L1 in the downstream component first ... */
+       pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL,
+                                          PCI_EXP_LNKCTL_ASPMC, 0x0);
+       if (!bridge)
+               return;
+
+       /* ... and then in the upstream component */
+       pcie_capability_clear_and_set_word(bridge, PCI_EXP_LNKCTL,
+                                          PCI_EXP_LNKCTL_ASPMC, 0x0);
+}
+
+/*
+ * Enable ASPM L1 in hardware and record it in dd->aspm_enabled, unless it
+ * is already enabled, the mode is ASPM_MODE_DISABLED or L1 is unsupported.
+ */
+static inline void aspm_enable(struct hfi1_devdata *dd)
+{
+       /* Already on: nothing to do */
+       if (dd->aspm_enabled)
+               return;
+
+       /* Never switch it on when disabled by mode or not supported */
+       if (aspm_mode == ASPM_MODE_DISABLED || !dd->aspm_supported)
+               return;
+
+       aspm_hw_enable_l1(dd);
+       dd->aspm_enabled = true;
+}
+
+/*
+ * Disable ASPM L1 in hardware and record it in dd->aspm_enabled, unless
+ * it is already disabled or the mode is ASPM_MODE_ENABLED.
+ */
+static inline void aspm_disable(struct hfi1_devdata *dd)
+{
+       /* Already off: nothing to do */
+       if (!dd->aspm_enabled)
+               return;
+
+       /* The always-enabled mode never switches ASPM off */
+       if (aspm_mode == ASPM_MODE_ENABLED)
+               return;
+
+       aspm_hw_disable_l1(dd);
+       dd->aspm_enabled = false;
+}
+
+/*
+ * Disable ASPM and increment dd->aspm_disabled_cnt, both while holding
+ * dd->aspm_lock with interrupts saved and disabled.
+ */
+static inline void aspm_disable_inc(struct hfi1_devdata *dd)
+{
+       unsigned long irqflags;
+
+       spin_lock_irqsave(&dd->aspm_lock, irqflags);
+
+       aspm_disable(dd);
+       atomic_inc(&dd->aspm_disabled_cnt);
+
+       spin_unlock_irqrestore(&dd->aspm_lock, irqflags);
+}
+
+/*
+ * Decrement dd->aspm_disabled_cnt and re-enable ASPM when the count
+ * reaches zero, all while holding dd->aspm_lock.
+ */
+static inline void aspm_enable_dec(struct hfi1_devdata *dd)
+{
+       unsigned long irqflags;
+
+       spin_lock_irqsave(&dd->aspm_lock, irqflags);
+
+       /* Only the decrement that brings the count to zero enables ASPM */
+       if (atomic_dec_and_test(&dd->aspm_disabled_cnt))
+               aspm_enable(dd);
+
+       spin_unlock_irqrestore(&dd->aspm_lock, irqflags);
+}
+
+/*
+ * ASPM processing for each receive context interrupt.
+ *
+ * Records the time of the interrupt. When it follows the previous one by
+ * less than ASPM_TRIGGER_NS, ASPM is disabled for the context (if it was
+ * still enabled) and the re-enable timer is (re)armed for ASPM_TIMER_MS.
+ * A timer that is already pending is only pushed out once more than
+ * ASPM_RESCHED_TIMER_MS has passed since it was last scheduled.
+ */
+static inline void aspm_ctx_disable(struct hfi1_ctxtdata *rcd)
+{
+       ktime_t cur, last;
+       unsigned long irqflags;
+       bool rearm;
+
+       /* Quickest exit for minimum impact */
+       if (!rcd->aspm_intr_supported)
+               return;
+
+       spin_lock_irqsave(&rcd->aspm_lock, irqflags);
+       /* PSM contexts are open */
+       if (!rcd->aspm_intr_enable)
+               goto unlock;
+
+       last = rcd->aspm_ts_last_intr;
+       cur = ktime_get();
+       rcd->aspm_ts_last_intr = cur;
+
+       /* Only an interrupt pair close together in time needs any action */
+       if (!(ktime_to_ns(ktime_sub(cur, last)) < ASPM_TRIGGER_NS))
+               goto unlock;
+
+       /* Don't push out our timer till this much time has elapsed */
+       rearm = ktime_to_ns(ktime_sub(cur, rcd->aspm_ts_timer_sched)) >
+               ASPM_RESCHED_TIMER_MS * NSEC_PER_MSEC;
+
+       /* Disable ASPM; in that case the timer is always scheduled */
+       if (rcd->aspm_enabled) {
+               aspm_disable_inc(rcd->dd);
+               rcd->aspm_enabled = false;
+               rearm = true;
+       }
+
+       if (rearm) {
+               mod_timer(&rcd->aspm_timer,
+                         jiffies + msecs_to_jiffies(ASPM_TIMER_MS));
+               rcd->aspm_ts_timer_sched = cur;
+       }
+unlock:
+       spin_unlock_irqrestore(&rcd->aspm_lock, irqflags);
+}
+
+/*
+ * Timer function for re-enabling ASPM in the absence of interrupt
+ * activity. 'data' is the receive context pointer that aspm_ctx_init()
+ * passed to setup_timer().
+ */
+static inline void aspm_ctx_timer_function(unsigned long data)
+{
+       struct hfi1_ctxtdata *ctx = (struct hfi1_ctxtdata *)data;
+       unsigned long irqflags;
+
+       spin_lock_irqsave(&ctx->aspm_lock, irqflags);
+
+       /* Drop this context's disable count and mark it enabled again */
+       aspm_enable_dec(ctx->dd);
+       ctx->aspm_enabled = true;
+
+       spin_unlock_irqrestore(&ctx->aspm_lock, irqflags);
+}
+
+/*
+ * Disable interrupt processing for verbs contexts when PSM contexts are
+ * open: stop the ASPM timer of every context below first_user_ctxt, clear
+ * its aspm_intr_enable flag, then disable ASPM and reset the disable count.
+ */
+static inline void aspm_disable_all(struct hfi1_devdata *dd)
+{
+       struct hfi1_ctxtdata *ctx;
+       unsigned long irqflags;
+       unsigned ctxt;
+
+       for (ctxt = 0; ctxt < dd->first_user_ctxt; ctxt++) {
+               ctx = dd->rcd[ctxt];
+
+               /* Wait for a running timer handler before touching flags */
+               del_timer_sync(&ctx->aspm_timer);
+
+               spin_lock_irqsave(&ctx->aspm_lock, irqflags);
+               ctx->aspm_intr_enable = false;
+               spin_unlock_irqrestore(&ctx->aspm_lock, irqflags);
+       }
+
+       aspm_disable(dd);
+       atomic_set(&dd->aspm_disabled_cnt, 0);
+}
+
+/*
+ * Re-enable interrupt processing for verbs contexts. ASPM itself is
+ * enabled first; the per-context flags are only set in dynamic mode.
+ */
+static inline void aspm_enable_all(struct hfi1_devdata *dd)
+{
+       struct hfi1_ctxtdata *ctx;
+       unsigned long irqflags;
+       unsigned ctxt;
+
+       aspm_enable(dd);
+
+       if (aspm_mode == ASPM_MODE_DYNAMIC) {
+               for (ctxt = 0; ctxt < dd->first_user_ctxt; ctxt++) {
+                       ctx = dd->rcd[ctxt];
+
+                       spin_lock_irqsave(&ctx->aspm_lock, irqflags);
+                       ctx->aspm_intr_enable = true;
+                       ctx->aspm_enabled = true;
+                       spin_unlock_irqrestore(&ctx->aspm_lock, irqflags);
+               }
+       }
+}
+
+/*
+ * Initialize the ASPM lock and timer of a receive context and decide
+ * whether per-interrupt ASPM processing applies to it: the device must
+ * support ASPM, the mode must be dynamic and the context number must be
+ * below first_user_ctxt.
+ */
+static inline void aspm_ctx_init(struct hfi1_ctxtdata *rcd)
+{
+       struct hfi1_devdata *dd = rcd->dd;
+
+       spin_lock_init(&rcd->aspm_lock);
+       setup_timer(&rcd->aspm_timer, aspm_ctx_timer_function,
+                   (unsigned long)rcd);
+
+       rcd->aspm_intr_supported = dd->aspm_supported &&
+                                  aspm_mode == ASPM_MODE_DYNAMIC &&
+                                  rcd->ctxt < dd->first_user_ctxt;
+}
+
+/*
+ * Set up ASPM handling for a device.
+ *
+ * dd->aspm_supported is filled in before the per-context loop because
+ * aspm_ctx_init() reads it when computing aspm_intr_supported.
+ */
+static inline void aspm_init(struct hfi1_devdata *dd)
+{
+       unsigned i;
+
+       spin_lock_init(&dd->aspm_lock);
+       dd->aspm_supported = aspm_hw_l1_supported(dd);
+
+       /* Initialize every context below first_user_ctxt */
+       for (i = 0; i < dd->first_user_ctxt; i++)
+               aspm_ctx_init(dd->rcd[i]);
+
+       /* Start with ASPM disabled */
+       aspm_hw_set_l1_ent_latency(dd);
+       dd->aspm_enabled = false;
+       aspm_hw_disable_l1(dd);
+
+       /* Now turn on ASPM if configured */
+       aspm_enable_all(dd);
+}
+
+/*
+ * Shut down ASPM handling: aspm_disable_all() stops the per-context
+ * timers and interrupt processing, after which ASPM is requested on again.
+ */
+static inline void aspm_exit(struct hfi1_devdata *dd)
+{
+       aspm_disable_all(dd);
+
+       /* Turn on ASPM on exit to conserve power */
+       aspm_enable(dd);
+}
+
+#endif /* _ASPM_H */
diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
new file mode 100644 (file)
index 0000000..3b876da
--- /dev/null
@@ -0,0 +1,14712 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * This file contains all of the code that is specific to the HFI chip
+ */
+
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+
+#include "hfi.h"
+#include "trace.h"
+#include "mad.h"
+#include "pio.h"
+#include "sdma.h"
+#include "eprom.h"
+#include "efivar.h"
+#include "platform.h"
+#include "aspm.h"
+
+#define NUM_IB_PORTS 1
+
+/*
+ * Module parameters. All of them are registered with S_IRUGO, i.e. they
+ * are exported with read-only permission bits.
+ */
+
+/* KDETH queue pair prefix; no initializer, so it starts out as 0 */
+uint kdeth_qp;
+module_param_named(kdeth_qp, kdeth_qp, uint, S_IRUGO);
+MODULE_PARM_DESC(kdeth_qp, "Set the KDETH queue pair prefix");
+
+/* Number of Virtual Lanes to use; defaults to HFI1_MAX_VLS_SUPPORTED */
+uint num_vls = HFI1_MAX_VLS_SUPPORTED;
+module_param(num_vls, uint, S_IRUGO);
+MODULE_PARM_DESC(num_vls, "Set number of Virtual Lanes to use (1-8)");
+
+/*
+ * Default time to aggregate two 10K packets from the idle state
+ * (timer not running). The timer starts at the end of the first packet,
+ * so only the time for one 10K packet and header plus a bit extra is needed.
+ * 10 * 1024 + 64 header bytes = 10304 bytes
+ * 10304 bytes / 12.5 GB/s = 824.32ns
+ */
+uint rcv_intr_timeout = (824 + 16); /* 16 is for coalescing interrupt */
+module_param(rcv_intr_timeout, uint, S_IRUGO);
+MODULE_PARM_DESC(rcv_intr_timeout, "Receive interrupt mitigation timeout in ns");
+
+/* Receive interrupt mitigation count */
+uint rcv_intr_count = 16; /* same as qib */
+module_param(rcv_intr_count, uint, S_IRUGO);
+MODULE_PARM_DESC(rcv_intr_count, "Receive interrupt mitigation count");
+
+/* CRCs to use on the link; defaults to SUPPORTED_CRCS */
+ushort link_crc_mask = SUPPORTED_CRCS;
+module_param(link_crc_mask, ushort, S_IRUGO);
+MODULE_PARM_DESC(link_crc_mask, "CRCs to use on the link");
+
+/*
+ * Loopback mode selector, registered read-only (S_IRUGO). According to
+ * its description string, 1 selects serdes loopback and 3 an external
+ * cable; no initializer, so it starts out as 0.
+ */
+uint loopback;
+module_param_named(loopback, loopback, uint, S_IRUGO);
+MODULE_PARM_DESC(loopback, "Put into loopback mode (1 = serdes, 3 = external cable)");
+
+/* Other driver tunables; no module_param registration is visible for these here */
+uint rcv_intr_dynamic = 1; /* enable dynamic mode for rcv int mitigation */
+static ushort crc_14b_sideband = 1;
+static uint use_flr = 1;
+uint quick_linkup; /* skip LNI */
+
+/*
+ * One entry of a status-flag description table: a bit mask within a
+ * 64-bit status value, its description string and optional extra data
+ * (the PIO error table stores SEC_* consequence bits there).
+ */
+struct flag_table {
+       u64 flag;       /* the flag */
+       char *str;      /* description string */
+       u16 extra;      /* extra information */
+       u16 unused0;
+       u32 unused1;
+};
+
+/*
+ * Initializers for struct flag_table. Note that the macro argument order
+ * (str, extra, flag) differs from the member order {flag, str, extra}.
+ * FLAG_ENTRY0 is the variant with extra set to 0.
+ */
+/* str must be a string constant */
+#define FLAG_ENTRY(str, extra, flag) {flag, str, extra}
+#define FLAG_ENTRY0(str, flag) {flag, str, 0}
+
+/*
+ * Send Error Consequences. Distinct single-bit values, so they can be
+ * OR-ed together (as done in the 'extra' field of the PIO error table).
+ */
+#define SEC_WRITE_DROPPED      0x1
+#define SEC_PACKET_DROPPED     0x2
+#define SEC_SC_HALTED          0x4     /* per-context only */
+#define SEC_SPC_FREEZE         0x8     /* per-HFI only */
+
+/* NOTE(review): kernel receive context limits - confirm meaning at the users */
+#define MIN_KERNEL_KCTXTS         2
+#define FIRST_KERNEL_KCTXT        1
+/* sizes for both the QP and RSM map tables */
+#define NUM_MAP_ENTRIES                256
+#define NUM_MAP_REGS             32
+
+/* Bit offset into the GUID which carries HFI id information */
+#define GUID_HFI_INDEX_SHIFT     39
+
+/* extract the emulation revision: dd->irev shifted right by 8 bits */
+#define emulator_rev(dd) ((dd)->irev >> 8)
+/* parallel and serial emulation versions are 3 and 4 respectively */
+/* both tests look only at the low 4 bits of dd->irev */
+#define is_emulator_p(dd) ((((dd)->irev) & 0xf) == 3)
+#define is_emulator_s(dd) ((((dd)->irev) & 0xf) == 4)
+
+/*
+ * RSM fields
+ *
+ * The *_OFFSET macros below encode a field position as a quad-word index
+ * shifted left by QW_SHIFT, OR-ed with a bit offset inside that quad-word.
+ */
+
+/* packet type */
+#define IB_PACKET_TYPE         2ull
+#define QW_SHIFT               6ull
+/* QPN[7..1] */
+#define QPN_WIDTH              7ull
+
+/* LRH.BTH: QW 0, OFFSET 48 - for match */
+#define LRH_BTH_QW             0ull
+#define LRH_BTH_BIT_OFFSET     48ull
+#define LRH_BTH_OFFSET(off)    ((LRH_BTH_QW << QW_SHIFT) | (off))
+#define LRH_BTH_MATCH_OFFSET   LRH_BTH_OFFSET(LRH_BTH_BIT_OFFSET)
+/* NOTE(review): defined with an empty replacement list - confirm it is intended */
+#define LRH_BTH_SELECT
+#define LRH_BTH_MASK           3ull
+#define LRH_BTH_VALUE          2ull
+
+/* LRH.SC[3..0] QW 0, OFFSET 56 - for match */
+#define LRH_SC_QW              0ull
+#define LRH_SC_BIT_OFFSET      56ull
+#define LRH_SC_OFFSET(off)     ((LRH_SC_QW << QW_SHIFT) | (off))
+#define LRH_SC_MATCH_OFFSET    LRH_SC_OFFSET(LRH_SC_BIT_OFFSET)
+#define LRH_SC_MASK            128ull
+#define LRH_SC_VALUE           0ull
+
+/* SC[n..0] QW 0, OFFSET 60 - for select */
+#define LRH_SC_SELECT_OFFSET  ((LRH_SC_QW << QW_SHIFT) | (60ull))
+
+/* QPN[m+n:1] QW 1, OFFSET 1 */
+#define QPN_SELECT_OFFSET      ((1ull << QW_SHIFT) | (1ull))
+
+/*
+ * defines to build power on SC2VL table
+ *
+ * SC2VL_VAL takes a table number and eight (sc, value) pairs. Each value
+ * is cast to u64 and shifted by SEND_SC2VLT<num>_SC<sc>_SHIFT, a name
+ * built by token pasting; the eight results are OR-ed into one value.
+ */
+#define SC2VL_VAL( \
+       num, \
+       sc0, sc0val, \
+       sc1, sc1val, \
+       sc2, sc2val, \
+       sc3, sc3val, \
+       sc4, sc4val, \
+       sc5, sc5val, \
+       sc6, sc6val, \
+       sc7, sc7val) \
+( \
+       ((u64)(sc0val) << SEND_SC2VLT##num##_SC##sc0##_SHIFT) | \
+       ((u64)(sc1val) << SEND_SC2VLT##num##_SC##sc1##_SHIFT) | \
+       ((u64)(sc2val) << SEND_SC2VLT##num##_SC##sc2##_SHIFT) | \
+       ((u64)(sc3val) << SEND_SC2VLT##num##_SC##sc3##_SHIFT) | \
+       ((u64)(sc4val) << SEND_SC2VLT##num##_SC##sc4##_SHIFT) | \
+       ((u64)(sc5val) << SEND_SC2VLT##num##_SC##sc5##_SHIFT) | \
+       ((u64)(sc6val) << SEND_SC2VLT##num##_SC##sc6##_SHIFT) | \
+       ((u64)(sc7val) << SEND_SC2VLT##num##_SC##sc7##_SHIFT)   \
+)
+
+/*
+ * DC_SC_VL_VAL takes a range name and sixteen (entry, value) pairs. Each
+ * value is cast to u64 and shifted by
+ * DCC_CFG_SC_VL_TABLE_<range>_ENTRY<entry>_SHIFT, a name built by token
+ * pasting; the sixteen results are OR-ed into one value.
+ */
+#define DC_SC_VL_VAL( \
+       range, \
+       e0, e0val, \
+       e1, e1val, \
+       e2, e2val, \
+       e3, e3val, \
+       e4, e4val, \
+       e5, e5val, \
+       e6, e6val, \
+       e7, e7val, \
+       e8, e8val, \
+       e9, e9val, \
+       e10, e10val, \
+       e11, e11val, \
+       e12, e12val, \
+       e13, e13val, \
+       e14, e14val, \
+       e15, e15val) \
+( \
+       ((u64)(e0val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e0##_SHIFT) | \
+       ((u64)(e1val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e1##_SHIFT) | \
+       ((u64)(e2val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e2##_SHIFT) | \
+       ((u64)(e3val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e3##_SHIFT) | \
+       ((u64)(e4val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e4##_SHIFT) | \
+       ((u64)(e5val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e5##_SHIFT) | \
+       ((u64)(e6val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e6##_SHIFT) | \
+       ((u64)(e7val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e7##_SHIFT) | \
+       ((u64)(e8val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e8##_SHIFT) | \
+       ((u64)(e9val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e9##_SHIFT) | \
+       ((u64)(e10val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e10##_SHIFT) | \
+       ((u64)(e11val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e11##_SHIFT) | \
+       ((u64)(e12val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e12##_SHIFT) | \
+       ((u64)(e13val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e13##_SHIFT) | \
+       ((u64)(e14val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e14##_SHIFT) | \
+       ((u64)(e15val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e15##_SHIFT) \
+)
+
+/* all CceStatus sub-block freeze bits: SDMA, RXE, TXE and TXE PIO */
+#define ALL_FROZE (CCE_STATUS_SDMA_FROZE_SMASK \
+                       | CCE_STATUS_RXE_FROZE_SMASK \
+                       | CCE_STATUS_TXE_FROZE_SMASK \
+                       | CCE_STATUS_TXE_PIO_FROZE_SMASK)
+/* all CceStatus sub-block TXE pause bits: TXE PIO, TXE and SDMA */
+#define ALL_TXE_PAUSE (CCE_STATUS_TXE_PIO_PAUSED_SMASK \
+                       | CCE_STATUS_TXE_PAUSED_SMASK \
+                       | CCE_STATUS_SDMA_PAUSED_SMASK)
+/* all CceStatus sub-block RXE pause bits (a single mask) */
+#define ALL_RXE_PAUSE CCE_STATUS_RXE_PAUSED_SMASK
+
+/*
+ * CCE Error flags.
+ *
+ * Each entry pairs a CCE_ERR_STATUS_*_SMASK bit mask with the string used
+ * to describe it; the numeric comments count the entries from 0. Every
+ * string names the same error as its mask, so that no two different
+ * errors share a description.
+ */
+static struct flag_table cce_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("CceCsrParityErr",
+               CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK),
+/* 1*/ FLAG_ENTRY0("CceCsrReadBadAddrErr",
+               CCE_ERR_STATUS_CCE_CSR_READ_BAD_ADDR_ERR_SMASK),
+/* 2*/ FLAG_ENTRY0("CceCsrWriteBadAddrErr",
+               CCE_ERR_STATUS_CCE_CSR_WRITE_BAD_ADDR_ERR_SMASK),
+/* 3*/ FLAG_ENTRY0("CceTrgtAsyncFifoParityErr",
+               CCE_ERR_STATUS_CCE_TRGT_ASYNC_FIFO_PARITY_ERR_SMASK),
+/* 4*/ FLAG_ENTRY0("CceTrgtAccessErr",
+               CCE_ERR_STATUS_CCE_TRGT_ACCESS_ERR_SMASK),
+/* 5*/ FLAG_ENTRY0("CceRspdDataParityErr",
+               CCE_ERR_STATUS_CCE_RSPD_DATA_PARITY_ERR_SMASK),
+/* 6*/ FLAG_ENTRY0("CceCli0AsyncFifoParityErr",
+               CCE_ERR_STATUS_CCE_CLI0_ASYNC_FIFO_PARITY_ERR_SMASK),
+/* 7*/ FLAG_ENTRY0("CceCsrCfgBusParityErr",
+               CCE_ERR_STATUS_CCE_CSR_CFG_BUS_PARITY_ERR_SMASK),
+/* 8*/ FLAG_ENTRY0("CceCli2AsyncFifoParityErr",
+               CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK),
+/* 9*/ FLAG_ENTRY0("CceCli1AsyncFifoPioCrdtParityErr",
+           CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR_SMASK),
+/*10*/ FLAG_ENTRY0("CceCli1AsyncFifoSdmaHdParityErr",
+           CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR_SMASK),
+/*11*/ FLAG_ENTRY0("CceCli1AsyncFifoRxdmaParityError",
+           CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERROR_SMASK),
+/*12*/ FLAG_ENTRY0("CceCli1AsyncFifoDbgParityError",
+               CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERROR_SMASK),
+/*13*/ FLAG_ENTRY0("PcicRetryMemCorErr",
+               CCE_ERR_STATUS_PCIC_RETRY_MEM_COR_ERR_SMASK),
+/*14*/ FLAG_ENTRY0("PcicRetrySotMemCorErr",
+               CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_COR_ERR_SMASK),
+/*15*/ FLAG_ENTRY0("PcicPostHdQCorErr",
+               CCE_ERR_STATUS_PCIC_POST_HD_QCOR_ERR_SMASK),
+/*16*/ FLAG_ENTRY0("PcicPostDatQCorErr",
+               CCE_ERR_STATUS_PCIC_POST_DAT_QCOR_ERR_SMASK),
+/*17*/ FLAG_ENTRY0("PcicCplHdQCorErr",
+               CCE_ERR_STATUS_PCIC_CPL_HD_QCOR_ERR_SMASK),
+/*18*/ FLAG_ENTRY0("PcicCplDatQCorErr",
+               CCE_ERR_STATUS_PCIC_CPL_DAT_QCOR_ERR_SMASK),
+/*19*/ FLAG_ENTRY0("PcicNPostHQParityErr",
+               CCE_ERR_STATUS_PCIC_NPOST_HQ_PARITY_ERR_SMASK),
+/*20*/ FLAG_ENTRY0("PcicNPostDatQParityErr",
+               CCE_ERR_STATUS_PCIC_NPOST_DAT_QPARITY_ERR_SMASK),
+/*21*/ FLAG_ENTRY0("PcicRetryMemUncErr",
+               CCE_ERR_STATUS_PCIC_RETRY_MEM_UNC_ERR_SMASK),
+/*22*/ FLAG_ENTRY0("PcicRetrySotMemUncErr",
+               CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_UNC_ERR_SMASK),
+/*23*/ FLAG_ENTRY0("PcicPostHdQUncErr",
+               CCE_ERR_STATUS_PCIC_POST_HD_QUNC_ERR_SMASK),
+/*24*/ FLAG_ENTRY0("PcicPostDatQUncErr",
+               CCE_ERR_STATUS_PCIC_POST_DAT_QUNC_ERR_SMASK),
+/*25*/ FLAG_ENTRY0("PcicCplHdQUncErr",
+               CCE_ERR_STATUS_PCIC_CPL_HD_QUNC_ERR_SMASK),
+/*26*/ FLAG_ENTRY0("PcicCplDatQUncErr",
+               CCE_ERR_STATUS_PCIC_CPL_DAT_QUNC_ERR_SMASK),
+/*27*/ FLAG_ENTRY0("PcicTransmitFrontParityErr",
+               CCE_ERR_STATUS_PCIC_TRANSMIT_FRONT_PARITY_ERR_SMASK),
+/*28*/ FLAG_ENTRY0("PcicTransmitBackParityErr",
+               CCE_ERR_STATUS_PCIC_TRANSMIT_BACK_PARITY_ERR_SMASK),
+/*29*/ FLAG_ENTRY0("PcicReceiveParityErr",
+               CCE_ERR_STATUS_PCIC_RECEIVE_PARITY_ERR_SMASK),
+/*30*/ FLAG_ENTRY0("CceTrgtCplTimeoutErr",
+               CCE_ERR_STATUS_CCE_TRGT_CPL_TIMEOUT_ERR_SMASK),
+/*31*/ FLAG_ENTRY0("LATriggered",
+               CCE_ERR_STATUS_LA_TRIGGERED_SMASK),
+/*32*/ FLAG_ENTRY0("CceSegReadBadAddrErr",
+               CCE_ERR_STATUS_CCE_SEG_READ_BAD_ADDR_ERR_SMASK),
+/*33*/ FLAG_ENTRY0("CceSegWriteBadAddrErr",
+               CCE_ERR_STATUS_CCE_SEG_WRITE_BAD_ADDR_ERR_SMASK),
+/*34*/ FLAG_ENTRY0("CceRcplAsyncFifoParityErr",
+               CCE_ERR_STATUS_CCE_RCPL_ASYNC_FIFO_PARITY_ERR_SMASK),
+/*35*/ FLAG_ENTRY0("CceRxdmaConvFifoParityErr",
+               CCE_ERR_STATUS_CCE_RXDMA_CONV_FIFO_PARITY_ERR_SMASK),
+/*36*/ FLAG_ENTRY0("CceMsixTableCorErr",
+               CCE_ERR_STATUS_CCE_MSIX_TABLE_COR_ERR_SMASK),
+/*37*/ FLAG_ENTRY0("CceMsixTableUncErr",
+               CCE_ERR_STATUS_CCE_MSIX_TABLE_UNC_ERR_SMASK),
+/*38*/ FLAG_ENTRY0("CceIntMapCorErr",
+               CCE_ERR_STATUS_CCE_INT_MAP_COR_ERR_SMASK),
+/*39*/ FLAG_ENTRY0("CceIntMapUncErr",
+               CCE_ERR_STATUS_CCE_INT_MAP_UNC_ERR_SMASK),
+/*40*/ FLAG_ENTRY0("CceMsixCsrParityErr",
+               CCE_ERR_STATUS_CCE_MSIX_CSR_PARITY_ERR_SMASK),
+/*41-63 reserved*/
+};
+
+/*
+ * Misc Error flags
+ *
+ * MES(text) expands to MISC_ERR_STATUS_MISC_<text>_ERR_SMASK, so each
+ * entry below uses the same text for its description string and its mask.
+ */
+#define MES(text) MISC_ERR_STATUS_MISC_##text##_ERR_SMASK
+static struct flag_table misc_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("CSR_PARITY", MES(CSR_PARITY)),
+/* 1*/ FLAG_ENTRY0("CSR_READ_BAD_ADDR", MES(CSR_READ_BAD_ADDR)),
+/* 2*/ FLAG_ENTRY0("CSR_WRITE_BAD_ADDR", MES(CSR_WRITE_BAD_ADDR)),
+/* 3*/ FLAG_ENTRY0("SBUS_WRITE_FAILED", MES(SBUS_WRITE_FAILED)),
+/* 4*/ FLAG_ENTRY0("KEY_MISMATCH", MES(KEY_MISMATCH)),
+/* 5*/ FLAG_ENTRY0("FW_AUTH_FAILED", MES(FW_AUTH_FAILED)),
+/* 6*/ FLAG_ENTRY0("EFUSE_CSR_PARITY", MES(EFUSE_CSR_PARITY)),
+/* 7*/ FLAG_ENTRY0("EFUSE_READ_BAD_ADDR", MES(EFUSE_READ_BAD_ADDR)),
+/* 8*/ FLAG_ENTRY0("EFUSE_WRITE", MES(EFUSE_WRITE)),
+/* 9*/ FLAG_ENTRY0("EFUSE_DONE_PARITY", MES(EFUSE_DONE_PARITY)),
+/*10*/ FLAG_ENTRY0("INVALID_EEP_CMD", MES(INVALID_EEP_CMD)),
+/*11*/ FLAG_ENTRY0("MBIST_FAIL", MES(MBIST_FAIL)),
+/*12*/ FLAG_ENTRY0("PLL_LOCK_FAIL", MES(PLL_LOCK_FAIL))
+};
+
+/*
+ * TXE PIO Error flags and consequences
+ *
+ * The middle FLAG_ENTRY argument is the 'extra' field and holds the SEC_*
+ * consequence bits of the error (0 when none is listed). The numbers in
+ * the leading comments skip the reserved values 30-31, so from entry 32
+ * onwards they no longer equal the array index.
+ */
+static struct flag_table pio_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY("PioWriteBadCtxt",
+       SEC_WRITE_DROPPED,
+       SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK),
+/* 1*/ FLAG_ENTRY("PioWriteAddrParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK),
+/* 2*/ FLAG_ENTRY("PioCsrParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK),
+/* 3*/ FLAG_ENTRY("PioSbMemFifo0",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK),
+/* 4*/ FLAG_ENTRY("PioSbMemFifo1",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK),
+/* 5*/ FLAG_ENTRY("PioPccFifoParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK),
+/* 6*/ FLAG_ENTRY("PioPecFifoParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK),
+/* 7*/ FLAG_ENTRY("PioSbrdctlCrrelParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK),
+/* 8*/ FLAG_ENTRY("PioSbrdctrlCrrelFifoParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK),
+/* 9*/ FLAG_ENTRY("PioPktEvictFifoParityErr",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK),
+/*10*/ FLAG_ENTRY("PioSmPktResetParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK),
+/*11*/ FLAG_ENTRY("PioVlLenMemBank0Unc",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK),
+/*12*/ FLAG_ENTRY("PioVlLenMemBank1Unc",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK),
+/*13*/ FLAG_ENTRY("PioVlLenMemBank0Cor",
+       0,
+       SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_COR_ERR_SMASK),
+/*14*/ FLAG_ENTRY("PioVlLenMemBank1Cor",
+       0,
+       SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_COR_ERR_SMASK),
+/*15*/ FLAG_ENTRY("PioCreditRetFifoParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK),
+/*16*/ FLAG_ENTRY("PioPpmcPblFifo",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK),
+/*17*/ FLAG_ENTRY("PioInitSmIn",
+       0,
+       SEND_PIO_ERR_STATUS_PIO_INIT_SM_IN_ERR_SMASK),
+/*18*/ FLAG_ENTRY("PioPktEvictSmOrArbSm",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK),
+/*19*/ FLAG_ENTRY("PioHostAddrMemUnc",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK),
+/*20*/ FLAG_ENTRY("PioHostAddrMemCor",
+       0,
+       SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_COR_ERR_SMASK),
+/*21*/ FLAG_ENTRY("PioWriteDataParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK),
+/*22*/ FLAG_ENTRY("PioStateMachine",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK),
+/*23*/ FLAG_ENTRY("PioWriteQwValidParity",
+       SEC_WRITE_DROPPED | SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK),
+/*24*/ FLAG_ENTRY("PioBlockQwCountParity",
+       SEC_WRITE_DROPPED | SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK),
+/*25*/ FLAG_ENTRY("PioVlfVlLenParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK),
+/*26*/ FLAG_ENTRY("PioVlfSopParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK),
+/*27*/ FLAG_ENTRY("PioVlFifoParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK),
+/*28*/ FLAG_ENTRY("PioPpmcBqcMemParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK),
+/*29*/ FLAG_ENTRY("PioPpmcSopLen",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK),
+/*30-31 reserved*/
+/*32*/ FLAG_ENTRY("PioCurrentFreeCntParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK),
+/*33*/ FLAG_ENTRY("PioLastReturnedCntParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK),
+/*34*/ FLAG_ENTRY("PioPccSopHeadParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK),
+/*35*/ FLAG_ENTRY("PioPecSopHeadParityErr",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK),
+/*36-63 reserved*/
+};
+
+/*
+ * TXE PIO errors that cause an SPC freeze.
+ * The 29 masks OR-ed here are the ones whose pio_err_status_flags entry
+ * carries SEC_SPC_FREEZE; keep the two lists in step when editing either.
+ */
+#define ALL_PIO_FREEZE_ERR \
+       (SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK)
+
+/*
+ * TXE SDMA Error flags
+ *
+ * Each entry pairs a printable name with one SEND_DMA_ERR_STATUS_*_SMASK
+ * value.  The leading numeric comments appear to give the bit position;
+ * positions 4-63 are marked reserved and have no entry.
+ */
+static struct flag_table sdma_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("SDmaRpyTagErr",
+               SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK),
+/* 1*/ FLAG_ENTRY0("SDmaCsrParityErr",
+               SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK),
+/* 2*/ FLAG_ENTRY0("SDmaPcieReqTrackingUncErr",
+               SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK),
+/* 3*/ FLAG_ENTRY0("SDmaPcieReqTrackingCorErr",
+               SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_COR_ERR_SMASK),
+/*04-63 reserved*/
+};
+
+/*
+ * TXE SDMA errors that cause an SPC freeze.
+ *
+ * Covers three of the four bits named in sdma_err_status_flags[]; the
+ * SDMA_PCIE_REQ_TRACKING_COR bit is not part of this mask.
+ */
+#define ALL_SDMA_FREEZE_ERR  \
+               (SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK \
+               | SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK \
+               | SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK)
+
+/*
+ * SendEgressErrInfo bits that correspond to a PortXmitDiscard counter:
+ * the TOO_LONG_IB_PACKET, VL_MAPPING and VL error-info masks OR-ed together.
+ */
+#define PORT_DISCARD_EGRESS_ERRS \
+       (SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK \
+       | SEND_EGRESS_ERR_INFO_VL_MAPPING_ERR_SMASK \
+       | SEND_EGRESS_ERR_INFO_VL_ERR_SMASK)
+
+/*
+ * TXE Egress Error flags
+ *
+ * SEES(text) expands to SEND_EGRESS_ERR_STATUS_<text>_ERR_SMASK.
+ * Positions 2, 6, 9 and 10 are marked reserved and have no entry, so an
+ * entry's array index is not the same as the number in its leading comment.
+ */
+#define SEES(text) SEND_EGRESS_ERR_STATUS_##text##_ERR_SMASK
+static struct flag_table egress_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("TxPktIntegrityMemCorErr", SEES(TX_PKT_INTEGRITY_MEM_COR)),
+/* 1*/ FLAG_ENTRY0("TxPktIntegrityMemUncErr", SEES(TX_PKT_INTEGRITY_MEM_UNC)),
+/* 2 reserved */
+/* 3*/ FLAG_ENTRY0("TxEgressFifoUnderrunOrParityErr",
+               SEES(TX_EGRESS_FIFO_UNDERRUN_OR_PARITY)),
+/* 4*/ FLAG_ENTRY0("TxLinkdownErr", SEES(TX_LINKDOWN)),
+/* 5*/ FLAG_ENTRY0("TxIncorrectLinkStateErr", SEES(TX_INCORRECT_LINK_STATE)),
+/* 6 reserved */
+/* 7*/ FLAG_ENTRY0("TxPioLaunchIntfParityErr",
+               SEES(TX_PIO_LAUNCH_INTF_PARITY)),
+/* 8*/ FLAG_ENTRY0("TxSdmaLaunchIntfParityErr",
+               SEES(TX_SDMA_LAUNCH_INTF_PARITY)),
+/* 9-10 reserved */
+/*11*/ FLAG_ENTRY0("TxSbrdCtlStateMachineParityErr",
+               SEES(TX_SBRD_CTL_STATE_MACHINE_PARITY)),
+/*12*/ FLAG_ENTRY0("TxIllegalVLErr", SEES(TX_ILLEGAL_VL)),
+/*13*/ FLAG_ENTRY0("TxLaunchCsrParityErr", SEES(TX_LAUNCH_CSR_PARITY)),
+/*14*/ FLAG_ENTRY0("TxSbrdCtlCsrParityErr", SEES(TX_SBRD_CTL_CSR_PARITY)),
+/*15*/ FLAG_ENTRY0("TxConfigParityErr", SEES(TX_CONFIG_PARITY)),
+/*16*/ FLAG_ENTRY0("TxSdma0DisallowedPacketErr",
+               SEES(TX_SDMA0_DISALLOWED_PACKET)),
+/*17*/ FLAG_ENTRY0("TxSdma1DisallowedPacketErr",
+               SEES(TX_SDMA1_DISALLOWED_PACKET)),
+/*18*/ FLAG_ENTRY0("TxSdma2DisallowedPacketErr",
+               SEES(TX_SDMA2_DISALLOWED_PACKET)),
+/*19*/ FLAG_ENTRY0("TxSdma3DisallowedPacketErr",
+               SEES(TX_SDMA3_DISALLOWED_PACKET)),
+/*20*/ FLAG_ENTRY0("TxSdma4DisallowedPacketErr",
+               SEES(TX_SDMA4_DISALLOWED_PACKET)),
+/*21*/ FLAG_ENTRY0("TxSdma5DisallowedPacketErr",
+               SEES(TX_SDMA5_DISALLOWED_PACKET)),
+/*22*/ FLAG_ENTRY0("TxSdma6DisallowedPacketErr",
+               SEES(TX_SDMA6_DISALLOWED_PACKET)),
+/*23*/ FLAG_ENTRY0("TxSdma7DisallowedPacketErr",
+               SEES(TX_SDMA7_DISALLOWED_PACKET)),
+/*24*/ FLAG_ENTRY0("TxSdma8DisallowedPacketErr",
+               SEES(TX_SDMA8_DISALLOWED_PACKET)),
+/*25*/ FLAG_ENTRY0("TxSdma9DisallowedPacketErr",
+               SEES(TX_SDMA9_DISALLOWED_PACKET)),
+/*26*/ FLAG_ENTRY0("TxSdma10DisallowedPacketErr",
+               SEES(TX_SDMA10_DISALLOWED_PACKET)),
+/*27*/ FLAG_ENTRY0("TxSdma11DisallowedPacketErr",
+               SEES(TX_SDMA11_DISALLOWED_PACKET)),
+/*28*/ FLAG_ENTRY0("TxSdma12DisallowedPacketErr",
+               SEES(TX_SDMA12_DISALLOWED_PACKET)),
+/*29*/ FLAG_ENTRY0("TxSdma13DisallowedPacketErr",
+               SEES(TX_SDMA13_DISALLOWED_PACKET)),
+/*30*/ FLAG_ENTRY0("TxSdma14DisallowedPacketErr",
+               SEES(TX_SDMA14_DISALLOWED_PACKET)),
+/*31*/ FLAG_ENTRY0("TxSdma15DisallowedPacketErr",
+               SEES(TX_SDMA15_DISALLOWED_PACKET)),
+/*32*/ FLAG_ENTRY0("TxLaunchFifo0UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO0_UNC_OR_PARITY)),
+/*33*/ FLAG_ENTRY0("TxLaunchFifo1UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO1_UNC_OR_PARITY)),
+/*34*/ FLAG_ENTRY0("TxLaunchFifo2UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO2_UNC_OR_PARITY)),
+/*35*/ FLAG_ENTRY0("TxLaunchFifo3UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO3_UNC_OR_PARITY)),
+/*36*/ FLAG_ENTRY0("TxLaunchFifo4UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO4_UNC_OR_PARITY)),
+/*37*/ FLAG_ENTRY0("TxLaunchFifo5UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO5_UNC_OR_PARITY)),
+/*38*/ FLAG_ENTRY0("TxLaunchFifo6UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO6_UNC_OR_PARITY)),
+/*39*/ FLAG_ENTRY0("TxLaunchFifo7UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO7_UNC_OR_PARITY)),
+/*40*/ FLAG_ENTRY0("TxLaunchFifo8UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO8_UNC_OR_PARITY)),
+/*41*/ FLAG_ENTRY0("TxCreditReturnParityErr", SEES(TX_CREDIT_RETURN_PARITY)),
+/*42*/ FLAG_ENTRY0("TxSbHdrUncErr", SEES(TX_SB_HDR_UNC)),
+/*43*/ FLAG_ENTRY0("TxReadSdmaMemoryUncErr", SEES(TX_READ_SDMA_MEMORY_UNC)),
+/*44*/ FLAG_ENTRY0("TxReadPioMemoryUncErr", SEES(TX_READ_PIO_MEMORY_UNC)),
+/*45*/ FLAG_ENTRY0("TxEgressFifoUncErr", SEES(TX_EGRESS_FIFO_UNC)),
+/*46*/ FLAG_ENTRY0("TxHcrcInsertionErr", SEES(TX_HCRC_INSERTION)),
+/*47*/ FLAG_ENTRY0("TxCreditReturnVLErr", SEES(TX_CREDIT_RETURN_VL)),
+/*48*/ FLAG_ENTRY0("TxLaunchFifo0CorErr", SEES(TX_LAUNCH_FIFO0_COR)),
+/*49*/ FLAG_ENTRY0("TxLaunchFifo1CorErr", SEES(TX_LAUNCH_FIFO1_COR)),
+/*50*/ FLAG_ENTRY0("TxLaunchFifo2CorErr", SEES(TX_LAUNCH_FIFO2_COR)),
+/*51*/ FLAG_ENTRY0("TxLaunchFifo3CorErr", SEES(TX_LAUNCH_FIFO3_COR)),
+/*52*/ FLAG_ENTRY0("TxLaunchFifo4CorErr", SEES(TX_LAUNCH_FIFO4_COR)),
+/*53*/ FLAG_ENTRY0("TxLaunchFifo5CorErr", SEES(TX_LAUNCH_FIFO5_COR)),
+/*54*/ FLAG_ENTRY0("TxLaunchFifo6CorErr", SEES(TX_LAUNCH_FIFO6_COR)),
+/*55*/ FLAG_ENTRY0("TxLaunchFifo7CorErr", SEES(TX_LAUNCH_FIFO7_COR)),
+/*56*/ FLAG_ENTRY0("TxLaunchFifo8CorErr", SEES(TX_LAUNCH_FIFO8_COR)),
+/*57*/ FLAG_ENTRY0("TxCreditOverrunErr", SEES(TX_CREDIT_OVERRUN)),
+/*58*/ FLAG_ENTRY0("TxSbHdrCorErr", SEES(TX_SB_HDR_COR)),
+/*59*/ FLAG_ENTRY0("TxReadSdmaMemoryCorErr", SEES(TX_READ_SDMA_MEMORY_COR)),
+/*60*/ FLAG_ENTRY0("TxReadPioMemoryCorErr", SEES(TX_READ_PIO_MEMORY_COR)),
+/*61*/ FLAG_ENTRY0("TxEgressFifoCorErr", SEES(TX_EGRESS_FIFO_COR)),
+/*62*/ FLAG_ENTRY0("TxReadSdmaMemoryCsrUncErr",
+               SEES(TX_READ_SDMA_MEMORY_CSR_UNC)),
+/*63*/ FLAG_ENTRY0("TxReadPioMemoryCsrUncErr",
+               SEES(TX_READ_PIO_MEMORY_CSR_UNC)),
+};
+
+/*
+ * TXE Egress Error Info flags
+ *
+ * SEEI(text) expands to SEND_EGRESS_ERR_INFO_<text>_ERR_SMASK.
+ *
+ * NOTE(review): entry 3 repeats entry 2 ("JobKeyErr", JOB_KEY) and entry 21
+ * ("BypassBadPktLenErr") reuses the BAD_PKT_LEN mask of entry 17.  Confirm
+ * against the register definitions whether these are intentional.
+ */
+#define SEEI(text) SEND_EGRESS_ERR_INFO_##text##_ERR_SMASK
+static struct flag_table egress_err_info_flags[] = {
+/* 0*/ FLAG_ENTRY0("Reserved", 0ull),
+/* 1*/ FLAG_ENTRY0("VLErr", SEEI(VL)),
+/* 2*/ FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)),
+/* 3*/ FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)),
+/* 4*/ FLAG_ENTRY0("PartitionKeyErr", SEEI(PARTITION_KEY)),
+/* 5*/ FLAG_ENTRY0("SLIDErr", SEEI(SLID)),
+/* 6*/ FLAG_ENTRY0("OpcodeErr", SEEI(OPCODE)),
+/* 7*/ FLAG_ENTRY0("VLMappingErr", SEEI(VL_MAPPING)),
+/* 8*/ FLAG_ENTRY0("RawErr", SEEI(RAW)),
+/* 9*/ FLAG_ENTRY0("RawIPv6Err", SEEI(RAW_IPV6)),
+/*10*/ FLAG_ENTRY0("GRHErr", SEEI(GRH)),
+/*11*/ FLAG_ENTRY0("BypassErr", SEEI(BYPASS)),
+/*12*/ FLAG_ENTRY0("KDETHPacketsErr", SEEI(KDETH_PACKETS)),
+/*13*/ FLAG_ENTRY0("NonKDETHPacketsErr", SEEI(NON_KDETH_PACKETS)),
+/*14*/ FLAG_ENTRY0("TooSmallIBPacketsErr", SEEI(TOO_SMALL_IB_PACKETS)),
+/*15*/ FLAG_ENTRY0("TooSmallBypassPacketsErr", SEEI(TOO_SMALL_BYPASS_PACKETS)),
+/*16*/ FLAG_ENTRY0("PbcTestErr", SEEI(PBC_TEST)),
+/*17*/ FLAG_ENTRY0("BadPktLenErr", SEEI(BAD_PKT_LEN)),
+/*18*/ FLAG_ENTRY0("TooLongIBPacketErr", SEEI(TOO_LONG_IB_PACKET)),
+/*19*/ FLAG_ENTRY0("TooLongBypassPacketsErr", SEEI(TOO_LONG_BYPASS_PACKETS)),
+/*20*/ FLAG_ENTRY0("PbcStaticRateControlErr", SEEI(PBC_STATIC_RATE_CONTROL)),
+/*21*/ FLAG_ENTRY0("BypassBadPktLenErr", SEEI(BAD_PKT_LEN)),
+};
+
+/*
+ * TXE Egress errors that cause an SPC freeze.
+ *
+ * Written with the SEES() shorthand, so each term is a
+ * SEND_EGRESS_ERR_STATUS_*_ERR_SMASK value and SEES must still be defined
+ * wherever this macro is expanded.
+ */
+#define ALL_TXE_EGRESS_FREEZE_ERR \
+       (SEES(TX_EGRESS_FIFO_UNDERRUN_OR_PARITY) \
+       | SEES(TX_PIO_LAUNCH_INTF_PARITY) \
+       | SEES(TX_SDMA_LAUNCH_INTF_PARITY) \
+       | SEES(TX_SBRD_CTL_STATE_MACHINE_PARITY) \
+       | SEES(TX_LAUNCH_CSR_PARITY) \
+       | SEES(TX_SBRD_CTL_CSR_PARITY) \
+       | SEES(TX_CONFIG_PARITY) \
+       | SEES(TX_LAUNCH_FIFO0_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO1_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO2_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO3_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO4_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO5_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO6_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO7_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO8_UNC_OR_PARITY) \
+       | SEES(TX_CREDIT_RETURN_PARITY))
+
+/*
+ * TXE Send error flags
+ *
+ * SES(name) expands to SEND_ERR_STATUS_SEND_<name>_ERR_SMASK.
+ */
+#define SES(name) SEND_ERR_STATUS_SEND_##name##_ERR_SMASK
+static struct flag_table send_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("SendCsrParityErr", SES(CSR_PARITY)),
+/* 1*/ FLAG_ENTRY0("SendCsrReadBadAddrErr", SES(CSR_READ_BAD_ADDR)),
+/* 2*/ FLAG_ENTRY0("SendCsrWriteBadAddrErr", SES(CSR_WRITE_BAD_ADDR))
+};
+
+/*
+ * TXE Send Context Error flags and consequences
+ *
+ * The middle FLAG_ENTRY argument is the set of SEC_* consequence bits.  Every
+ * entry here carries SEC_SC_HALTED; entries 0-1 add SEC_PACKET_DROPPED and
+ * entries 2-4 add SEC_WRITE_DROPPED.
+ */
+static struct flag_table sc_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY("InconsistentSop",
+               SEC_PACKET_DROPPED | SEC_SC_HALTED,
+               SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK),
+/* 1*/ FLAG_ENTRY("DisallowedPacket",
+               SEC_PACKET_DROPPED | SEC_SC_HALTED,
+               SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK),
+/* 2*/ FLAG_ENTRY("WriteCrossesBoundary",
+               SEC_WRITE_DROPPED | SEC_SC_HALTED,
+               SEND_CTXT_ERR_STATUS_PIO_WRITE_CROSSES_BOUNDARY_ERR_SMASK),
+/* 3*/ FLAG_ENTRY("WriteOverflow",
+               SEC_WRITE_DROPPED | SEC_SC_HALTED,
+               SEND_CTXT_ERR_STATUS_PIO_WRITE_OVERFLOW_ERR_SMASK),
+/* 4*/ FLAG_ENTRY("WriteOutOfBounds",
+               SEC_WRITE_DROPPED | SEC_SC_HALTED,
+               SEND_CTXT_ERR_STATUS_PIO_WRITE_OUT_OF_BOUNDS_ERR_SMASK),
+/* 5-63 reserved*/
+};
+
+/*
+ * RXE Receive Error flags
+ *
+ * RXES(name) expands to RCV_ERR_STATUS_RX_<name>_ERR_SMASK.  The table has
+ * one entry for each of the 64 numbered positions (0-63).
+ */
+#define RXES(name) RCV_ERR_STATUS_RX_##name##_ERR_SMASK
+static struct flag_table rxe_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("RxDmaCsrCorErr", RXES(DMA_CSR_COR)),
+/* 1*/ FLAG_ENTRY0("RxDcIntfParityErr", RXES(DC_INTF_PARITY)),
+/* 2*/ FLAG_ENTRY0("RxRcvHdrUncErr", RXES(RCV_HDR_UNC)),
+/* 3*/ FLAG_ENTRY0("RxRcvHdrCorErr", RXES(RCV_HDR_COR)),
+/* 4*/ FLAG_ENTRY0("RxRcvDataUncErr", RXES(RCV_DATA_UNC)),
+/* 5*/ FLAG_ENTRY0("RxRcvDataCorErr", RXES(RCV_DATA_COR)),
+/* 6*/ FLAG_ENTRY0("RxRcvQpMapTableUncErr", RXES(RCV_QP_MAP_TABLE_UNC)),
+/* 7*/ FLAG_ENTRY0("RxRcvQpMapTableCorErr", RXES(RCV_QP_MAP_TABLE_COR)),
+/* 8*/ FLAG_ENTRY0("RxRcvCsrParityErr", RXES(RCV_CSR_PARITY)),
+/* 9*/ FLAG_ENTRY0("RxDcSopEopParityErr", RXES(DC_SOP_EOP_PARITY)),
+/*10*/ FLAG_ENTRY0("RxDmaFlagUncErr", RXES(DMA_FLAG_UNC)),
+/*11*/ FLAG_ENTRY0("RxDmaFlagCorErr", RXES(DMA_FLAG_COR)),
+/*12*/ FLAG_ENTRY0("RxRcvFsmEncodingErr", RXES(RCV_FSM_ENCODING)),
+/*13*/ FLAG_ENTRY0("RxRbufFreeListUncErr", RXES(RBUF_FREE_LIST_UNC)),
+/*14*/ FLAG_ENTRY0("RxRbufFreeListCorErr", RXES(RBUF_FREE_LIST_COR)),
+/*15*/ FLAG_ENTRY0("RxRbufLookupDesRegUncErr", RXES(RBUF_LOOKUP_DES_REG_UNC)),
+/*16*/ FLAG_ENTRY0("RxRbufLookupDesRegUncCorErr",
+               RXES(RBUF_LOOKUP_DES_REG_UNC_COR)),
+/*17*/ FLAG_ENTRY0("RxRbufLookupDesUncErr", RXES(RBUF_LOOKUP_DES_UNC)),
+/*18*/ FLAG_ENTRY0("RxRbufLookupDesCorErr", RXES(RBUF_LOOKUP_DES_COR)),
+/*19*/ FLAG_ENTRY0("RxRbufBlockListReadUncErr",
+               RXES(RBUF_BLOCK_LIST_READ_UNC)),
+/*20*/ FLAG_ENTRY0("RxRbufBlockListReadCorErr",
+               RXES(RBUF_BLOCK_LIST_READ_COR)),
+/*21*/ FLAG_ENTRY0("RxRbufCsrQHeadBufNumParityErr",
+               RXES(RBUF_CSR_QHEAD_BUF_NUM_PARITY)),
+/*22*/ FLAG_ENTRY0("RxRbufCsrQEntCntParityErr",
+               RXES(RBUF_CSR_QENT_CNT_PARITY)),
+/*23*/ FLAG_ENTRY0("RxRbufCsrQNextBufParityErr",
+               RXES(RBUF_CSR_QNEXT_BUF_PARITY)),
+/*24*/ FLAG_ENTRY0("RxRbufCsrQVldBitParityErr",
+               RXES(RBUF_CSR_QVLD_BIT_PARITY)),
+/*25*/ FLAG_ENTRY0("RxRbufCsrQHdPtrParityErr", RXES(RBUF_CSR_QHD_PTR_PARITY)),
+/*26*/ FLAG_ENTRY0("RxRbufCsrQTlPtrParityErr", RXES(RBUF_CSR_QTL_PTR_PARITY)),
+/*27*/ FLAG_ENTRY0("RxRbufCsrQNumOfPktParityErr",
+               RXES(RBUF_CSR_QNUM_OF_PKT_PARITY)),
+/*28*/ FLAG_ENTRY0("RxRbufCsrQEOPDWParityErr", RXES(RBUF_CSR_QEOPDW_PARITY)),
+/*29*/ FLAG_ENTRY0("RxRbufCtxIdParityErr", RXES(RBUF_CTX_ID_PARITY)),
+/*30*/ FLAG_ENTRY0("RxRBufBadLookupErr", RXES(RBUF_BAD_LOOKUP)),
+/*31*/ FLAG_ENTRY0("RxRbufFullErr", RXES(RBUF_FULL)),
+/*32*/ FLAG_ENTRY0("RxRbufEmptyErr", RXES(RBUF_EMPTY)),
+/*33*/ FLAG_ENTRY0("RxRbufFlRdAddrParityErr", RXES(RBUF_FL_RD_ADDR_PARITY)),
+/*34*/ FLAG_ENTRY0("RxRbufFlWrAddrParityErr", RXES(RBUF_FL_WR_ADDR_PARITY)),
+/*35*/ FLAG_ENTRY0("RxRbufFlInitdoneParityErr",
+               RXES(RBUF_FL_INITDONE_PARITY)),
+/*36*/ FLAG_ENTRY0("RxRbufFlInitWrAddrParityErr",
+               RXES(RBUF_FL_INIT_WR_ADDR_PARITY)),
+/*37*/ FLAG_ENTRY0("RxRbufNextFreeBufUncErr", RXES(RBUF_NEXT_FREE_BUF_UNC)),
+/*38*/ FLAG_ENTRY0("RxRbufNextFreeBufCorErr", RXES(RBUF_NEXT_FREE_BUF_COR)),
+/*39*/ FLAG_ENTRY0("RxLookupDesPart1UncErr", RXES(LOOKUP_DES_PART1_UNC)),
+/*40*/ FLAG_ENTRY0("RxLookupDesPart1UncCorErr",
+               RXES(LOOKUP_DES_PART1_UNC_COR)),
+/*41*/ FLAG_ENTRY0("RxLookupDesPart2ParityErr",
+               RXES(LOOKUP_DES_PART2_PARITY)),
+/*42*/ FLAG_ENTRY0("RxLookupRcvArrayUncErr", RXES(LOOKUP_RCV_ARRAY_UNC)),
+/*43*/ FLAG_ENTRY0("RxLookupRcvArrayCorErr", RXES(LOOKUP_RCV_ARRAY_COR)),
+/*44*/ FLAG_ENTRY0("RxLookupCsrParityErr", RXES(LOOKUP_CSR_PARITY)),
+/*45*/ FLAG_ENTRY0("RxHqIntrCsrParityErr", RXES(HQ_INTR_CSR_PARITY)),
+/*46*/ FLAG_ENTRY0("RxHqIntrFsmErr", RXES(HQ_INTR_FSM)),
+/*47*/ FLAG_ENTRY0("RxRbufDescPart1UncErr", RXES(RBUF_DESC_PART1_UNC)),
+/*48*/ FLAG_ENTRY0("RxRbufDescPart1CorErr", RXES(RBUF_DESC_PART1_COR)),
+/*49*/ FLAG_ENTRY0("RxRbufDescPart2UncErr", RXES(RBUF_DESC_PART2_UNC)),
+/*50*/ FLAG_ENTRY0("RxRbufDescPart2CorErr", RXES(RBUF_DESC_PART2_COR)),
+/*51*/ FLAG_ENTRY0("RxDmaHdrFifoRdUncErr", RXES(DMA_HDR_FIFO_RD_UNC)),
+/*52*/ FLAG_ENTRY0("RxDmaHdrFifoRdCorErr", RXES(DMA_HDR_FIFO_RD_COR)),
+/*53*/ FLAG_ENTRY0("RxDmaDataFifoRdUncErr", RXES(DMA_DATA_FIFO_RD_UNC)),
+/*54*/ FLAG_ENTRY0("RxDmaDataFifoRdCorErr", RXES(DMA_DATA_FIFO_RD_COR)),
+/*55*/ FLAG_ENTRY0("RxRbufDataUncErr", RXES(RBUF_DATA_UNC)),
+/*56*/ FLAG_ENTRY0("RxRbufDataCorErr", RXES(RBUF_DATA_COR)),
+/*57*/ FLAG_ENTRY0("RxDmaCsrParityErr", RXES(DMA_CSR_PARITY)),
+/*58*/ FLAG_ENTRY0("RxDmaEqFsmEncodingErr", RXES(DMA_EQ_FSM_ENCODING)),
+/*59*/ FLAG_ENTRY0("RxDmaDqFsmEncodingErr", RXES(DMA_DQ_FSM_ENCODING)),
+/*60*/ FLAG_ENTRY0("RxDmaCsrUncErr", RXES(DMA_CSR_UNC)),
+/*61*/ FLAG_ENTRY0("RxCsrReadBadAddrErr", RXES(CSR_READ_BAD_ADDR)),
+/*62*/ FLAG_ENTRY0("RxCsrWriteBadAddrErr", RXES(CSR_WRITE_BAD_ADDR)),
+/*63*/ FLAG_ENTRY0("RxCsrParityErr", RXES(CSR_PARITY))
+};
+
+/*
+ * RXE errors that will trigger an SPC freeze.
+ *
+ * Bitwise OR of the RCV_ERR_STATUS_RX_*_ERR_SMASK values listed below.
+ */
+#define ALL_RXE_FREEZE_ERR  \
+       (RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RCV_CSR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_DMA_FLAG_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RCV_FSM_ENCODING_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_FREE_LIST_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QHEAD_BUF_NUM_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QENT_CNT_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QNEXT_BUF_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QVLD_BIT_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QHD_PTR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QTL_PTR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QNUM_OF_PKT_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QEOPDW_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CTX_ID_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_BAD_LOOKUP_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_FULL_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_EMPTY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_FL_RD_ADDR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_FL_WR_ADDR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_FL_INITDONE_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_COR_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_LOOKUP_DES_PART2_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_LOOKUP_CSR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_HQ_INTR_CSR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_HQ_INTR_FSM_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_DESC_PART1_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_DESC_PART1_COR_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_DESC_PART2_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_DATA_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_DMA_CSR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_DMA_EQ_FSM_ENCODING_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_DMA_DQ_FSM_ENCODING_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_CSR_PARITY_ERR_SMASK)
+
+#define RXE_FREEZE_ABORT_MASK /* all three bits are in ALL_RXE_FREEZE_ERR */ \
+       (RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK | \
+       RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK | \
+       RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK)
+
+/*
+ * DCC Error Flags
+ *
+ * DCCE(name) expands to DCC_ERR_FLG_<name>_SMASK.  The entries carry no
+ * position comments; each one is identified only by its mask.
+ */
+#define DCCE(name) DCC_ERR_FLG_##name##_SMASK
+static struct flag_table dcc_err_flags[] = {
+       FLAG_ENTRY0("bad_l2_err", DCCE(BAD_L2_ERR)),
+       FLAG_ENTRY0("bad_sc_err", DCCE(BAD_SC_ERR)),
+       FLAG_ENTRY0("bad_mid_tail_err", DCCE(BAD_MID_TAIL_ERR)),
+       FLAG_ENTRY0("bad_preemption_err", DCCE(BAD_PREEMPTION_ERR)),
+       FLAG_ENTRY0("preemption_err", DCCE(PREEMPTION_ERR)),
+       FLAG_ENTRY0("preemptionvl15_err", DCCE(PREEMPTIONVL15_ERR)),
+       FLAG_ENTRY0("bad_vl_marker_err", DCCE(BAD_VL_MARKER_ERR)),
+       FLAG_ENTRY0("bad_dlid_target_err", DCCE(BAD_DLID_TARGET_ERR)),
+       FLAG_ENTRY0("bad_lver_err", DCCE(BAD_LVER_ERR)),
+       FLAG_ENTRY0("uncorrectable_err", DCCE(UNCORRECTABLE_ERR)),
+       FLAG_ENTRY0("bad_crdt_ack_err", DCCE(BAD_CRDT_ACK_ERR)),
+       FLAG_ENTRY0("unsup_pkt_type", DCCE(UNSUP_PKT_TYPE)),
+       FLAG_ENTRY0("bad_ctrl_flit_err", DCCE(BAD_CTRL_FLIT_ERR)),
+       FLAG_ENTRY0("event_cntr_parity_err", DCCE(EVENT_CNTR_PARITY_ERR)),
+       FLAG_ENTRY0("event_cntr_rollover_err", DCCE(EVENT_CNTR_ROLLOVER_ERR)),
+       FLAG_ENTRY0("link_err", DCCE(LINK_ERR)),
+       FLAG_ENTRY0("misc_cntr_rollover_err", DCCE(MISC_CNTR_ROLLOVER_ERR)),
+       FLAG_ENTRY0("bad_ctrl_dist_err", DCCE(BAD_CTRL_DIST_ERR)),
+       FLAG_ENTRY0("bad_tail_dist_err", DCCE(BAD_TAIL_DIST_ERR)),
+       FLAG_ENTRY0("bad_head_dist_err", DCCE(BAD_HEAD_DIST_ERR)),
+       FLAG_ENTRY0("nonvl15_state_err", DCCE(NONVL15_STATE_ERR)),
+       FLAG_ENTRY0("vl15_multi_err", DCCE(VL15_MULTI_ERR)),
+       FLAG_ENTRY0("bad_pkt_length_err", DCCE(BAD_PKT_LENGTH_ERR)),
+       FLAG_ENTRY0("unsup_vl_err", DCCE(UNSUP_VL_ERR)),
+       FLAG_ENTRY0("perm_nvl15_err", DCCE(PERM_NVL15_ERR)),
+       FLAG_ENTRY0("slid_zero_err", DCCE(SLID_ZERO_ERR)),
+       FLAG_ENTRY0("dlid_zero_err", DCCE(DLID_ZERO_ERR)),
+       FLAG_ENTRY0("length_mtu_err", DCCE(LENGTH_MTU_ERR)),
+       FLAG_ENTRY0("rx_early_drop_err", DCCE(RX_EARLY_DROP_ERR)),
+       FLAG_ENTRY0("late_short_err", DCCE(LATE_SHORT_ERR)),
+       FLAG_ENTRY0("late_long_err", DCCE(LATE_LONG_ERR)),
+       FLAG_ENTRY0("late_ebp_err", DCCE(LATE_EBP_ERR)),
+       FLAG_ENTRY0("fpe_tx_fifo_ovflw_err", DCCE(FPE_TX_FIFO_OVFLW_ERR)),
+       FLAG_ENTRY0("fpe_tx_fifo_unflw_err", DCCE(FPE_TX_FIFO_UNFLW_ERR)),
+       FLAG_ENTRY0("csr_access_blocked_host", DCCE(CSR_ACCESS_BLOCKED_HOST)),
+       FLAG_ENTRY0("csr_access_blocked_uc", DCCE(CSR_ACCESS_BLOCKED_UC)),
+       FLAG_ENTRY0("tx_ctrl_parity_err", DCCE(TX_CTRL_PARITY_ERR)),
+       FLAG_ENTRY0("tx_ctrl_parity_mbe_err", DCCE(TX_CTRL_PARITY_MBE_ERR)),
+       FLAG_ENTRY0("tx_sc_parity_err", DCCE(TX_SC_PARITY_ERR)),
+       FLAG_ENTRY0("rx_ctrl_parity_mbe_err", DCCE(RX_CTRL_PARITY_MBE_ERR)),
+       FLAG_ENTRY0("csr_parity_err", DCCE(CSR_PARITY_ERR)),
+       FLAG_ENTRY0("csr_inval_addr", DCCE(CSR_INVAL_ADDR)),
+       FLAG_ENTRY0("tx_byte_shft_parity_err", DCCE(TX_BYTE_SHFT_PARITY_ERR)),
+       FLAG_ENTRY0("rx_byte_shft_parity_err", DCCE(RX_BYTE_SHFT_PARITY_ERR)),
+       FLAG_ENTRY0("fmconfig_err", DCCE(FMCONFIG_ERR)),
+       FLAG_ENTRY0("rcvport_err", DCCE(RCVPORT_ERR)),
+};
+
+/*
+ * LCB error flags
+ *
+ * LCBE(name) expands to DC_LCB_ERR_FLG_<name>_SMASK.  Each entry's printable
+ * name is the same text as the <name> passed to LCBE().
+ */
+#define LCBE(name) DC_LCB_ERR_FLG_##name##_SMASK
+static struct flag_table lcb_err_flags[] = {
+/* 0*/ FLAG_ENTRY0("CSR_PARITY_ERR", LCBE(CSR_PARITY_ERR)),
+/* 1*/ FLAG_ENTRY0("INVALID_CSR_ADDR", LCBE(INVALID_CSR_ADDR)),
+/* 2*/ FLAG_ENTRY0("RST_FOR_FAILED_DESKEW", LCBE(RST_FOR_FAILED_DESKEW)),
+/* 3*/ FLAG_ENTRY0("ALL_LNS_FAILED_REINIT_TEST",
+               LCBE(ALL_LNS_FAILED_REINIT_TEST)),
+/* 4*/ FLAG_ENTRY0("LOST_REINIT_STALL_OR_TOS", LCBE(LOST_REINIT_STALL_OR_TOS)),
+/* 5*/ FLAG_ENTRY0("TX_LESS_THAN_FOUR_LNS", LCBE(TX_LESS_THAN_FOUR_LNS)),
+/* 6*/ FLAG_ENTRY0("RX_LESS_THAN_FOUR_LNS", LCBE(RX_LESS_THAN_FOUR_LNS)),
+/* 7*/ FLAG_ENTRY0("SEQ_CRC_ERR", LCBE(SEQ_CRC_ERR)),
+/* 8*/ FLAG_ENTRY0("REINIT_FROM_PEER", LCBE(REINIT_FROM_PEER)),
+/* 9*/ FLAG_ENTRY0("REINIT_FOR_LN_DEGRADE", LCBE(REINIT_FOR_LN_DEGRADE)),
+/*10*/ FLAG_ENTRY0("CRC_ERR_CNT_HIT_LIMIT", LCBE(CRC_ERR_CNT_HIT_LIMIT)),
+/*11*/ FLAG_ENTRY0("RCLK_STOPPED", LCBE(RCLK_STOPPED)),
+/*12*/ FLAG_ENTRY0("UNEXPECTED_REPLAY_MARKER", LCBE(UNEXPECTED_REPLAY_MARKER)),
+/*13*/ FLAG_ENTRY0("UNEXPECTED_ROUND_TRIP_MARKER",
+               LCBE(UNEXPECTED_ROUND_TRIP_MARKER)),
+/*14*/ FLAG_ENTRY0("ILLEGAL_NULL_LTP", LCBE(ILLEGAL_NULL_LTP)),
+/*15*/ FLAG_ENTRY0("ILLEGAL_FLIT_ENCODING", LCBE(ILLEGAL_FLIT_ENCODING)),
+/*16*/ FLAG_ENTRY0("FLIT_INPUT_BUF_OFLW", LCBE(FLIT_INPUT_BUF_OFLW)),
+/*17*/ FLAG_ENTRY0("VL_ACK_INPUT_BUF_OFLW", LCBE(VL_ACK_INPUT_BUF_OFLW)),
+/*18*/ FLAG_ENTRY0("VL_ACK_INPUT_PARITY_ERR", LCBE(VL_ACK_INPUT_PARITY_ERR)),
+/*19*/ FLAG_ENTRY0("VL_ACK_INPUT_WRONG_CRC_MODE",
+               LCBE(VL_ACK_INPUT_WRONG_CRC_MODE)),
+/*20*/ FLAG_ENTRY0("FLIT_INPUT_BUF_MBE", LCBE(FLIT_INPUT_BUF_MBE)),
+/*21*/ FLAG_ENTRY0("FLIT_INPUT_BUF_SBE", LCBE(FLIT_INPUT_BUF_SBE)),
+/*22*/ FLAG_ENTRY0("REPLAY_BUF_MBE", LCBE(REPLAY_BUF_MBE)),
+/*23*/ FLAG_ENTRY0("REPLAY_BUF_SBE", LCBE(REPLAY_BUF_SBE)),
+/*24*/ FLAG_ENTRY0("CREDIT_RETURN_FLIT_MBE", LCBE(CREDIT_RETURN_FLIT_MBE)),
+/*25*/ FLAG_ENTRY0("RST_FOR_LINK_TIMEOUT", LCBE(RST_FOR_LINK_TIMEOUT)),
+/*26*/ FLAG_ENTRY0("RST_FOR_INCOMPLT_RND_TRIP",
+               LCBE(RST_FOR_INCOMPLT_RND_TRIP)),
+/*27*/ FLAG_ENTRY0("HOLD_REINIT", LCBE(HOLD_REINIT)),
+/*28*/ FLAG_ENTRY0("NEG_EDGE_LINK_TRANSFER_ACTIVE",
+               LCBE(NEG_EDGE_LINK_TRANSFER_ACTIVE)),
+/*29*/ FLAG_ENTRY0("REDUNDANT_FLIT_PARITY_ERR",
+               LCBE(REDUNDANT_FLIT_PARITY_ERR))
+};
+
+/*
+ * DC8051 Error Flags
+ *
+ * D8E(name) expands to DC_DC8051_ERR_FLG_<name>_SMASK.
+ */
+#define D8E(name) DC_DC8051_ERR_FLG_##name##_SMASK
+static struct flag_table dc8051_err_flags[] = {
+       FLAG_ENTRY0("SET_BY_8051", D8E(SET_BY_8051)),
+       FLAG_ENTRY0("LOST_8051_HEART_BEAT", D8E(LOST_8051_HEART_BEAT)),
+       FLAG_ENTRY0("CRAM_MBE", D8E(CRAM_MBE)),
+       FLAG_ENTRY0("CRAM_SBE", D8E(CRAM_SBE)),
+       FLAG_ENTRY0("DRAM_MBE", D8E(DRAM_MBE)),
+       FLAG_ENTRY0("DRAM_SBE", D8E(DRAM_SBE)),
+       FLAG_ENTRY0("IRAM_MBE", D8E(IRAM_MBE)),
+       FLAG_ENTRY0("IRAM_SBE", D8E(IRAM_SBE)),
+       FLAG_ENTRY0("UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES",
+                   D8E(UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES)),
+       FLAG_ENTRY0("INVALID_CSR_ADDR", D8E(INVALID_CSR_ADDR)),
+};
+
+/*
+ * DC8051 Information Error flags
+ *
+ * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR field.
+ * The flag values are plain named constants (SPICO_ROM_FAILED, ...) rather
+ * than *_SMASK register masks.
+ */
+static struct flag_table dc8051_info_err_flags[] = {
+       FLAG_ENTRY0("Spico ROM check failed",  SPICO_ROM_FAILED),
+       FLAG_ENTRY0("Unknown frame received",  UNKNOWN_FRAME),
+       FLAG_ENTRY0("Target BER not met",      TARGET_BER_NOT_MET),
+       FLAG_ENTRY0("Serdes internal loopback failure",
+                   FAILED_SERDES_INTERNAL_LOOPBACK),
+       FLAG_ENTRY0("Failed SerDes init",      FAILED_SERDES_INIT),
+       FLAG_ENTRY0("Failed LNI(Polling)",     FAILED_LNI_POLLING),
+       FLAG_ENTRY0("Failed LNI(Debounce)",    FAILED_LNI_DEBOUNCE),
+       FLAG_ENTRY0("Failed LNI(EstbComm)",    FAILED_LNI_ESTBCOMM),
+       FLAG_ENTRY0("Failed LNI(OptEq)",       FAILED_LNI_OPTEQ),
+       FLAG_ENTRY0("Failed LNI(VerifyCap_1)", FAILED_LNI_VERIFY_CAP1),
+       FLAG_ENTRY0("Failed LNI(VerifyCap_2)", FAILED_LNI_VERIFY_CAP2),
+       FLAG_ENTRY0("Failed LNI(ConfigLT)",    FAILED_LNI_CONFIGLT),
+       FLAG_ENTRY0("Host Handshake Timeout",  HOST_HANDSHAKE_TIMEOUT)
+};
+
+/*
+ * DC8051 Information Host Information flags
+ *
+ * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG field.
+ * The masks are literal single-bit values, 0x0001 through 0x0100.
+ */
+static struct flag_table dc8051_info_host_msg_flags[] = {
+       FLAG_ENTRY0("Host request done", 0x0001),
+       FLAG_ENTRY0("BC SMA message", 0x0002),
+       FLAG_ENTRY0("BC PWR_MGM message", 0x0004),
+       FLAG_ENTRY0("BC Unknown message (BCC)", 0x0008),
+       FLAG_ENTRY0("BC Unknown message (LCB)", 0x0010),
+       FLAG_ENTRY0("External device config request", 0x0020),
+       FLAG_ENTRY0("VerifyCap all frames received", 0x0040),
+       FLAG_ENTRY0("LinkUp achieved", 0x0080),
+       FLAG_ENTRY0("Link going down", 0x0100),
+};
+
+static u32 encoded_size(u32 size);
+static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate);
+static int set_physical_link_state(struct hfi1_devdata *dd, u64 state);
+static void read_vc_remote_phy(struct hfi1_devdata *dd, u8 *power_management,
+                              u8 *continuous);
+static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z,
+                                 u8 *vcu, u16 *vl15buf, u8 *crc_sizes);
+static void read_vc_remote_link_width(struct hfi1_devdata *dd,
+                                     u8 *remote_tx_rate, u16 *link_widths);
+static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits,
+                                    u8 *flag_bits, u16 *link_widths);
+static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id,
+                                 u8 *device_rev);
+static void read_mgmt_allowed(struct hfi1_devdata *dd, u8 *mgmt_allowed);
+static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx);
+static int read_tx_settings(struct hfi1_devdata *dd, u8 *enable_lane_tx,
+                           u8 *tx_polarity_inversion,
+                           u8 *rx_polarity_inversion, u8 *max_rate);
+static void handle_sdma_eng_err(struct hfi1_devdata *dd,
+                               unsigned int context, u64 err_status);
+static void handle_qsfp_int(struct hfi1_devdata *dd, u32 source, u64 reg);
+static void handle_dcc_err(struct hfi1_devdata *dd,
+                          unsigned int context, u64 err_status);
+static void handle_lcb_err(struct hfi1_devdata *dd,
+                          unsigned int context, u64 err_status);
+static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_txe_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void set_partition_keys(struct hfi1_pportdata *);
+static const char *link_state_name(u32 state);
+static const char *link_state_reason_name(struct hfi1_pportdata *ppd,
+                                         u32 state);
+static int do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data,
+                          u64 *out_data);
+static int read_idle_sma(struct hfi1_devdata *dd, u64 *data);
+static int thermal_init(struct hfi1_devdata *dd);
+
+static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
+                                 int msecs);
+static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc);
+static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr);
+static void handle_temp_err(struct hfi1_devdata *);
+static void dc_shutdown(struct hfi1_devdata *);
+static void dc_start(struct hfi1_devdata *);
+static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp,
+                          unsigned int *np);
+static void remove_full_mgmt_pkey(struct hfi1_pportdata *ppd);
+
+/*
+ * Error interrupt table entry.  This is used as input to the interrupt
+ * "clear down" routine used for all second tier error interrupt registers.
+ * Second tier interrupt registers have a single bit representing them
+ * in the top-level CceIntStatus.
+ *
+ * The EE()/DC_EE1()/DC_EE2() helpers fill these members positionally, so
+ * the member order below must not change.  Reserved table slots leave
+ * handler (and desc) NULL.
+ */
+struct err_reg_info {
+       u32 status;             /* status CSR offset */
+       u32 clear;              /* clear CSR offset */
+       u32 mask;               /* mask CSR offset */
+       void (*handler)(struct hfi1_devdata *dd, u32 source, u64 reg);
+       const char *desc;       /* short printable name, e.g. "CceErr" */
+};
+
+#define NUM_MISC_ERRS (IS_GENERAL_ERR_END - IS_GENERAL_ERR_START) /* misc_errs[] */
+#define NUM_DC_ERRS (IS_DC_END - IS_DC_START) /* size of dc_errs[] */
+#define NUM_VARIOUS (IS_VARIOUS_END - IS_VARIOUS_START) /* size of various_err[] */
+
+/*
+ * Helpers for building HFI and DC error interrupt table entries.  Different
+ * helpers are needed because of inconsistent register names.
+ *
+ * Each expands to a positional struct err_reg_info initializer
+ * { status, clear, mask, handler, desc } using these CSR name suffixes:
+ *   EE:     _STATUS, _CLEAR,   _MASK
+ *   DC_EE1: _FLG,    _FLG_CLR, _FLG_EN
+ *   DC_EE2: _FLG,    _CLR,     _EN
+ */
+#define EE(reg, handler, desc) \
+       { reg##_STATUS, reg##_CLEAR, reg##_MASK, \
+               handler, desc }
+#define DC_EE1(reg, handler, desc) \
+       { reg##_FLG, reg##_FLG_CLR, reg##_FLG_EN, handler, desc }
+#define DC_EE2(reg, handler, desc) \
+       { reg##_FLG, reg##_CLR, reg##_EN, handler, desc }
+
+/*
+ * Table of the "misc" grouping of error interrupts.  Each entry refers to
+ * another register containing more information.
+ *
+ * Entry 3 is reserved: its handler is NULL and its desc, having no
+ * initializer, is NULL as well.
+ */
+static const struct err_reg_info misc_errs[NUM_MISC_ERRS] = {
+/* 0*/ EE(CCE_ERR,             handle_cce_err,    "CceErr"),
+/* 1*/ EE(RCV_ERR,             handle_rxe_err,    "RxeErr"),
+/* 2*/ EE(MISC_ERR,    handle_misc_err,   "MiscErr"),
+/* 3*/ { 0, 0, 0, NULL }, /* reserved */
+/* 4*/ EE(SEND_PIO_ERR,    handle_pio_err,    "PioErr"),
+/* 5*/ EE(SEND_DMA_ERR,    handle_sdma_err,   "SDmaErr"),
+/* 6*/ EE(SEND_EGRESS_ERR, handle_egress_err, "EgressErr"),
+/* 7*/ EE(SEND_ERR,    handle_txe_err,    "TxeErr")
+       /* the rest are reserved */
+};
+
+/*
+ * Index into the Various section of the interrupt sources
+ * corresponding to the Critical Temperature interrupt.
+ * This is the slot commented as TCritInt (entry 4) in various_err[].
+ */
+#define TCRIT_INT_SOURCE 4
+
+/*
+ * SDMA error interrupt entry - refers to another register containing more
+ * information.  Built with EE(), so the CSRs are SEND_DMA_ENG_ERR_STATUS,
+ * SEND_DMA_ENG_ERR_CLEAR and SEND_DMA_ENG_ERR_MASK.
+ */
+static const struct err_reg_info sdma_eng_err =
+       EE(SEND_DMA_ENG_ERR, handle_sdma_eng_err, "SDmaEngErr");
+
+/*
+ * The "various" group of interrupt sources.  Only the two QSFP slots carry
+ * an error-register description; the other listed slots are zeroed
+ * placeholders and anything past slot 4 is reserved.
+ */
+static const struct err_reg_info various_err[NUM_VARIOUS] = {
+       [0] = { 0, 0, 0, NULL },        /* PbcInt */
+       [1] = { 0, 0, 0, NULL },        /* GpioAssertInt */
+       [2] = EE(ASIC_QSFP1, handle_qsfp_int, "QSFP1"),
+       [3] = EE(ASIC_QSFP2, handle_qsfp_int, "QSFP2"),
+       [4] = { 0, 0, 0, NULL },        /* TCritInt */
+       /* rest are reserved */
+};
+
+/*
+ * The DC encoding of mtu_cap for 10K MTU in the DCC_CFG_PORT_CONFIG
+ * register cannot be derived from the MTU value because 10K is not
+ * a power of 2. Therefore, we need a constant. Everything else can
+ * be calculated.
+ */
+#define DCC_CFG_PORT_MTU_CAP_10240 7
+
+/*
+ * The DC group of error interrupts.  Each populated slot points at a
+ * further register holding the detail.  Slot 3 (dc_lbm_int) has no entry
+ * here because it is handled specially, see is_dc_int(); later slots are
+ * reserved.
+ */
+static const struct err_reg_info dc_errs[NUM_DC_ERRS] = {
+       [0] = DC_EE1(DCC_ERR, handle_dcc_err, "DCC Err"),
+       [1] = DC_EE2(DC_LCB_ERR, handle_lcb_err, "LCB Err"),
+       [2] = DC_EE2(DC_DC8051_ERR, handle_8051_interrupt,
+                    "DC8051 Interrupt"),
+       /* 3: dc_lbm_int - special, see is_dc_int() */
+       /* the rest are reserved */
+};
+
+/*
+ * Description of a single counter: its name, where it lives, and the
+ * accessor used to read or write it.  Instances are built with CNTR_ELEM()
+ * and the wrappers layered on top of it.
+ */
+struct cntr_entry {
+       /*
+        * counter name
+        */
+       char *name;
+
+       /*
+        * csr to read for name (if applicable); the software-only
+        * SW_IBP_CNTR() entries leave this 0
+        */
+       u64 csr;
+
+       /*
+        * offset into dd or ppd to store the counter's value
+        */
+       int offset;
+
+       /*
+        * flags (CNTR_* bits such as CNTR_32BIT, CNTR_VL, CNTR_SDMA)
+        */
+       u8 flags;
+
+       /*
+        * accessor for stat element, context either dd or ppd.
+        * vl is CNTR_INVALID_VL or an index, mode is CNTR_MODE_R or
+        * CNTR_MODE_W, and data is the value for a write.  Returns the
+        * counter value.
+        */
+       u64 (*rw_cntr)(const struct cntr_entry *, void *context, int vl,
+                      int mode, u64 data);
+};
+
+/* Bounds of the C_RCV_HDR_OVF_<ctx> index range (suffixes 0 through 159). */
+#define C_RCV_HDR_OVF_FIRST C_RCV_HDR_OVF_0
+#define C_RCV_HDR_OVF_LAST C_RCV_HDR_OVF_159
+
+/*
+ * Positional initializer for a struct cntr_entry.  The arguments must be
+ * given in member order: name, csr, offset, flags, rw_cntr accessor.
+ */
+#define CNTR_ELEM(name, csr, offset, flags, accessor) \
+       { name, csr, offset, flags, accessor }
+
+/*
+ * 32bit RXE
+ *
+ * Build a cntr_entry for slot 'counter' of RCV_COUNTER_ARRAY32 (8 bytes per
+ * slot).  The name is the stringified 'name', CNTR_32BIT is OR-ed into
+ * 'flags', and the accessor is the port or device 32-bit CSR accessor.
+ * Arguments are parenthesized so that expression arguments expand safely.
+ */
+#define RXE32_PORT_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         ((counter) * 8 + RCV_COUNTER_ARRAY32), \
+         0, (flags) | CNTR_32BIT, \
+         port_access_u32_csr)
+
+#define RXE32_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         ((counter) * 8 + RCV_COUNTER_ARRAY32), \
+         0, (flags) | CNTR_32BIT, \
+         dev_access_u32_csr)
+
+/*
+ * 64bit RXE
+ *
+ * Build a cntr_entry for slot 'counter' of RCV_COUNTER_ARRAY64 (8 bytes per
+ * slot).  The name is the stringified 'name', 'flags' is passed through, and
+ * the accessor is the port or device 64-bit CSR accessor.  Arguments are
+ * parenthesized so that expression arguments expand safely.
+ */
+#define RXE64_PORT_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         ((counter) * 8 + RCV_COUNTER_ARRAY64), \
+         0, (flags), \
+         port_access_u64_csr)
+
+#define RXE64_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         ((counter) * 8 + RCV_COUNTER_ARRAY64), \
+         0, (flags), \
+         dev_access_u64_csr)
+
+/*
+ * Per-context receive header overflow counters.  OVR_LBL() forms the
+ * C_RCV_HDR_OVF_<ctx> label; OVR_ELM() builds the matching cntr_entry named
+ * "RcvHdrOvr<ctx>" whose CSR is RCV_HDR_OVFL_CNT plus 0x100 per context.
+ * 'ctx' is parenthesized in the arithmetic so expressions expand safely.
+ */
+#define OVR_LBL(ctx) C_RCV_HDR_OVF_ ## ctx
+#define OVR_ELM(ctx) \
+CNTR_ELEM("RcvHdrOvr" #ctx, \
+         (RCV_HDR_OVFL_CNT + (ctx) * 0x100), \
+         0, CNTR_NORMAL, port_access_u64_csr)
+
+/*
+ * 32bit TXE
+ *
+ * Build a cntr_entry for slot 'counter' of SEND_COUNTER_ARRAY32 (8 bytes per
+ * slot), with CNTR_32BIT OR-ed into 'flags' and port_access_u32_csr as the
+ * accessor.  Arguments are parenthesized so expressions expand safely.
+ */
+#define TXE32_PORT_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         ((counter) * 8 + SEND_COUNTER_ARRAY32), \
+         0, (flags) | CNTR_32BIT, \
+         port_access_u32_csr)
+
+/*
+ * 64bit TXE
+ *
+ * Build a cntr_entry for slot 'counter' of SEND_COUNTER_ARRAY64 (8 bytes per
+ * slot).  'flags' is passed through unchanged; the PORT variant uses
+ * port_access_u64_csr and the DEV variant uses dev_access_u64_csr.
+ * Arguments and the CSR expression are parenthesized in both variants.
+ */
+#define TXE64_PORT_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         ((counter) * 8 + SEND_COUNTER_ARRAY64), \
+         0, (flags), \
+         port_access_u64_csr)
+
+#define TX64_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         ((counter) * 8 + SEND_COUNTER_ARRAY64), \
+         0, (flags), \
+         dev_access_u64_csr)
+
+/*
+ * CCE
+ *
+ * Build a device-level 32-bit cntr_entry for slot 'counter' (8 bytes per
+ * slot) of CCE_COUNTER_ARRAY32 (PERF) or CCE_INT_COUNTER_ARRAY32 (INT).
+ * CNTR_32BIT is OR-ed into 'flags' and dev_access_u32_csr is the accessor.
+ * Arguments are parenthesized so expressions expand safely.
+ */
+#define CCE_PERF_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         ((counter) * 8 + CCE_COUNTER_ARRAY32), \
+         0, (flags) | CNTR_32BIT, \
+         dev_access_u32_csr)
+
+#define CCE_INT_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         ((counter) * 8 + CCE_INT_COUNTER_ARRAY32), \
+         0, (flags) | CNTR_32BIT, \
+         dev_access_u32_csr)
+
+/*
+ * DC
+ *
+ * Build a cntr_entry whose CSR is 'counter' itself (no array scaling).
+ * DC_PERF_CNTR uses dev_access_u64_csr; DC_PERF_CNTR_LCB uses
+ * dc_access_lcb_cntr.
+ */
+#define DC_PERF_CNTR(name, counter, flags) \
+CNTR_ELEM(#name, counter, 0, flags, \
+         dev_access_u64_csr)
+
+#define DC_PERF_CNTR_LCB(name, counter, flags) \
+CNTR_ELEM(#name, counter, 0, flags, \
+         dc_access_lcb_cntr)
+
+/*
+ * ibp counters
+ *
+ * Build a CNTR_SYNTH cntr_entry with no CSR and no offset; the accessor is
+ * access_ibp_<cntr>.
+ */
+#define SW_IBP_CNTR(name, cntr) \
+CNTR_ELEM(#name, 0, 0, CNTR_SYNTH, \
+         access_ibp_##cntr)
+
+/*
+ * Read the CSR at 'offset' from dd->kregbase with readq().  When
+ * HFI1_PRESENT is not set in dd->flags the hardware is not touched and
+ * -1 (all ones as a u64) is returned instead.
+ */
+u64 read_csr(const struct hfi1_devdata *dd, u32 offset)
+{
+       if (!(dd->flags & HFI1_PRESENT))
+               return -1;
+
+       return readq((void __iomem *)dd->kregbase + offset);
+}
+
+/*
+ * Write 'value' to the CSR at 'offset' from dd->kregbase with writeq().
+ * The write is silently dropped when HFI1_PRESENT is not set in dd->flags.
+ */
+void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value)
+{
+       if (!(dd->flags & HFI1_PRESENT))
+               return;
+
+       writeq(value, (void __iomem *)dd->kregbase + offset);
+}
+
+/*
+ * Return the address of the CSR at 'offset' from dd->kregbase.  No
+ * HFI1_PRESENT check is made here.
+ */
+void __iomem *get_csr_addr(struct hfi1_devdata *dd, u32 offset)
+{
+       void __iomem *base = (void __iomem *)dd->kregbase;
+
+       return base + offset;
+}
+
+/*
+ * Read or write a counter CSR according to 'mode'.
+ *
+ * CNTR_MODE_R returns the value read from 'csr'; CNTR_MODE_W writes 'value'
+ * to 'csr' and returns 'value'.  Any other mode logs an error and returns 0.
+ */
+static inline u64 read_write_csr(const struct hfi1_devdata *dd, u32 csr,
+                                int mode, u64 value)
+{
+       u64 result;
+
+       if (mode != CNTR_MODE_R && mode != CNTR_MODE_W) {
+               dd_dev_err(dd, "Invalid cntr register access mode");
+               return 0;
+       }
+
+       if (mode == CNTR_MODE_R) {
+               result = read_csr(dd, csr);
+       } else {
+               write_csr(dd, csr, value);
+               result = value;
+       }
+
+       hfi1_cdbg(CNTR, "csr 0x%x val 0x%llx mode %d", csr, result, mode);
+       return result;
+}
+
+/* Dev Access */
+/*
+ * Device-level accessor for counters created with the *32* DEV macros.
+ *
+ * Entries flagged CNTR_SDMA require a valid 'vl', which selects the CSR at
+ * a stride of 0x100; all other entries require vl == CNTR_INVALID_VL.
+ * A mismatch between the flag and 'vl' yields 0 without any CSR access.
+ */
+static u64 dev_access_u32_csr(const struct cntr_entry *entry,
+                             void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = context;
+       u64 csr = entry->csr;
+
+       if (entry->flags & CNTR_SDMA) {
+               if (vl == CNTR_INVALID_VL)
+                       return 0;
+               csr += 0x100 * vl;
+       } else if (vl != CNTR_INVALID_VL) {
+               return 0;
+       }
+
+       return read_write_csr(dd, csr, mode, data);
+}
+
+/*
+ * Report err_cnt of entry 'idx' in dd->per_sdma.  Returns 0 when per_sdma
+ * is NULL or idx is not below dd->num_sdma; mode and data are ignored.
+ */
+static u64 access_sde_err_cnt(const struct cntr_entry *entry,
+                             void *context, int idx, int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       if (!dd->per_sdma || idx >= dd->num_sdma)
+               return 0;
+       return dd->per_sdma[idx].err_cnt;
+}
+
+/*
+ * Report sdma_int_cnt of entry 'idx' in dd->per_sdma.  Returns 0 when
+ * per_sdma is NULL or idx is not below dd->num_sdma; mode and data are
+ * ignored.
+ */
+static u64 access_sde_int_cnt(const struct cntr_entry *entry,
+                             void *context, int idx, int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       if (!dd->per_sdma || idx >= dd->num_sdma)
+               return 0;
+       return dd->per_sdma[idx].sdma_int_cnt;
+}
+
+/*
+ * Report idle_int_cnt of entry 'idx' in dd->per_sdma.  Returns 0 when
+ * per_sdma is NULL or idx is not below dd->num_sdma; mode and data are
+ * ignored.
+ */
+static u64 access_sde_idle_int_cnt(const struct cntr_entry *entry,
+                                  void *context, int idx, int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       if (!dd->per_sdma || idx >= dd->num_sdma)
+               return 0;
+       return dd->per_sdma[idx].idle_int_cnt;
+}
+
+/*
+ * Report progress_int_cnt of entry 'idx' in dd->per_sdma.  Returns 0 when
+ * per_sdma is NULL or idx is not below dd->num_sdma; mode and data are
+ * ignored.
+ */
+static u64 access_sde_progress_int_cnt(const struct cntr_entry *entry,
+                                      void *context, int idx, int mode,
+                                      u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       if (!dd->per_sdma || idx >= dd->num_sdma)
+               return 0;
+       return dd->per_sdma[idx].progress_int_cnt;
+}
+
+/*
+ * Device-level accessor for 64-bit CSR counters.
+ *
+ * Entries flagged CNTR_VL require a valid 'vl', which selects the CSR at a
+ * stride of 8 bytes; all other entries require vl == CNTR_INVALID_VL.
+ * A mismatch between the flag and 'vl' yields 0 without any CSR access.
+ */
+static u64 dev_access_u64_csr(const struct cntr_entry *entry, void *context,
+                             int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = context;
+       u64 csr = entry->csr;
+
+       if (entry->flags & CNTR_VL) {
+               if (vl == CNTR_INVALID_VL)
+                       return 0;
+               csr += 8 * vl;
+       } else if (vl != CNTR_INVALID_VL) {
+               return 0;
+       }
+
+       return read_write_csr(dd, csr, mode, data);
+}
+
+/*
+ * Accessor for LCB counters, which go through read_lcb_csr() and
+ * write_lcb_csr() rather than the plain CSR helpers.
+ *
+ * Only vl == CNTR_INVALID_VL is accepted (otherwise 0 is returned).  A
+ * non-zero status from the LCB helper is logged and reported as 0.  On
+ * success the value read, or the value written, is returned.
+ */
+static u64 dc_access_lcb_cntr(const struct cntr_entry *entry, void *context,
+                             int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = context;
+       u32 csr = entry->csr;
+       int rc = 0;
+
+       if (vl != CNTR_INVALID_VL)
+               return 0;
+
+       if (mode == CNTR_MODE_R)
+               rc = read_lcb_csr(dd, csr, &data);
+       else if (mode == CNTR_MODE_W)
+               rc = write_lcb_csr(dd, csr, data);
+
+       if (rc) {
+               dd_dev_err(dd, "Could not acquire LCB for counter 0x%x", csr);
+               return 0;
+       }
+
+       hfi1_cdbg(CNTR, "csr 0x%x val 0x%llx mode %d", csr, data, mode);
+       return data;
+}
+
+/* Port Access */
+/*
+ * Port-level accessor for 32-bit CSR counters.  'context' is the port data;
+ * the CSR access goes to its owning device.  Any vl other than
+ * CNTR_INVALID_VL yields 0.
+ */
+static u64 port_access_u32_csr(const struct cntr_entry *entry, void *context,
+                              int vl, int mode, u64 data)
+{
+       const struct hfi1_pportdata *ppd = context;
+
+       if (vl == CNTR_INVALID_VL)
+               return read_write_csr(ppd->dd, entry->csr, mode, data);
+       return 0;
+}
+
+/*
+ * Port-level accessor for 64-bit CSR counters.
+ *
+ * Entries flagged CNTR_VL require a valid 'vl', which selects the CSR at a
+ * stride of 8 bytes; all other entries require vl == CNTR_INVALID_VL.
+ * A mismatch between the flag and 'vl' yields 0 without any CSR access.
+ */
+static u64 port_access_u64_csr(const struct cntr_entry *entry,
+                              void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_pportdata *ppd = context;
+       u64 csr = entry->csr;
+
+       if (entry->flags & CNTR_VL) {
+               if (vl == CNTR_INVALID_VL)
+                       return 0;
+               csr += 8 * vl;
+       } else if (vl != CNTR_INVALID_VL) {
+               return 0;
+       }
+
+       return read_write_csr(ppd->dd, csr, mode, data);
+}
+
+/* Software defined */
+/*
+ * Read or write a software-maintained counter according to 'mode'.
+ *
+ * CNTR_MODE_R returns *cntr; CNTR_MODE_W stores 'data' into *cntr and
+ * returns 'data'.  Any other mode logs an error and returns 0.
+ */
+static inline u64 read_write_sw(struct hfi1_devdata *dd, u64 *cntr, int mode,
+                               u64 data)
+{
+       u64 result;
+
+       if (mode != CNTR_MODE_R && mode != CNTR_MODE_W) {
+               dd_dev_err(dd, "Invalid cntr sw access mode");
+               return 0;
+       }
+
+       if (mode == CNTR_MODE_R) {
+               result = *cntr;
+       } else {
+               *cntr = data;
+               result = data;
+       }
+
+       hfi1_cdbg(CNTR, "val 0x%llx mode %d", result, mode);
+
+       return result;
+}
+
+/*
+ * Read or write ppd->link_downed via read_write_sw().  Any vl other than
+ * CNTR_INVALID_VL yields 0.
+ */
+static u64 access_sw_link_dn_cnt(const struct cntr_entry *entry, void *context,
+                                int vl, int mode, u64 data)
+{
+       struct hfi1_pportdata *ppd = context;
+
+       if (vl == CNTR_INVALID_VL)
+               return read_write_sw(ppd->dd, &ppd->link_downed, mode, data);
+       return 0;
+}
+
+/*
+ * Read or write ppd->link_up via read_write_sw().  Any vl other than
+ * CNTR_INVALID_VL yields 0.
+ */
+static u64 access_sw_link_up_cnt(const struct cntr_entry *entry, void *context,
+                                int vl, int mode, u64 data)
+{
+       struct hfi1_pportdata *ppd = context;
+
+       if (vl == CNTR_INVALID_VL)
+               return read_write_sw(ppd->dd, &ppd->link_up, mode, data);
+       return 0;
+}
+
+/*
+ * Read or write ppd->unknown_frame_count via read_write_sw().  Any vl other
+ * than CNTR_INVALID_VL yields 0.
+ */
+static u64 access_sw_unknown_frame_cnt(const struct cntr_entry *entry,
+                                      void *context, int vl, int mode,
+                                      u64 data)
+{
+       struct hfi1_pportdata *ppd = context;
+       u64 *counter = &ppd->unknown_frame_count;
+
+       if (vl == CNTR_INVALID_VL)
+               return read_write_sw(ppd->dd, counter, mode, data);
+       return 0;
+}
+
+/*
+ * Read or write the transmit discard counters.
+ *
+ * vl == CNTR_INVALID_VL selects the port-wide ppd->port_xmit_discards; a vl
+ * in [0, C_VL_COUNT) selects ppd->port_xmit_discards_vl[vl].  Any other vl
+ * operates on a local scratch value that starts at zero, so reads return 0
+ * and writes are discarded.
+ */
+static u64 access_sw_xmit_discards(const struct cntr_entry *entry,
+                                  void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_pportdata *ppd = context;
+       u64 scratch = 0;
+       u64 *counter = &scratch;
+
+       if (vl == CNTR_INVALID_VL)
+               counter = &ppd->port_xmit_discards;
+       else if (vl >= 0 && vl < C_VL_COUNT)
+               counter = &ppd->port_xmit_discards_vl[vl];
+
+       return read_write_sw(ppd->dd, counter, mode, data);
+}
+
+/*
+ * Read or write ppd->port_xmit_constraint_errors via read_write_sw().  Any
+ * vl other than CNTR_INVALID_VL yields 0.
+ */
+static u64 access_xmit_constraint_errs(const struct cntr_entry *entry,
+                                      void *context, int vl, int mode,
+                                      u64 data)
+{
+       struct hfi1_pportdata *ppd = context;
+       u64 *counter = &ppd->port_xmit_constraint_errors;
+
+       if (vl == CNTR_INVALID_VL)
+               return read_write_sw(ppd->dd, counter, mode, data);
+       return 0;
+}
+
+/*
+ * Read or write ppd->port_rcv_constraint_errors via read_write_sw().  Any
+ * vl other than CNTR_INVALID_VL yields 0.
+ */
+static u64 access_rcv_constraint_errs(const struct cntr_entry *entry,
+                                     void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_pportdata *ppd = context;
+       u64 *counter = &ppd->port_rcv_constraint_errors;
+
+       if (vl == CNTR_INVALID_VL)
+               return read_write_sw(ppd->dd, counter, mode, data);
+       return 0;
+}
+
+/*
+ * Sum a per-CPU u64 counter over every possible CPU and return the total.
+ */
+u64 get_all_cpu_total(u64 __percpu *cntr)
+{
+       u64 total = 0;
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               total += *per_cpu_ptr(cntr, cpu);
+       }
+
+       return total;
+}
+
+/*
+ * Read or zero a per-CPU counter.
+ *
+ * The per-CPU values are never modified.  A read returns the current sum
+ * over all CPUs minus *z_val.  A write of 0 records the current sum in
+ * *z_val, so that later reads start from zero; a write of any other value
+ * is rejected with an error message.  Writes, invalid modes and any vl
+ * other than CNTR_INVALID_VL all return 0.
+ */
+static u64 read_write_cpu(struct hfi1_devdata *dd, u64 *z_val,
+                         u64 __percpu *cntr,
+                         int vl, int mode, u64 data)
+{
+       if (vl != CNTR_INVALID_VL)
+               return 0;
+
+       if (mode == CNTR_MODE_R)
+               return get_all_cpu_total(cntr) - *z_val;
+
+       if (mode != CNTR_MODE_W) {
+               dd_dev_err(dd, "Invalid cntr sw cpu access mode");
+               return 0;
+       }
+
+       /* A write can only zero the counter */
+       if (data == 0)
+               *z_val = get_all_cpu_total(cntr);
+       else
+               dd_dev_err(dd, "Per CPU cntrs can only be zeroed");
+
+       return 0;
+}
+
+/*
+ * Read or zero the per-CPU dd->int_counter through read_write_cpu(), using
+ * dd->z_int_counter as the zero reference.
+ */
+static u64 access_sw_cpu_intr(const struct cntr_entry *entry,
+                             void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = context;
+       u64 *zero_ref = &dd->z_int_counter;
+
+       return read_write_cpu(dd, zero_ref, dd->int_counter, vl, mode, data);
+}
+
+/*
+ * Read or zero the per-CPU dd->rcv_limit through read_write_cpu(), using
+ * dd->z_rcv_limit as the zero reference.
+ */
+static u64 access_sw_cpu_rcv_limit(const struct cntr_entry *entry,
+                                  void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = context;
+       u64 *zero_ref = &dd->z_rcv_limit;
+
+       return read_write_cpu(dd, zero_ref, dd->rcv_limit, vl, mode, data);
+}
+
+/*
+ * Report dd->verbs_dev.n_piowait.  Only context is used; entry, vl, mode
+ * and data are ignored.
+ */
+static u64 access_sw_pio_wait(const struct cntr_entry *entry,
+                             void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->verbs_dev.n_piowait;
+}
+
+/*
+ * Report dd->verbs_dev.n_piodrain.  Only context is used; entry, vl, mode
+ * and data are ignored.
+ */
+static u64 access_sw_pio_drain(const struct cntr_entry *entry,
+                              void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->verbs_dev.n_piodrain;
+}
+
+/*
+ * Report dd->verbs_dev.n_txwait.  Only context is used; entry, vl, mode
+ * and data are ignored.
+ */
+static u64 access_sw_vtx_wait(const struct cntr_entry *entry,
+                             void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->verbs_dev.n_txwait;
+}
+
+/*
+ * Report dd->verbs_dev.n_kmem_wait.  Only context is used; entry, vl, mode
+ * and data are ignored.
+ */
+static u64 access_sw_kmem_wait(const struct cntr_entry *entry,
+                              void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->verbs_dev.n_kmem_wait;
+}
+
+/*
+ * Read or zero the per-CPU dd->send_schedule through read_write_cpu(),
+ * using dd->z_send_schedule as the zero reference.
+ */
+static u64 access_sw_send_schedule(const struct cntr_entry *entry,
+                                  void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = context;
+       u64 *zero_ref = &dd->z_send_schedule;
+
+       return read_write_cpu(dd, zero_ref, dd->send_schedule, vl, mode, data);
+}
+
+/* Software counters for the error status bits within MISC_ERR_STATUS */
+/*
+ * Report dd->misc_err_status_cnt[12].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_misc_pll_lock_fail_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->misc_err_status_cnt[12];
+}
+
+/*
+ * Report dd->misc_err_status_cnt[11].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_misc_mbist_fail_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->misc_err_status_cnt[11];
+}
+
+/*
+ * Report dd->misc_err_status_cnt[10].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_misc_invalid_eep_cmd_err_cnt(const struct cntr_entry *entry,
+                                              void *context, int vl, int mode,
+                                              u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->misc_err_status_cnt[10];
+}
+
+/*
+ * Report dd->misc_err_status_cnt[9].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_misc_efuse_done_parity_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->misc_err_status_cnt[9];
+}
+
+/*
+ * Report dd->misc_err_status_cnt[8].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_misc_efuse_write_err_cnt(const struct cntr_entry *entry,
+                                          void *context, int vl, int mode,
+                                          u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->misc_err_status_cnt[8];
+}
+
+/*
+ * Report dd->misc_err_status_cnt[7].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_misc_efuse_read_bad_addr_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->misc_err_status_cnt[7];
+}
+
+/*
+ * Report dd->misc_err_status_cnt[6].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_misc_efuse_csr_parity_err_cnt(const struct cntr_entry *entry,
+                                               void *context, int vl,
+                                               int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->misc_err_status_cnt[6];
+}
+
+/*
+ * Report dd->misc_err_status_cnt[5].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_misc_fw_auth_failed_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->misc_err_status_cnt[5];
+}
+
+/*
+ * Report dd->misc_err_status_cnt[4].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_misc_key_mismatch_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->misc_err_status_cnt[4];
+}
+
+/*
+ * Report dd->misc_err_status_cnt[3].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_misc_sbus_write_failed_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->misc_err_status_cnt[3];
+}
+
+/*
+ * Report dd->misc_err_status_cnt[2].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_misc_csr_write_bad_addr_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->misc_err_status_cnt[2];
+}
+
+/*
+ * Report dd->misc_err_status_cnt[1].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_misc_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->misc_err_status_cnt[1];
+}
+
+/*
+ * Report dd->misc_err_status_cnt[0].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_misc_csr_parity_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->misc_err_status_cnt[0];
+}
+
+/*
+ * Software counter for the aggregate of
+ * individual CceErrStatus counters
+ */
+/*
+ * Report dd->sw_cce_err_status_aggregate.  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_sw_cce_err_status_aggregated_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->sw_cce_err_status_aggregate;
+}
+
+/*
+ * Software counters corresponding to each of the
+ * error status bits within CceErrStatus
+ */
+/*
+ * Report dd->cce_err_status_cnt[40].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_cce_msix_csr_parity_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[40];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[39].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_cce_int_map_unc_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[39];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[38].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_cce_int_map_cor_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[38];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[37].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_cce_msix_table_unc_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[37];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[36].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_cce_msix_table_cor_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[36];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[35].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_cce_rxdma_conv_fifo_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[35];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[34].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_cce_rcpl_async_fifo_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[34];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[33].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_cce_seg_write_bad_addr_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[33];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[32].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_cce_seg_read_bad_addr_err_cnt(const struct cntr_entry *entry,
+                                               void *context, int vl, int mode,
+                                               u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[32];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[31].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_la_triggered_cnt(const struct cntr_entry *entry,
+                                  void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[31];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[30].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_cce_trgt_cpl_timeout_err_cnt(const struct cntr_entry *entry,
+                                              void *context, int vl, int mode,
+                                              u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[30];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[29].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_pcic_receive_parity_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[29];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[28].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_pcic_transmit_back_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[28];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[27].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_pcic_transmit_front_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[27];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[26].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_pcic_cpl_dat_q_unc_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[26];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[25].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_pcic_cpl_hd_q_unc_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[25];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[24].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_pcic_post_dat_q_unc_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[24];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[23].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_pcic_post_hd_q_unc_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[23];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[22].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_pcic_retry_sot_mem_unc_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[22];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[21].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_pcic_retry_mem_unc_err(const struct cntr_entry *entry,
+                                        void *context, int vl, int mode,
+                                        u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[21];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[20].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_pcic_n_post_dat_q_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[20];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[19].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_pcic_n_post_h_q_parity_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[19];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[18].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_pcic_cpl_dat_q_cor_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[18];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[17].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_pcic_cpl_hd_q_cor_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[17];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[16].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_pcic_post_dat_q_cor_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[16];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[15].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_pcic_post_hd_q_cor_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[15];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[14].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_pcic_retry_sot_mem_cor_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[14];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[13].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_pcic_retry_mem_cor_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[13];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[12].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_cce_cli1_async_fifo_dbg_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[12];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[11].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_cce_cli1_async_fifo_rxdma_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[11];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[10].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_cce_cli1_async_fifo_sdma_hd_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[10];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[9].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_cce_cl1_async_fifo_pio_crdt_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[9];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[8].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_cce_cli2_async_fifo_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[8];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[7].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_cce_csr_cfg_bus_parity_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[7];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[6].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_cce_cli0_async_fifo_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[6];
+}
+
+/*
+ * Report dd->cce_err_status_cnt[5].  Only context is used; entry, vl,
+ * mode and data are ignored.
+ */
+static u64 access_cce_rspd_data_parity_err_cnt(const struct cntr_entry *entry,
+                                              void *context, int vl, int mode,
+                                              u64 data)
+{
+       const struct hfi1_devdata *dd = context;
+
+       return dd->cce_err_status_cnt[5];
+}
+
+/*
+ * Return cce_err_status_cnt[4] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_cce_trgt_access_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->cce_err_status_cnt[4];
+}
+
+/*
+ * Return cce_err_status_cnt[3] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_cce_trgt_async_fifo_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->cce_err_status_cnt[3];
+}
+
+/*
+ * Return cce_err_status_cnt[2] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_cce_csr_write_bad_addr_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->cce_err_status_cnt[2];
+}
+
+/*
+ * Return cce_err_status_cnt[1] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_cce_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry,
+                                               void *context, int vl,
+                                               int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->cce_err_status_cnt[1];
+}
+
+/*
+ * Return cce_err_status_cnt[0] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_ccs_csr_parity_err_cnt(const struct cntr_entry *entry,
+                                        void *context, int vl, int mode,
+                                        u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->cce_err_status_cnt[0];
+}
+
+/*
+ * Software counters corresponding to each of the
+ * error status bits within RcvErrStatus
+ */
+/*
+ * Return rcv_err_status_cnt[63] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_csr_parity_err_cnt(const struct cntr_entry *entry,
+                                       void *context, int vl, int mode,
+                                       u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[63];
+}
+
+/*
+ * Return rcv_err_status_cnt[62] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_csr_write_bad_addr_err_cnt(const struct cntr_entry *entry,
+                                               void *context, int vl,
+                                               int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[62];
+}
+
+/*
+ * Return rcv_err_status_cnt[61] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry,
+                                              void *context, int vl, int mode,
+                                              u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[61];
+}
+
+/*
+ * Return rcv_err_status_cnt[60] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_dma_csr_unc_err_cnt(const struct cntr_entry *entry,
+                                        void *context, int vl, int mode,
+                                        u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[60];
+}
+
+/*
+ * Return rcv_err_status_cnt[59] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_dma_dq_fsm_encoding_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[59];
+}
+
+/*
+ * Return rcv_err_status_cnt[58] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_dma_eq_fsm_encoding_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[58];
+}
+
+/*
+ * Return rcv_err_status_cnt[57] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_dma_csr_parity_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[57];
+}
+
+/*
+ * Return rcv_err_status_cnt[56] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rbuf_data_cor_err_cnt(const struct cntr_entry *entry,
+                                          void *context, int vl, int mode,
+                                          u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[56];
+}
+
+/*
+ * Return rcv_err_status_cnt[55] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rbuf_data_unc_err_cnt(const struct cntr_entry *entry,
+                                          void *context, int vl, int mode,
+                                          u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[55];
+}
+
+/*
+ * Return rcv_err_status_cnt[54] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_dma_data_fifo_rd_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[54];
+}
+
+/*
+ * Return rcv_err_status_cnt[53] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_dma_data_fifo_rd_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[53];
+}
+
+/*
+ * Return rcv_err_status_cnt[52] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_dma_hdr_fifo_rd_cor_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[52];
+}
+
+/*
+ * Return rcv_err_status_cnt[51] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_dma_hdr_fifo_rd_unc_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[51];
+}
+
+/*
+ * Return rcv_err_status_cnt[50] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rbuf_desc_part2_cor_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[50];
+}
+
+/*
+ * Return rcv_err_status_cnt[49] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rbuf_desc_part2_unc_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[49];
+}
+
+/*
+ * Return rcv_err_status_cnt[48] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rbuf_desc_part1_cor_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[48];
+}
+
+/*
+ * Return rcv_err_status_cnt[47] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rbuf_desc_part1_unc_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[47];
+}
+
+/*
+ * Return rcv_err_status_cnt[46] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_hq_intr_fsm_err_cnt(const struct cntr_entry *entry,
+                                        void *context, int vl, int mode,
+                                        u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[46];
+}
+
+/*
+ * Return rcv_err_status_cnt[45] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_hq_intr_csr_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[45];
+}
+
+/*
+ * Return rcv_err_status_cnt[44] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_lookup_csr_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[44];
+}
+
+/*
+ * Return rcv_err_status_cnt[43] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_lookup_rcv_array_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[43];
+}
+
+/*
+ * Return rcv_err_status_cnt[42] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_lookup_rcv_array_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[42];
+}
+
+/*
+ * Return rcv_err_status_cnt[41] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_lookup_des_part2_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[41];
+}
+
+/*
+ * Return rcv_err_status_cnt[40] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_lookup_des_part1_unc_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[40];
+}
+
+/*
+ * Return rcv_err_status_cnt[39] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_lookup_des_part1_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[39];
+}
+
+/*
+ * Return rcv_err_status_cnt[38] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rbuf_next_free_buf_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[38];
+}
+
+/*
+ * Return rcv_err_status_cnt[37] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rbuf_next_free_buf_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[37];
+}
+
+/*
+ * Return rcv_err_status_cnt[36] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rbuf_fl_init_wr_addr_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[36];
+}
+
+/*
+ * Return rcv_err_status_cnt[35] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rbuf_fl_initdone_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[35];
+}
+
+/*
+ * Return rcv_err_status_cnt[34] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rbuf_fl_write_addr_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[34];
+}
+
+/*
+ * Return rcv_err_status_cnt[33] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rbuf_fl_rd_addr_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[33];
+}
+
+/*
+ * Return rcv_err_status_cnt[32] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rbuf_empty_err_cnt(const struct cntr_entry *entry,
+                                       void *context, int vl, int mode,
+                                       u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[32];
+}
+
+/*
+ * Return rcv_err_status_cnt[31] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rbuf_full_err_cnt(const struct cntr_entry *entry,
+                                      void *context, int vl, int mode,
+                                      u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[31];
+}
+
+/*
+ * Return rcv_err_status_cnt[30] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rbuf_bad_lookup_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[30];
+}
+
+/*
+ * Return rcv_err_status_cnt[29] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rbuf_ctx_id_parity_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[29];
+}
+
+/*
+ * Return rcv_err_status_cnt[28] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rbuf_csr_qeopdw_parity_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[28];
+}
+
+/*
+ * Return rcv_err_status_cnt[27] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rbuf_csr_q_num_of_pkt_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[27];
+}
+
+/*
+ * Return rcv_err_status_cnt[26] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rbuf_csr_q_t1_ptr_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[26];
+}
+
+/*
+ * Return rcv_err_status_cnt[25] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rbuf_csr_q_hd_ptr_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[25];
+}
+
+/*
+ * Return rcv_err_status_cnt[24] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rbuf_csr_q_vld_bit_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[24];
+}
+
+/*
+ * Return rcv_err_status_cnt[23] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rbuf_csr_q_next_buf_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[23];
+}
+
+/*
+ * Return rcv_err_status_cnt[22] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rbuf_csr_q_ent_cnt_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[22];
+}
+
+/*
+ * Return rcv_err_status_cnt[21] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rbuf_csr_q_head_buf_num_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[21];
+}
+
+/*
+ * Return rcv_err_status_cnt[20] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rbuf_block_list_read_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[20];
+}
+
+/*
+ * Return rcv_err_status_cnt[19] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rbuf_block_list_read_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[19];
+}
+
+/*
+ * Return rcv_err_status_cnt[18] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rbuf_lookup_des_cor_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[18];
+}
+
+/*
+ * Return rcv_err_status_cnt[17] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rbuf_lookup_des_unc_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[17];
+}
+
+/*
+ * Return rcv_err_status_cnt[16] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rbuf_lookup_des_reg_unc_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[16];
+}
+
+/*
+ * Return rcv_err_status_cnt[15] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rbuf_lookup_des_reg_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[15];
+}
+
+/*
+ * Return rcv_err_status_cnt[14] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rbuf_free_list_cor_err_cnt(const struct cntr_entry *entry,
+                                               void *context, int vl,
+                                               int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[14];
+}
+
+/*
+ * Return rcv_err_status_cnt[13] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rbuf_free_list_unc_err_cnt(const struct cntr_entry *entry,
+                                               void *context, int vl,
+                                               int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[13];
+}
+
+/*
+ * Return rcv_err_status_cnt[12] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rcv_fsm_encoding_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[12];
+}
+
+/*
+ * Return rcv_err_status_cnt[11] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_dma_flag_cor_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[11];
+}
+
+/*
+ * Return rcv_err_status_cnt[10] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_dma_flag_unc_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[10];
+}
+
+/*
+ * Return rcv_err_status_cnt[9] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_dc_sop_eop_parity_err_cnt(const struct cntr_entry *entry,
+                                              void *context, int vl, int mode,
+                                              u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[9];
+}
+
+/*
+ * Return rcv_err_status_cnt[8] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rcv_csr_parity_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[8];
+}
+
+/*
+ * Return rcv_err_status_cnt[7] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rcv_qp_map_table_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[7];
+}
+
+/*
+ * Return rcv_err_status_cnt[6] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rcv_qp_map_table_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[6];
+}
+
+/*
+ * Return rcv_err_status_cnt[5] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rcv_data_cor_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[5];
+}
+
+/*
+ * Return rcv_err_status_cnt[4] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rcv_data_unc_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[4];
+}
+
+/*
+ * Return rcv_err_status_cnt[3] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rcv_hdr_cor_err_cnt(const struct cntr_entry *entry,
+                                        void *context, int vl, int mode,
+                                        u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[3];
+}
+
+/*
+ * Return rcv_err_status_cnt[2] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_rcv_hdr_unc_err_cnt(const struct cntr_entry *entry,
+                                        void *context, int vl, int mode,
+                                        u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[2];
+}
+
+/*
+ * Return rcv_err_status_cnt[1] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_dc_intf_parity_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[1];
+}
+
+/*
+ * Return rcv_err_status_cnt[0] of the struct hfi1_devdata passed as
+ * @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_rx_dma_csr_cor_err_cnt(const struct cntr_entry *entry,
+                                        void *context, int vl, int mode,
+                                        u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->rcv_err_status_cnt[0];
+}
+
+/*
+ * Software counters corresponding to each of the
+ * error status bits within SendPioErrStatus
+ */
+/*
+ * Return send_pio_err_status_cnt[35] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_pec_sop_head_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[35];
+}
+
+/*
+ * Return send_pio_err_status_cnt[34] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_pcc_sop_head_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[34];
+}
+
+/*
+ * Return send_pio_err_status_cnt[33] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_last_returned_cnt_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[33];
+}
+
+/*
+ * Return send_pio_err_status_cnt[32] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_current_free_cnt_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[32];
+}
+
+/*
+ * Return send_pio_err_status_cnt[31] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_reserved_31_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[31];
+}
+
+/*
+ * Return send_pio_err_status_cnt[30] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_reserved_30_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[30];
+}
+
+/*
+ * Return send_pio_err_status_cnt[29] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_ppmc_sop_len_err_cnt(const struct cntr_entry *entry,
+                                          void *context, int vl, int mode,
+                                          u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[29];
+}
+
+/*
+ * Return send_pio_err_status_cnt[28] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_ppmc_bqc_mem_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[28];
+}
+
+/*
+ * Return send_pio_err_status_cnt[27] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_vl_fifo_parity_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[27];
+}
+
+/*
+ * Return send_pio_err_status_cnt[26] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_vlf_sop_parity_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[26];
+}
+
+/*
+ * Return send_pio_err_status_cnt[25] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_vlf_v1_len_parity_err_cnt(const struct cntr_entry *entry,
+                                               void *context, int vl,
+                                               int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[25];
+}
+
+/*
+ * Return send_pio_err_status_cnt[24] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_block_qw_count_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[24];
+}
+
+/*
+ * Return send_pio_err_status_cnt[23] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_write_qw_valid_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[23];
+}
+
+/*
+ * Return send_pio_err_status_cnt[22] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_state_machine_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[22];
+}
+
+/*
+ * Return send_pio_err_status_cnt[21] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_write_data_parity_err_cnt(const struct cntr_entry *entry,
+                                               void *context, int vl,
+                                               int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[21];
+}
+
+/*
+ * Return send_pio_err_status_cnt[20] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_host_addr_mem_cor_err_cnt(const struct cntr_entry *entry,
+                                               void *context, int vl,
+                                               int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[20];
+}
+
+/*
+ * Return send_pio_err_status_cnt[19] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_host_addr_mem_unc_err_cnt(const struct cntr_entry *entry,
+                                               void *context, int vl,
+                                               int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[19];
+}
+
+/*
+ * Return send_pio_err_status_cnt[18] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_pkt_evict_sm_or_arb_sm_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[18];
+}
+
+/*
+ * Return send_pio_err_status_cnt[17] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_init_sm_in_err_cnt(const struct cntr_entry *entry,
+                                        void *context, int vl, int mode,
+                                        u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[17];
+}
+
+/*
+ * Return send_pio_err_status_cnt[16] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_ppmc_pbl_fifo_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[16];
+}
+
+/*
+ * Return send_pio_err_status_cnt[15] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_credit_ret_fifo_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[15];
+}
+
+/*
+ * Return send_pio_err_status_cnt[14] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_v1_len_mem_bank1_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[14];
+}
+
+/*
+ * Return send_pio_err_status_cnt[13] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_v1_len_mem_bank0_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[13];
+}
+
+/*
+ * Return send_pio_err_status_cnt[12] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_v1_len_mem_bank1_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[12];
+}
+
+/*
+ * Return send_pio_err_status_cnt[11] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_v1_len_mem_bank0_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[11];
+}
+
+/*
+ * Return send_pio_err_status_cnt[10] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_sm_pkt_reset_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[10];
+}
+
+/*
+ * Return send_pio_err_status_cnt[9] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_pkt_evict_fifo_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[9];
+}
+
+/*
+ * Return send_pio_err_status_cnt[8] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_sbrdctrl_crrel_fifo_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[8];
+}
+
+/*
+ * Return send_pio_err_status_cnt[7] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_sbrdctl_crrel_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[7];
+}
+
+/*
+ * Return send_pio_err_status_cnt[6] of the struct hfi1_devdata passed
+ * as @context. @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_pec_fifo_parity_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[6];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 5 of its
+ * send_pio_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_pcc_fifo_parity_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[5];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 4 of its
+ * send_pio_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_sb_mem_fifo1_err_cnt(const struct cntr_entry *entry,
+                                          void *context, int vl, int mode,
+                                          u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[4];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 3 of its
+ * send_pio_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_sb_mem_fifo0_err_cnt(const struct cntr_entry *entry,
+                                          void *context, int vl, int mode,
+                                          u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[3];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 2 of its
+ * send_pio_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_csr_parity_err_cnt(const struct cntr_entry *entry,
+                                        void *context, int vl, int mode,
+                                        u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[2];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 1 of its
+ * send_pio_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_write_addr_parity_err_cnt(const struct cntr_entry *entry,
+                                               void *context, int vl,
+                                               int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[1];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 0 of its
+ * send_pio_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_write_bad_ctxt_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_pio_err_status_cnt[0];
+}
+
+/*
+ * Software counters corresponding to each of the
+ * error status bits within SendDmaErrStatus
+ */
+static u64 access_sdma_pcie_req_tracking_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       /* Only @context is used; it is read as a struct hfi1_devdata. */
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_dma_err_status_cnt[3];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 2 of its
+ * send_dma_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_sdma_pcie_req_tracking_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_dma_err_status_cnt[2];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 1 of its
+ * send_dma_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_sdma_csr_parity_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_dma_err_status_cnt[1];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 0 of its
+ * send_dma_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_sdma_rpy_tag_err_cnt(const struct cntr_entry *entry,
+                                      void *context, int vl, int mode,
+                                      u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_dma_err_status_cnt[0];
+}
+
+/*
+ * Software counters corresponding to each of the
+ * error status bits within SendEgressErrStatus
+ */
+static u64 access_tx_read_pio_memory_csr_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       /* Only @context is used; it is read as a struct hfi1_devdata. */
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[63];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 62 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_read_sdma_memory_csr_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[62];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 61 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_egress_fifo_cor_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[61];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 60 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_read_pio_memory_cor_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[60];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 59 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_read_sdma_memory_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[59];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 58 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_sb_hdr_cor_err_cnt(const struct cntr_entry *entry,
+                                       void *context, int vl, int mode,
+                                       u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[58];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 57 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_credit_overrun_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[57];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 56 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_launch_fifo8_cor_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[56];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 55 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_launch_fifo7_cor_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[55];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 54 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_launch_fifo6_cor_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[54];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 53 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_launch_fifo5_cor_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[53];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 52 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_launch_fifo4_cor_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[52];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 51 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_launch_fifo3_cor_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[51];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 50 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_launch_fifo2_cor_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[50];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 49 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_launch_fifo1_cor_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[49];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 48 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_launch_fifo0_cor_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[48];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 47 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_credit_return_vl_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[47];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 46 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_hcrc_insertion_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[46];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 45 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_egress_fifo_unc_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[45];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 44 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_read_pio_memory_unc_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[44];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 43 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_read_sdma_memory_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[43];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 42 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_sb_hdr_unc_err_cnt(const struct cntr_entry *entry,
+                                       void *context, int vl, int mode,
+                                       u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[42];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 41 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ *
+ * NOTE(review): "partiy" in the name looks like a misspelling of "parity";
+ * left untouched because other code may refer to this name - confirm
+ * before renaming.
+ */
+static u64 access_tx_credit_return_partiy_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[41];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 40 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_launch_fifo8_unc_or_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[40];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 39 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_launch_fifo7_unc_or_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[39];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 38 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_launch_fifo6_unc_or_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[38];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 37 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_launch_fifo5_unc_or_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[37];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 36 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_launch_fifo4_unc_or_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[36];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 35 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_launch_fifo3_unc_or_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[35];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 34 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_launch_fifo2_unc_or_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[34];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 33 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_launch_fifo1_unc_or_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[33];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 32 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_launch_fifo0_unc_or_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[32];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 31 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_sdma15_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[31];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 30 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_sdma14_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[30];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 29 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_sdma13_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[29];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 28 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_sdma12_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[28];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 27 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_sdma11_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[27];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 26 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_sdma10_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[26];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 25 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_sdma9_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[25];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 24 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_sdma8_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[24];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 23 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_sdma7_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[23];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 22 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_sdma6_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[22];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 21 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_sdma5_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[21];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 20 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_sdma4_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[20];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 19 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_sdma3_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[19];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 18 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_sdma2_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[18];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 17 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_sdma1_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[17];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 16 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_sdma0_disallowed_packet_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[16];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 15 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_config_parity_err_cnt(const struct cntr_entry *entry,
+                                          void *context, int vl, int mode,
+                                          u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[15];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 14 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_sbrd_ctl_csr_parity_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[14];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 13 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_launch_csr_parity_err_cnt(const struct cntr_entry *entry,
+                                              void *context, int vl, int mode,
+                                              u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[13];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 12 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_illegal_vl_err_cnt(const struct cntr_entry *entry,
+                                       void *context, int vl, int mode,
+                                       u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[12];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 11 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_sbrd_ctl_state_machine_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[11];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 10 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_egress_reserved_10_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[10];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 9 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_egress_reserved_9_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[9];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 8 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_sdma_launch_intf_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[8];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 7 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_pio_launch_intf_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[7];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 6 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_egress_reserved_6_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[6];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 5 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_incorrect_link_state_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[5];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 4 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_linkdown_err_cnt(const struct cntr_entry *entry,
+                                     void *context, int vl, int mode,
+                                     u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[4];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 3 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ *
+ * NOTE(review): "fifi" in the name looks like a misspelling of "fifo";
+ * left untouched because other code may refer to this name - confirm
+ * before renaming.
+ */
+static u64 access_tx_egress_fifi_underrun_or_parity_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[3];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 2 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_egress_reserved_2_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[2];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 1 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_pkt_integrity_mem_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[1];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 0 of its
+ * send_egress_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_tx_pkt_integrity_mem_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_egress_err_status_cnt[0];
+}
+
+/*
+ * Software counters corresponding to each of the
+ * error status bits within SendErrStatus
+ */
+static u64 access_send_csr_write_bad_addr_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       /* Only @context is used; it is read as a struct hfi1_devdata. */
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_err_status_cnt[2];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 1 of its
+ * send_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_send_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry,
+                                                void *context, int vl,
+                                                int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_err_status_cnt[1];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 0 of its
+ * send_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_send_csr_parity_cnt(const struct cntr_entry *entry,
+                                     void *context, int vl, int mode,
+                                     u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->send_err_status_cnt[0];
+}
+
+/*
+ * Software counters corresponding to each of the
+ * error status bits within SendCtxtErrStatus
+ */
+static u64 access_pio_write_out_of_bounds_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       /* Only @context is used; it is read as a struct hfi1_devdata. */
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->sw_ctxt_err_status_cnt[4];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 3 of its
+ * sw_ctxt_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_write_overflow_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->sw_ctxt_err_status_cnt[3];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 2 of its
+ * sw_ctxt_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_write_crosses_boundary_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->sw_ctxt_err_status_cnt[2];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 1 of its
+ * sw_ctxt_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_disallowed_packet_err_cnt(const struct cntr_entry *entry,
+                                               void *context, int vl,
+                                               int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->sw_ctxt_err_status_cnt[1];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 0 of its
+ * sw_ctxt_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_pio_inconsistent_sop_err_cnt(const struct cntr_entry *entry,
+                                              void *context, int vl, int mode,
+                                              u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->sw_ctxt_err_status_cnt[0];
+}
+
+/*
+ * Software counters corresponding to each of the
+ * error status bits within SendDmaEngErrStatus
+ */
+static u64 access_sdma_header_request_fifo_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       /* Only @context is used; it is read as a struct hfi1_devdata. */
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->sw_send_dma_eng_err_status_cnt[23];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 22 of its
+ * sw_send_dma_eng_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_sdma_header_storage_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->sw_send_dma_eng_err_status_cnt[22];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 21 of its
+ * sw_send_dma_eng_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_sdma_packet_tracking_cor_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->sw_send_dma_eng_err_status_cnt[21];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 20 of its
+ * sw_send_dma_eng_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_sdma_assembly_cor_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->sw_send_dma_eng_err_status_cnt[20];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 19 of its
+ * sw_send_dma_eng_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_sdma_desc_table_cor_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->sw_send_dma_eng_err_status_cnt[19];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 18 of its
+ * sw_send_dma_eng_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_sdma_header_request_fifo_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->sw_send_dma_eng_err_status_cnt[18];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 17 of its
+ * sw_send_dma_eng_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_sdma_header_storage_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->sw_send_dma_eng_err_status_cnt[17];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 16 of its
+ * sw_send_dma_eng_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_sdma_packet_tracking_unc_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->sw_send_dma_eng_err_status_cnt[16];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 15 of its
+ * sw_send_dma_eng_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_sdma_assembly_unc_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->sw_send_dma_eng_err_status_cnt[15];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 14 of its
+ * sw_send_dma_eng_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_sdma_desc_table_unc_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->sw_send_dma_eng_err_status_cnt[14];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 13 of its
+ * sw_send_dma_eng_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_sdma_timeout_err_cnt(const struct cntr_entry *entry,
+                                      void *context, int vl, int mode,
+                                      u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->sw_send_dma_eng_err_status_cnt[13];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 12 of its
+ * sw_send_dma_eng_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_sdma_header_length_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->sw_send_dma_eng_err_status_cnt[12];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 11 of its
+ * sw_send_dma_eng_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_sdma_header_address_err_cnt(const struct cntr_entry *entry,
+                                             void *context, int vl, int mode,
+                                             u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->sw_send_dma_eng_err_status_cnt[11];
+}
+
+/*
+ * Treats @context as a struct hfi1_devdata and returns element 10 of its
+ * sw_send_dma_eng_err_status_cnt array.
+ * @entry, @vl, @mode and @data are not used.
+ */
+static u64 access_sdma_header_select_err_cnt(const struct cntr_entry *entry,
+                                            void *context, int vl, int mode,
+                                            u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->sw_send_dma_eng_err_status_cnt[10];
+}
+
+/*
+ * Accessor: return element 9 of the device's
+ * sw_send_dma_eng_err_status_cnt[] array.  @context is interpreted as a
+ * struct hfi1_devdata pointer; @entry, @vl, @mode and @data are unused,
+ * so this function only ever reads.
+ */
+static u64 access_sdma_reserved_9_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->sw_send_dma_eng_err_status_cnt[9];
+}
+
+/*
+ * Accessor: return element 8 of the device's
+ * sw_send_dma_eng_err_status_cnt[] array.  @context is interpreted as a
+ * struct hfi1_devdata pointer; @entry, @vl, @mode and @data are unused,
+ * so this function only ever reads.
+ */
+static u64 access_sdma_packet_desc_overflow_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->sw_send_dma_eng_err_status_cnt[8];
+}
+
+/*
+ * Accessor: return element 7 of the device's
+ * sw_send_dma_eng_err_status_cnt[] array.  @context is interpreted as a
+ * struct hfi1_devdata pointer; @entry, @vl, @mode and @data are unused,
+ * so this function only ever reads.
+ */
+static u64 access_sdma_length_mismatch_err_cnt(const struct cntr_entry *entry,
+                                              void *context, int vl,
+                                              int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->sw_send_dma_eng_err_status_cnt[7];
+}
+
+/*
+ * Accessor: return element 6 of the device's
+ * sw_send_dma_eng_err_status_cnt[] array.  @context is interpreted as a
+ * struct hfi1_devdata pointer; @entry, @vl, @mode and @data are unused,
+ * so this function only ever reads.
+ */
+static u64 access_sdma_halt_err_cnt(const struct cntr_entry *entry,
+                                   void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->sw_send_dma_eng_err_status_cnt[6];
+}
+
+/*
+ * Accessor: return element 5 of the device's
+ * sw_send_dma_eng_err_status_cnt[] array.  @context is interpreted as a
+ * struct hfi1_devdata pointer; @entry, @vl, @mode and @data are unused,
+ * so this function only ever reads.
+ */
+static u64 access_sdma_mem_read_err_cnt(const struct cntr_entry *entry,
+                                       void *context, int vl, int mode,
+                                       u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->sw_send_dma_eng_err_status_cnt[5];
+}
+
+/*
+ * Accessor: return element 4 of the device's
+ * sw_send_dma_eng_err_status_cnt[] array.  @context is interpreted as a
+ * struct hfi1_devdata pointer; @entry, @vl, @mode and @data are unused,
+ * so this function only ever reads.
+ */
+static u64 access_sdma_first_desc_err_cnt(const struct cntr_entry *entry,
+                                         void *context, int vl, int mode,
+                                         u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->sw_send_dma_eng_err_status_cnt[4];
+}
+
+/*
+ * Accessor: return element 3 of the device's
+ * sw_send_dma_eng_err_status_cnt[] array.  @context is interpreted as a
+ * struct hfi1_devdata pointer; @entry, @vl, @mode and @data are unused,
+ * so this function only ever reads.
+ */
+static u64 access_sdma_tail_out_of_bounds_err_cnt(
+                               const struct cntr_entry *entry,
+                               void *context, int vl, int mode, u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->sw_send_dma_eng_err_status_cnt[3];
+}
+
+/*
+ * Accessor: return element 2 of the device's
+ * sw_send_dma_eng_err_status_cnt[] array.  @context is interpreted as a
+ * struct hfi1_devdata pointer; @entry, @vl, @mode and @data are unused,
+ * so this function only ever reads.
+ */
+static u64 access_sdma_too_long_err_cnt(const struct cntr_entry *entry,
+                                       void *context, int vl, int mode,
+                                       u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->sw_send_dma_eng_err_status_cnt[2];
+}
+
+/*
+ * Accessor: return element 1 of the device's
+ * sw_send_dma_eng_err_status_cnt[] array.  @context is interpreted as a
+ * struct hfi1_devdata pointer; @entry, @vl, @mode and @data are unused,
+ * so this function only ever reads.
+ */
+static u64 access_sdma_gen_mismatch_err_cnt(const struct cntr_entry *entry,
+                                           void *context, int vl, int mode,
+                                           u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->sw_send_dma_eng_err_status_cnt[1];
+}
+
+/*
+ * Accessor: return element 0 of the device's
+ * sw_send_dma_eng_err_status_cnt[] array.  @context is interpreted as a
+ * struct hfi1_devdata pointer; @entry, @vl, @mode and @data are unused,
+ * so this function only ever reads.
+ */
+static u64 access_sdma_wrong_dw_err_cnt(const struct cntr_entry *entry,
+                                       void *context, int vl, int mode,
+                                       u64 data)
+{
+       const struct hfi1_devdata *devdata = context;
+
+       return devdata->sw_send_dma_eng_err_status_cnt[0];
+}
+
+/*
+ * def_access_sw_cpu(cntr) - emit access_sw_cpu_<cntr>()
+ *
+ * The generated accessor treats @context as a struct hfi1_pportdata and
+ * hands the address of the port's ibport_data.rvp.z_<cntr> member and the
+ * port's ibport_data.rvp.<cntr> member to read_write_cpu(), together with
+ * @vl, @mode and @data.  It returns whatever read_write_cpu() returns.
+ * @entry is unused.
+ */
+#define def_access_sw_cpu(cntr) \
+static u64 access_sw_cpu_##cntr(const struct cntr_entry *entry,                      \
+                             void *context, int vl, int mode, u64 data)      \
+{                                                                            \
+       struct hfi1_pportdata *pport = context;                               \
+                                                                             \
+       return read_write_cpu(pport->dd,                                      \
+                             &pport->ibport_data.rvp.z_ ##cntr,              \
+                             pport->ibport_data.rvp.cntr,                    \
+                             vl, mode, data);                                \
+}
+
+def_access_sw_cpu(rc_acks);
+def_access_sw_cpu(rc_qacks);
+def_access_sw_cpu(rc_delayed_comp);
+
+/*
+ * def_access_ibp_counter(cntr) - emit access_ibp_<cntr>()
+ *
+ * The generated accessor yields 0 unless @vl equals CNTR_INVALID_VL.  In
+ * that case it treats @context as a struct hfi1_pportdata and returns the
+ * result of read_write_sw() applied to the address of the port's
+ * ibport_data.rvp.n_<cntr> member with @mode and @data.  @entry is unused.
+ */
+#define def_access_ibp_counter(cntr) \
+static u64 access_ibp_##cntr(const struct cntr_entry *entry,                 \
+                               void *context, int vl, int mode, u64 data)    \
+{                                                                            \
+       struct hfi1_pportdata *pport = context;                               \
+                                                                             \
+       if (vl == CNTR_INVALID_VL)                                            \
+               return read_write_sw(pport->dd,                               \
+                                    &pport->ibport_data.rvp.n_ ##cntr,       \
+                                    mode, data);                             \
+                                                                             \
+       return 0;                                                             \
+}
+
+def_access_ibp_counter(loop_pkts);
+def_access_ibp_counter(rc_resends);
+def_access_ibp_counter(rnr_naks);
+def_access_ibp_counter(other_naks);
+def_access_ibp_counter(rc_timeouts);
+def_access_ibp_counter(pkt_drops);
+def_access_ibp_counter(dmawait);
+def_access_ibp_counter(rc_seqnak);
+def_access_ibp_counter(rc_dupreq);
+def_access_ibp_counter(rdma_seq);
+def_access_ibp_counter(unaligned);
+def_access_ibp_counter(seq_naks);
+
+static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = {
+[C_RCV_OVF] = RXE32_DEV_CNTR_ELEM(RcvOverflow, RCV_BUF_OVFL_CNT, CNTR_SYNTH),
+[C_RX_TID_FULL] = RXE32_DEV_CNTR_ELEM(RxTIDFullEr, RCV_TID_FULL_ERR_CNT,
+                       CNTR_NORMAL),
+[C_RX_TID_INVALID] = RXE32_DEV_CNTR_ELEM(RxTIDInvalid, RCV_TID_VALID_ERR_CNT,
+                       CNTR_NORMAL),
+[C_RX_TID_FLGMS] = RXE32_DEV_CNTR_ELEM(RxTidFLGMs,
+                       RCV_TID_FLOW_GEN_MISMATCH_CNT,
+                       CNTR_NORMAL),
+[C_RX_CTX_EGRS] = RXE32_DEV_CNTR_ELEM(RxCtxEgrS, RCV_CONTEXT_EGR_STALL,
+                       CNTR_NORMAL),
+[C_RCV_TID_FLSMS] = RXE32_DEV_CNTR_ELEM(RxTidFLSMs,
+                       RCV_TID_FLOW_SEQ_MISMATCH_CNT, CNTR_NORMAL),
+[C_CCE_PCI_CR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePciCrSt,
+                       CCE_PCIE_POSTED_CRDT_STALL_CNT, CNTR_NORMAL),
+[C_CCE_PCI_TR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePciTrSt, CCE_PCIE_TRGT_STALL_CNT,
+                       CNTR_NORMAL),
+[C_CCE_PIO_WR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePioWrSt, CCE_PIO_WR_STALL_CNT,
+                       CNTR_NORMAL),
+[C_CCE_ERR_INT] = CCE_INT_DEV_CNTR_ELEM(CceErrInt, CCE_ERR_INT_CNT,
+                       CNTR_NORMAL),
+[C_CCE_SDMA_INT] = CCE_INT_DEV_CNTR_ELEM(CceSdmaInt, CCE_SDMA_INT_CNT,
+                       CNTR_NORMAL),
+[C_CCE_MISC_INT] = CCE_INT_DEV_CNTR_ELEM(CceMiscInt, CCE_MISC_INT_CNT,
+                       CNTR_NORMAL),
+[C_CCE_RCV_AV_INT] = CCE_INT_DEV_CNTR_ELEM(CceRcvAvInt, CCE_RCV_AVAIL_INT_CNT,
+                       CNTR_NORMAL),
+[C_CCE_RCV_URG_INT] = CCE_INT_DEV_CNTR_ELEM(CceRcvUrgInt,
+                       CCE_RCV_URGENT_INT_CNT, CNTR_NORMAL),
+[C_CCE_SEND_CR_INT] = CCE_INT_DEV_CNTR_ELEM(CceSndCrInt,
+                       CCE_SEND_CREDIT_INT_CNT, CNTR_NORMAL),
+[C_DC_UNC_ERR] = DC_PERF_CNTR(DcUnctblErr, DCC_ERR_UNCORRECTABLE_CNT,
+                             CNTR_SYNTH),
+[C_DC_RCV_ERR] = DC_PERF_CNTR(DcRecvErr, DCC_ERR_PORTRCV_ERR_CNT, CNTR_SYNTH),
+[C_DC_FM_CFG_ERR] = DC_PERF_CNTR(DcFmCfgErr, DCC_ERR_FMCONFIG_ERR_CNT,
+                                CNTR_SYNTH),
+[C_DC_RMT_PHY_ERR] = DC_PERF_CNTR(DcRmtPhyErr, DCC_ERR_RCVREMOTE_PHY_ERR_CNT,
+                                 CNTR_SYNTH),
+[C_DC_DROPPED_PKT] = DC_PERF_CNTR(DcDroppedPkt, DCC_ERR_DROPPED_PKT_CNT,
+                                 CNTR_SYNTH),
+[C_DC_MC_XMIT_PKTS] = DC_PERF_CNTR(DcMcXmitPkts,
+                                  DCC_PRF_PORT_XMIT_MULTICAST_CNT, CNTR_SYNTH),
+[C_DC_MC_RCV_PKTS] = DC_PERF_CNTR(DcMcRcvPkts,
+                                 DCC_PRF_PORT_RCV_MULTICAST_PKT_CNT,
+                                 CNTR_SYNTH),
+[C_DC_XMIT_CERR] = DC_PERF_CNTR(DcXmitCorr,
+                               DCC_PRF_PORT_XMIT_CORRECTABLE_CNT, CNTR_SYNTH),
+[C_DC_RCV_CERR] = DC_PERF_CNTR(DcRcvCorrCnt, DCC_PRF_PORT_RCV_CORRECTABLE_CNT,
+                              CNTR_SYNTH),
+[C_DC_RCV_FCC] = DC_PERF_CNTR(DcRxFCntl, DCC_PRF_RX_FLOW_CRTL_CNT,
+                             CNTR_SYNTH),
+[C_DC_XMIT_FCC] = DC_PERF_CNTR(DcXmitFCntl, DCC_PRF_TX_FLOW_CRTL_CNT,
+                              CNTR_SYNTH),
+[C_DC_XMIT_FLITS] = DC_PERF_CNTR(DcXmitFlits, DCC_PRF_PORT_XMIT_DATA_CNT,
+                                CNTR_SYNTH),
+[C_DC_RCV_FLITS] = DC_PERF_CNTR(DcRcvFlits, DCC_PRF_PORT_RCV_DATA_CNT,
+                               CNTR_SYNTH),
+[C_DC_XMIT_PKTS] = DC_PERF_CNTR(DcXmitPkts, DCC_PRF_PORT_XMIT_PKTS_CNT,
+                               CNTR_SYNTH),
+[C_DC_RCV_PKTS] = DC_PERF_CNTR(DcRcvPkts, DCC_PRF_PORT_RCV_PKTS_CNT,
+                              CNTR_SYNTH),
+[C_DC_RX_FLIT_VL] = DC_PERF_CNTR(DcRxFlitVl, DCC_PRF_PORT_VL_RCV_DATA_CNT,
+                                CNTR_SYNTH | CNTR_VL),
+[C_DC_RX_PKT_VL] = DC_PERF_CNTR(DcRxPktVl, DCC_PRF_PORT_VL_RCV_PKTS_CNT,
+                               CNTR_SYNTH | CNTR_VL),
+[C_DC_RCV_FCN] = DC_PERF_CNTR(DcRcvFcn, DCC_PRF_PORT_RCV_FECN_CNT, CNTR_SYNTH),
+[C_DC_RCV_FCN_VL] = DC_PERF_CNTR(DcRcvFcnVl, DCC_PRF_PORT_VL_RCV_FECN_CNT,
+                                CNTR_SYNTH | CNTR_VL),
+[C_DC_RCV_BCN] = DC_PERF_CNTR(DcRcvBcn, DCC_PRF_PORT_RCV_BECN_CNT, CNTR_SYNTH),
+[C_DC_RCV_BCN_VL] = DC_PERF_CNTR(DcRcvBcnVl, DCC_PRF_PORT_VL_RCV_BECN_CNT,
+                                CNTR_SYNTH | CNTR_VL),
+[C_DC_RCV_BBL] = DC_PERF_CNTR(DcRcvBbl, DCC_PRF_PORT_RCV_BUBBLE_CNT,
+                             CNTR_SYNTH),
+[C_DC_RCV_BBL_VL] = DC_PERF_CNTR(DcRcvBblVl, DCC_PRF_PORT_VL_RCV_BUBBLE_CNT,
+                                CNTR_SYNTH | CNTR_VL),
+[C_DC_MARK_FECN] = DC_PERF_CNTR(DcMarkFcn, DCC_PRF_PORT_MARK_FECN_CNT,
+                               CNTR_SYNTH),
+[C_DC_MARK_FECN_VL] = DC_PERF_CNTR(DcMarkFcnVl, DCC_PRF_PORT_VL_MARK_FECN_CNT,
+                                  CNTR_SYNTH | CNTR_VL),
+[C_DC_TOTAL_CRC] =
+       DC_PERF_CNTR_LCB(DcTotCrc, DC_LCB_ERR_INFO_TOTAL_CRC_ERR,
+                        CNTR_SYNTH),
+[C_DC_CRC_LN0] = DC_PERF_CNTR_LCB(DcCrcLn0, DC_LCB_ERR_INFO_CRC_ERR_LN0,
+                                 CNTR_SYNTH),
+[C_DC_CRC_LN1] = DC_PERF_CNTR_LCB(DcCrcLn1, DC_LCB_ERR_INFO_CRC_ERR_LN1,
+                                 CNTR_SYNTH),
+[C_DC_CRC_LN2] = DC_PERF_CNTR_LCB(DcCrcLn2, DC_LCB_ERR_INFO_CRC_ERR_LN2,
+                                 CNTR_SYNTH),
+[C_DC_CRC_LN3] = DC_PERF_CNTR_LCB(DcCrcLn3, DC_LCB_ERR_INFO_CRC_ERR_LN3,
+                                 CNTR_SYNTH),
+[C_DC_CRC_MULT_LN] =
+       DC_PERF_CNTR_LCB(DcMultLn, DC_LCB_ERR_INFO_CRC_ERR_MULTI_LN,
+                        CNTR_SYNTH),
+[C_DC_TX_REPLAY] = DC_PERF_CNTR_LCB(DcTxReplay, DC_LCB_ERR_INFO_TX_REPLAY_CNT,
+                                   CNTR_SYNTH),
+[C_DC_RX_REPLAY] = DC_PERF_CNTR_LCB(DcRxReplay, DC_LCB_ERR_INFO_RX_REPLAY_CNT,
+                                   CNTR_SYNTH),
+[C_DC_SEQ_CRC_CNT] =
+       DC_PERF_CNTR_LCB(DcLinkSeqCrc, DC_LCB_ERR_INFO_SEQ_CRC_CNT,
+                        CNTR_SYNTH),
+[C_DC_ESC0_ONLY_CNT] =
+       DC_PERF_CNTR_LCB(DcEsc0, DC_LCB_ERR_INFO_ESCAPE_0_ONLY_CNT,
+                        CNTR_SYNTH),
+[C_DC_ESC0_PLUS1_CNT] =
+       DC_PERF_CNTR_LCB(DcEsc1, DC_LCB_ERR_INFO_ESCAPE_0_PLUS1_CNT,
+                        CNTR_SYNTH),
+[C_DC_ESC0_PLUS2_CNT] =
+       DC_PERF_CNTR_LCB(DcEsc0Plus2, DC_LCB_ERR_INFO_ESCAPE_0_PLUS2_CNT,
+                        CNTR_SYNTH),
+[C_DC_REINIT_FROM_PEER_CNT] =
+       DC_PERF_CNTR_LCB(DcReinitPeer, DC_LCB_ERR_INFO_REINIT_FROM_PEER_CNT,
+                        CNTR_SYNTH),
+[C_DC_SBE_CNT] = DC_PERF_CNTR_LCB(DcSbe, DC_LCB_ERR_INFO_SBE_CNT,
+                                 CNTR_SYNTH),
+[C_DC_MISC_FLG_CNT] =
+       DC_PERF_CNTR_LCB(DcMiscFlg, DC_LCB_ERR_INFO_MISC_FLG_CNT,
+                        CNTR_SYNTH),
+[C_DC_PRF_GOOD_LTP_CNT] =
+       DC_PERF_CNTR_LCB(DcGoodLTP, DC_LCB_PRF_GOOD_LTP_CNT, CNTR_SYNTH),
+[C_DC_PRF_ACCEPTED_LTP_CNT] =
+       DC_PERF_CNTR_LCB(DcAccLTP, DC_LCB_PRF_ACCEPTED_LTP_CNT,
+                        CNTR_SYNTH),
+[C_DC_PRF_RX_FLIT_CNT] =
+       DC_PERF_CNTR_LCB(DcPrfRxFlit, DC_LCB_PRF_RX_FLIT_CNT, CNTR_SYNTH),
+[C_DC_PRF_TX_FLIT_CNT] =
+       DC_PERF_CNTR_LCB(DcPrfTxFlit, DC_LCB_PRF_TX_FLIT_CNT, CNTR_SYNTH),
+[C_DC_PRF_CLK_CNTR] =
+       DC_PERF_CNTR_LCB(DcPrfClk, DC_LCB_PRF_CLK_CNTR, CNTR_SYNTH),
+[C_DC_PG_DBG_FLIT_CRDTS_CNT] =
+       DC_PERF_CNTR_LCB(DcFltCrdts, DC_LCB_PG_DBG_FLIT_CRDTS_CNT, CNTR_SYNTH),
+[C_DC_PG_STS_PAUSE_COMPLETE_CNT] =
+       DC_PERF_CNTR_LCB(DcPauseComp, DC_LCB_PG_STS_PAUSE_COMPLETE_CNT,
+                        CNTR_SYNTH),
+[C_DC_PG_STS_TX_SBE_CNT] =
+       DC_PERF_CNTR_LCB(DcStsTxSbe, DC_LCB_PG_STS_TX_SBE_CNT, CNTR_SYNTH),
+[C_DC_PG_STS_TX_MBE_CNT] =
+       DC_PERF_CNTR_LCB(DcStsTxMbe, DC_LCB_PG_STS_TX_MBE_CNT,
+                        CNTR_SYNTH),
+[C_SW_CPU_INTR] = CNTR_ELEM("Intr", 0, 0, CNTR_NORMAL,
+                           access_sw_cpu_intr),
+[C_SW_CPU_RCV_LIM] = CNTR_ELEM("RcvLimit", 0, 0, CNTR_NORMAL,
+                           access_sw_cpu_rcv_limit),
+[C_SW_VTX_WAIT] = CNTR_ELEM("vTxWait", 0, 0, CNTR_NORMAL,
+                           access_sw_vtx_wait),
+[C_SW_PIO_WAIT] = CNTR_ELEM("PioWait", 0, 0, CNTR_NORMAL,
+                           access_sw_pio_wait),
+[C_SW_PIO_DRAIN] = CNTR_ELEM("PioDrain", 0, 0, CNTR_NORMAL,
+                           access_sw_pio_drain),
+[C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL,
+                           access_sw_kmem_wait),
+[C_SW_SEND_SCHED] = CNTR_ELEM("SendSched", 0, 0, CNTR_NORMAL,
+                           access_sw_send_schedule),
+[C_SDMA_DESC_FETCHED_CNT] = CNTR_ELEM("SDEDscFdCn",
+                                     SEND_DMA_DESC_FETCHED_CNT, 0,
+                                     CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
+                                     dev_access_u32_csr),
+[C_SDMA_INT_CNT] = CNTR_ELEM("SDMAInt", 0, 0,
+                            CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
+                            access_sde_int_cnt),
+[C_SDMA_ERR_CNT] = CNTR_ELEM("SDMAErrCt", 0, 0,
+                            CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
+                            access_sde_err_cnt),
+[C_SDMA_IDLE_INT_CNT] = CNTR_ELEM("SDMAIdInt", 0, 0,
+                                 CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
+                                 access_sde_idle_int_cnt),
+[C_SDMA_PROGRESS_INT_CNT] = CNTR_ELEM("SDMAPrIntCn", 0, 0,
+                                     CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
+                                     access_sde_progress_int_cnt),
+/* MISC_ERR_STATUS */
+[C_MISC_PLL_LOCK_FAIL_ERR] = CNTR_ELEM("MISC_PLL_LOCK_FAIL_ERR", 0, 0,
+                               CNTR_NORMAL,
+                               access_misc_pll_lock_fail_err_cnt),
+[C_MISC_MBIST_FAIL_ERR] = CNTR_ELEM("MISC_MBIST_FAIL_ERR", 0, 0,
+                               CNTR_NORMAL,
+                               access_misc_mbist_fail_err_cnt),
+[C_MISC_INVALID_EEP_CMD_ERR] = CNTR_ELEM("MISC_INVALID_EEP_CMD_ERR", 0, 0,
+                               CNTR_NORMAL,
+                               access_misc_invalid_eep_cmd_err_cnt),
+[C_MISC_EFUSE_DONE_PARITY_ERR] = CNTR_ELEM("MISC_EFUSE_DONE_PARITY_ERR", 0, 0,
+                               CNTR_NORMAL,
+                               access_misc_efuse_done_parity_err_cnt),
+[C_MISC_EFUSE_WRITE_ERR] = CNTR_ELEM("MISC_EFUSE_WRITE_ERR", 0, 0,
+                               CNTR_NORMAL,
+                               access_misc_efuse_write_err_cnt),
+[C_MISC_EFUSE_READ_BAD_ADDR_ERR] = CNTR_ELEM("MISC_EFUSE_READ_BAD_ADDR_ERR", 0,
+                               0, CNTR_NORMAL,
+                               access_misc_efuse_read_bad_addr_err_cnt),
+[C_MISC_EFUSE_CSR_PARITY_ERR] = CNTR_ELEM("MISC_EFUSE_CSR_PARITY_ERR", 0, 0,
+                               CNTR_NORMAL,
+                               access_misc_efuse_csr_parity_err_cnt),
+[C_MISC_FW_AUTH_FAILED_ERR] = CNTR_ELEM("MISC_FW_AUTH_FAILED_ERR", 0, 0,
+                               CNTR_NORMAL,
+                               access_misc_fw_auth_failed_err_cnt),
+[C_MISC_KEY_MISMATCH_ERR] = CNTR_ELEM("MISC_KEY_MISMATCH_ERR", 0, 0,
+                               CNTR_NORMAL,
+                               access_misc_key_mismatch_err_cnt),
+[C_MISC_SBUS_WRITE_FAILED_ERR] = CNTR_ELEM("MISC_SBUS_WRITE_FAILED_ERR", 0, 0,
+                               CNTR_NORMAL,
+                               access_misc_sbus_write_failed_err_cnt),
+[C_MISC_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("MISC_CSR_WRITE_BAD_ADDR_ERR", 0, 0,
+                               CNTR_NORMAL,
+                               access_misc_csr_write_bad_addr_err_cnt),
+[C_MISC_CSR_READ_BAD_ADDR_ERR] = CNTR_ELEM("MISC_CSR_READ_BAD_ADDR_ERR", 0, 0,
+                               CNTR_NORMAL,
+                               access_misc_csr_read_bad_addr_err_cnt),
+[C_MISC_CSR_PARITY_ERR] = CNTR_ELEM("MISC_CSR_PARITY_ERR", 0, 0,
+                               CNTR_NORMAL,
+                               access_misc_csr_parity_err_cnt),
+/* CceErrStatus */
+[C_CCE_ERR_STATUS_AGGREGATED_CNT] = CNTR_ELEM("CceErrStatusAggregatedCnt", 0, 0,
+                               CNTR_NORMAL,
+                               access_sw_cce_err_status_aggregated_cnt),
+[C_CCE_MSIX_CSR_PARITY_ERR] = CNTR_ELEM("CceMsixCsrParityErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_cce_msix_csr_parity_err_cnt),
+[C_CCE_INT_MAP_UNC_ERR] = CNTR_ELEM("CceIntMapUncErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_cce_int_map_unc_err_cnt),
+[C_CCE_INT_MAP_COR_ERR] = CNTR_ELEM("CceIntMapCorErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_cce_int_map_cor_err_cnt),
+[C_CCE_MSIX_TABLE_UNC_ERR] = CNTR_ELEM("CceMsixTableUncErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_cce_msix_table_unc_err_cnt),
+[C_CCE_MSIX_TABLE_COR_ERR] = CNTR_ELEM("CceMsixTableCorErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_cce_msix_table_cor_err_cnt),
+[C_CCE_RXDMA_CONV_FIFO_PARITY_ERR] = CNTR_ELEM("CceRxdmaConvFifoParityErr", 0,
+                               0, CNTR_NORMAL,
+                               access_cce_rxdma_conv_fifo_parity_err_cnt),
+[C_CCE_RCPL_ASYNC_FIFO_PARITY_ERR] = CNTR_ELEM("CceRcplAsyncFifoParityErr", 0,
+                               0, CNTR_NORMAL,
+                               access_cce_rcpl_async_fifo_parity_err_cnt),
+[C_CCE_SEG_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("CceSegWriteBadAddrErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_cce_seg_write_bad_addr_err_cnt),
+[C_CCE_SEG_READ_BAD_ADDR_ERR] = CNTR_ELEM("CceSegReadBadAddrErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_cce_seg_read_bad_addr_err_cnt),
+[C_LA_TRIGGERED] = CNTR_ELEM("Cce LATriggered", 0, 0,
+                               CNTR_NORMAL,
+                               access_la_triggered_cnt),
+[C_CCE_TRGT_CPL_TIMEOUT_ERR] = CNTR_ELEM("CceTrgtCplTimeoutErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_cce_trgt_cpl_timeout_err_cnt),
+[C_PCIC_RECEIVE_PARITY_ERR] = CNTR_ELEM("PcicReceiveParityErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_receive_parity_err_cnt),
+[C_PCIC_TRANSMIT_BACK_PARITY_ERR] = CNTR_ELEM("PcicTransmitBackParityErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_transmit_back_parity_err_cnt),
+[C_PCIC_TRANSMIT_FRONT_PARITY_ERR] = CNTR_ELEM("PcicTransmitFrontParityErr", 0,
+                               0, CNTR_NORMAL,
+                               access_pcic_transmit_front_parity_err_cnt),
+[C_PCIC_CPL_DAT_Q_UNC_ERR] = CNTR_ELEM("PcicCplDatQUncErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_cpl_dat_q_unc_err_cnt),
+[C_PCIC_CPL_HD_Q_UNC_ERR] = CNTR_ELEM("PcicCplHdQUncErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_cpl_hd_q_unc_err_cnt),
+[C_PCIC_POST_DAT_Q_UNC_ERR] = CNTR_ELEM("PcicPostDatQUncErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_post_dat_q_unc_err_cnt),
+[C_PCIC_POST_HD_Q_UNC_ERR] = CNTR_ELEM("PcicPostHdQUncErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_post_hd_q_unc_err_cnt),
+[C_PCIC_RETRY_SOT_MEM_UNC_ERR] = CNTR_ELEM("PcicRetrySotMemUncErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_retry_sot_mem_unc_err_cnt),
+[C_PCIC_RETRY_MEM_UNC_ERR] = CNTR_ELEM("PcicRetryMemUncErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_retry_mem_unc_err),
+[C_PCIC_N_POST_DAT_Q_PARITY_ERR] = CNTR_ELEM("PcicNPostDatQParityErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_n_post_dat_q_parity_err_cnt),
+[C_PCIC_N_POST_H_Q_PARITY_ERR] = CNTR_ELEM("PcicNPostHQParityErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_n_post_h_q_parity_err_cnt),
+[C_PCIC_CPL_DAT_Q_COR_ERR] = CNTR_ELEM("PcicCplDatQCorErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_cpl_dat_q_cor_err_cnt),
+[C_PCIC_CPL_HD_Q_COR_ERR] = CNTR_ELEM("PcicCplHdQCorErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_cpl_hd_q_cor_err_cnt),
+[C_PCIC_POST_DAT_Q_COR_ERR] = CNTR_ELEM("PcicPostDatQCorErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_post_dat_q_cor_err_cnt),
+[C_PCIC_POST_HD_Q_COR_ERR] = CNTR_ELEM("PcicPostHdQCorErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_post_hd_q_cor_err_cnt),
+[C_PCIC_RETRY_SOT_MEM_COR_ERR] = CNTR_ELEM("PcicRetrySotMemCorErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_retry_sot_mem_cor_err_cnt),
+[C_PCIC_RETRY_MEM_COR_ERR] = CNTR_ELEM("PcicRetryMemCorErr", 0, 0,
+                               CNTR_NORMAL,
+                               access_pcic_retry_mem_cor_err_cnt),
+[C_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERR] = CNTR_ELEM(
+                               "CceCli1AsyncFifoDbgParityError", 0, 0,
+                               CNTR_NORMAL,
+                               access_cce_cli1_async_fifo_dbg_parity_err_cnt),
+[C_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERR] = CNTR_ELEM(
+                               "CceCli1AsyncFifoRxdmaParityError", 0, 0,
+                               CNTR_NORMAL,
+                               access_cce_cli1_async_fifo_rxdma_parity_err_cnt
+                               ),
+[C_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR] = CNTR_ELEM(
+                       "CceCli1AsyncFifoSdmaHdParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_cce_cli1_async_fifo_sdma_hd_parity_err_cnt),
+[C_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR] = CNTR_ELEM(
+                       "CceCli1AsyncFifoPioCrdtParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_cce_cl1_async_fifo_pio_crdt_parity_err_cnt),
+[C_CCE_CLI2_ASYNC_FIFO_PARITY_ERR] = CNTR_ELEM("CceCli2AsyncFifoParityErr", 0,
+                       0, CNTR_NORMAL,
+                       access_cce_cli2_async_fifo_parity_err_cnt),
+[C_CCE_CSR_CFG_BUS_PARITY_ERR] = CNTR_ELEM("CceCsrCfgBusParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_cce_csr_cfg_bus_parity_err_cnt),
+[C_CCE_CLI0_ASYNC_FIFO_PARTIY_ERR] = CNTR_ELEM("CceCli0AsyncFifoParityErr", 0,
+                       0, CNTR_NORMAL,
+                       access_cce_cli0_async_fifo_parity_err_cnt),
+[C_CCE_RSPD_DATA_PARITY_ERR] = CNTR_ELEM("CceRspdDataParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_cce_rspd_data_parity_err_cnt),
+[C_CCE_TRGT_ACCESS_ERR] = CNTR_ELEM("CceTrgtAccessErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_cce_trgt_access_err_cnt),
+[C_CCE_TRGT_ASYNC_FIFO_PARITY_ERR] = CNTR_ELEM("CceTrgtAsyncFifoParityErr", 0,
+                       0, CNTR_NORMAL,
+                       access_cce_trgt_async_fifo_parity_err_cnt),
+[C_CCE_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("CceCsrWriteBadAddrErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_cce_csr_write_bad_addr_err_cnt),
+[C_CCE_CSR_READ_BAD_ADDR_ERR] = CNTR_ELEM("CceCsrReadBadAddrErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_cce_csr_read_bad_addr_err_cnt),
+[C_CCE_CSR_PARITY_ERR] = CNTR_ELEM("CceCsrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_ccs_csr_parity_err_cnt),
+
+/* RcvErrStatus */
+[C_RX_CSR_PARITY_ERR] = CNTR_ELEM("RxCsrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_csr_parity_err_cnt),
+[C_RX_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("RxCsrWriteBadAddrErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_csr_write_bad_addr_err_cnt),
+[C_RX_CSR_READ_BAD_ADDR_ERR] = CNTR_ELEM("RxCsrReadBadAddrErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_csr_read_bad_addr_err_cnt),
+[C_RX_DMA_CSR_UNC_ERR] = CNTR_ELEM("RxDmaCsrUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_dma_csr_unc_err_cnt),
+[C_RX_DMA_DQ_FSM_ENCODING_ERR] = CNTR_ELEM("RxDmaDqFsmEncodingErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_dma_dq_fsm_encoding_err_cnt),
+[C_RX_DMA_EQ_FSM_ENCODING_ERR] = CNTR_ELEM("RxDmaEqFsmEncodingErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_dma_eq_fsm_encoding_err_cnt),
+[C_RX_DMA_CSR_PARITY_ERR] = CNTR_ELEM("RxDmaCsrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_dma_csr_parity_err_cnt),
+[C_RX_RBUF_DATA_COR_ERR] = CNTR_ELEM("RxRbufDataCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_data_cor_err_cnt),
+[C_RX_RBUF_DATA_UNC_ERR] = CNTR_ELEM("RxRbufDataUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_data_unc_err_cnt),
+[C_RX_DMA_DATA_FIFO_RD_COR_ERR] = CNTR_ELEM("RxDmaDataFifoRdCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_dma_data_fifo_rd_cor_err_cnt),
+[C_RX_DMA_DATA_FIFO_RD_UNC_ERR] = CNTR_ELEM("RxDmaDataFifoRdUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_dma_data_fifo_rd_unc_err_cnt),
+[C_RX_DMA_HDR_FIFO_RD_COR_ERR] = CNTR_ELEM("RxDmaHdrFifoRdCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_dma_hdr_fifo_rd_cor_err_cnt),
+[C_RX_DMA_HDR_FIFO_RD_UNC_ERR] = CNTR_ELEM("RxDmaHdrFifoRdUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_dma_hdr_fifo_rd_unc_err_cnt),
+[C_RX_RBUF_DESC_PART2_COR_ERR] = CNTR_ELEM("RxRbufDescPart2CorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_desc_part2_cor_err_cnt),
+[C_RX_RBUF_DESC_PART2_UNC_ERR] = CNTR_ELEM("RxRbufDescPart2UncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_desc_part2_unc_err_cnt),
+[C_RX_RBUF_DESC_PART1_COR_ERR] = CNTR_ELEM("RxRbufDescPart1CorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_desc_part1_cor_err_cnt),
+[C_RX_RBUF_DESC_PART1_UNC_ERR] = CNTR_ELEM("RxRbufDescPart1UncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_desc_part1_unc_err_cnt),
+[C_RX_HQ_INTR_FSM_ERR] = CNTR_ELEM("RxHqIntrFsmErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_hq_intr_fsm_err_cnt),
+[C_RX_HQ_INTR_CSR_PARITY_ERR] = CNTR_ELEM("RxHqIntrCsrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_hq_intr_csr_parity_err_cnt),
+[C_RX_LOOKUP_CSR_PARITY_ERR] = CNTR_ELEM("RxLookupCsrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_lookup_csr_parity_err_cnt),
+[C_RX_LOOKUP_RCV_ARRAY_COR_ERR] = CNTR_ELEM("RxLookupRcvArrayCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_lookup_rcv_array_cor_err_cnt),
+[C_RX_LOOKUP_RCV_ARRAY_UNC_ERR] = CNTR_ELEM("RxLookupRcvArrayUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_lookup_rcv_array_unc_err_cnt),
+[C_RX_LOOKUP_DES_PART2_PARITY_ERR] = CNTR_ELEM("RxLookupDesPart2ParityErr", 0,
+                       0, CNTR_NORMAL,
+                       access_rx_lookup_des_part2_parity_err_cnt),
+[C_RX_LOOKUP_DES_PART1_UNC_COR_ERR] = CNTR_ELEM("RxLookupDesPart1UncCorErr", 0,
+                       0, CNTR_NORMAL,
+                       access_rx_lookup_des_part1_unc_cor_err_cnt),
+[C_RX_LOOKUP_DES_PART1_UNC_ERR] = CNTR_ELEM("RxLookupDesPart1UncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_lookup_des_part1_unc_err_cnt),
+[C_RX_RBUF_NEXT_FREE_BUF_COR_ERR] = CNTR_ELEM("RxRbufNextFreeBufCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_next_free_buf_cor_err_cnt),
+[C_RX_RBUF_NEXT_FREE_BUF_UNC_ERR] = CNTR_ELEM("RxRbufNextFreeBufUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_next_free_buf_unc_err_cnt),
+[C_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR] = CNTR_ELEM(
+                       "RxRbufFlInitWrAddrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rbuf_fl_init_wr_addr_parity_err_cnt),
+[C_RX_RBUF_FL_INITDONE_PARITY_ERR] = CNTR_ELEM("RxRbufFlInitdoneParityErr", 0,
+                       0, CNTR_NORMAL,
+                       access_rx_rbuf_fl_initdone_parity_err_cnt),
+[C_RX_RBUF_FL_WRITE_ADDR_PARITY_ERR] = CNTR_ELEM("RxRbufFlWrAddrParityErr", 0,
+                       0, CNTR_NORMAL,
+                       access_rx_rbuf_fl_write_addr_parity_err_cnt),
+[C_RX_RBUF_FL_RD_ADDR_PARITY_ERR] = CNTR_ELEM("RxRbufFlRdAddrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_fl_rd_addr_parity_err_cnt),
+[C_RX_RBUF_EMPTY_ERR] = CNTR_ELEM("RxRbufEmptyErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_empty_err_cnt),
+[C_RX_RBUF_FULL_ERR] = CNTR_ELEM("RxRbufFullErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_full_err_cnt),
+[C_RX_RBUF_BAD_LOOKUP_ERR] = CNTR_ELEM("RxRBufBadLookupErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rbuf_bad_lookup_err_cnt),
+[C_RX_RBUF_CTX_ID_PARITY_ERR] = CNTR_ELEM("RxRbufCtxIdParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rbuf_ctx_id_parity_err_cnt),
+[C_RX_RBUF_CSR_QEOPDW_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQEOPDWParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rbuf_csr_qeopdw_parity_err_cnt),
+[C_RX_RBUF_CSR_Q_NUM_OF_PKT_PARITY_ERR] = CNTR_ELEM(
+                       "RxRbufCsrQNumOfPktParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_csr_q_num_of_pkt_parity_err_cnt),
+[C_RX_RBUF_CSR_Q_T1_PTR_PARITY_ERR] = CNTR_ELEM(
+                       "RxRbufCsrQTlPtrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_csr_q_t1_ptr_parity_err_cnt),
+[C_RX_RBUF_CSR_Q_HD_PTR_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQHdPtrParityErr", 0,
+                       0, CNTR_NORMAL,
+                       access_rx_rbuf_csr_q_hd_ptr_parity_err_cnt),
+[C_RX_RBUF_CSR_Q_VLD_BIT_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQVldBitParityErr", 0,
+                       0, CNTR_NORMAL,
+                       access_rx_rbuf_csr_q_vld_bit_parity_err_cnt),
+[C_RX_RBUF_CSR_Q_NEXT_BUF_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQNextBufParityErr",
+                       0, 0, CNTR_NORMAL,
+                       access_rx_rbuf_csr_q_next_buf_parity_err_cnt),
+[C_RX_RBUF_CSR_Q_ENT_CNT_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQEntCntParityErr", 0,
+                       0, CNTR_NORMAL,
+                       access_rx_rbuf_csr_q_ent_cnt_parity_err_cnt),
+[C_RX_RBUF_CSR_Q_HEAD_BUF_NUM_PARITY_ERR] = CNTR_ELEM(
+                       "RxRbufCsrQHeadBufNumParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_csr_q_head_buf_num_parity_err_cnt),
+[C_RX_RBUF_BLOCK_LIST_READ_COR_ERR] = CNTR_ELEM("RxRbufBlockListReadCorErr", 0,
+                       0, CNTR_NORMAL,
+                       access_rx_rbuf_block_list_read_cor_err_cnt),
+[C_RX_RBUF_BLOCK_LIST_READ_UNC_ERR] = CNTR_ELEM("RxRbufBlockListReadUncErr", 0,
+                       0, CNTR_NORMAL,
+                       access_rx_rbuf_block_list_read_unc_err_cnt),
+[C_RX_RBUF_LOOKUP_DES_COR_ERR] = CNTR_ELEM("RxRbufLookupDesCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_lookup_des_cor_err_cnt),
+[C_RX_RBUF_LOOKUP_DES_UNC_ERR] = CNTR_ELEM("RxRbufLookupDesUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_lookup_des_unc_err_cnt),
+[C_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR] = CNTR_ELEM(
+                       "RxRbufLookupDesRegUncCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_lookup_des_reg_unc_cor_err_cnt),
+[C_RX_RBUF_LOOKUP_DES_REG_UNC_ERR] = CNTR_ELEM("RxRbufLookupDesRegUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_lookup_des_reg_unc_err_cnt),
+[C_RX_RBUF_FREE_LIST_COR_ERR] = CNTR_ELEM("RxRbufFreeListCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_free_list_cor_err_cnt),
+[C_RX_RBUF_FREE_LIST_UNC_ERR] = CNTR_ELEM("RxRbufFreeListUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rbuf_free_list_unc_err_cnt),
+[C_RX_RCV_FSM_ENCODING_ERR] = CNTR_ELEM("RxRcvFsmEncodingErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rcv_fsm_encoding_err_cnt),
+[C_RX_DMA_FLAG_COR_ERR] = CNTR_ELEM("RxDmaFlagCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_dma_flag_cor_err_cnt),
+[C_RX_DMA_FLAG_UNC_ERR] = CNTR_ELEM("RxDmaFlagUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_dma_flag_unc_err_cnt),
+[C_RX_DC_SOP_EOP_PARITY_ERR] = CNTR_ELEM("RxDcSopEopParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_dc_sop_eop_parity_err_cnt),
+[C_RX_RCV_CSR_PARITY_ERR] = CNTR_ELEM("RxRcvCsrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rcv_csr_parity_err_cnt),
+[C_RX_RCV_QP_MAP_TABLE_COR_ERR] = CNTR_ELEM("RxRcvQpMapTableCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rcv_qp_map_table_cor_err_cnt),
+[C_RX_RCV_QP_MAP_TABLE_UNC_ERR] = CNTR_ELEM("RxRcvQpMapTableUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rcv_qp_map_table_unc_err_cnt),
+[C_RX_RCV_DATA_COR_ERR] = CNTR_ELEM("RxRcvDataCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rcv_data_cor_err_cnt),
+[C_RX_RCV_DATA_UNC_ERR] = CNTR_ELEM("RxRcvDataUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rcv_data_unc_err_cnt),
+[C_RX_RCV_HDR_COR_ERR] = CNTR_ELEM("RxRcvHdrCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rcv_hdr_cor_err_cnt),
+[C_RX_RCV_HDR_UNC_ERR] = CNTR_ELEM("RxRcvHdrUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_rcv_hdr_unc_err_cnt),
+[C_RX_DC_INTF_PARITY_ERR] = CNTR_ELEM("RxDcIntfParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_dc_intf_parity_err_cnt),
+[C_RX_DMA_CSR_COR_ERR] = CNTR_ELEM("RxDmaCsrCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_rx_dma_csr_cor_err_cnt),
+/* SendPioErrStatus */
+[C_PIO_PEC_SOP_HEAD_PARITY_ERR] = CNTR_ELEM("PioPecSopHeadParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_pec_sop_head_parity_err_cnt),
+[C_PIO_PCC_SOP_HEAD_PARITY_ERR] = CNTR_ELEM("PioPccSopHeadParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_pcc_sop_head_parity_err_cnt),
+[C_PIO_LAST_RETURNED_CNT_PARITY_ERR] = CNTR_ELEM("PioLastReturnedCntParityErr",
+                       0, 0, CNTR_NORMAL,
+                       access_pio_last_returned_cnt_parity_err_cnt),
+[C_PIO_CURRENT_FREE_CNT_PARITY_ERR] = CNTR_ELEM("PioCurrentFreeCntParityErr", 0,
+                       0, CNTR_NORMAL,
+                       access_pio_current_free_cnt_parity_err_cnt),
+[C_PIO_RSVD_31_ERR] = CNTR_ELEM("Pio Reserved 31", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_reserved_31_err_cnt),
+[C_PIO_RSVD_30_ERR] = CNTR_ELEM("Pio Reserved 30", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_reserved_30_err_cnt),
+[C_PIO_PPMC_SOP_LEN_ERR] = CNTR_ELEM("PioPpmcSopLenErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_ppmc_sop_len_err_cnt),
+[C_PIO_PPMC_BQC_MEM_PARITY_ERR] = CNTR_ELEM("PioPpmcBqcMemParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_ppmc_bqc_mem_parity_err_cnt),
+[C_PIO_VL_FIFO_PARITY_ERR] = CNTR_ELEM("PioVlFifoParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_vl_fifo_parity_err_cnt),
+[C_PIO_VLF_SOP_PARITY_ERR] = CNTR_ELEM("PioVlfSopParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_vlf_sop_parity_err_cnt),
+[C_PIO_VLF_V1_LEN_PARITY_ERR] = CNTR_ELEM("PioVlfVlLenParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_vlf_v1_len_parity_err_cnt),
+[C_PIO_BLOCK_QW_COUNT_PARITY_ERR] = CNTR_ELEM("PioBlockQwCountParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_block_qw_count_parity_err_cnt),
+[C_PIO_WRITE_QW_VALID_PARITY_ERR] = CNTR_ELEM("PioWriteQwValidParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_write_qw_valid_parity_err_cnt),
+[C_PIO_STATE_MACHINE_ERR] = CNTR_ELEM("PioStateMachineErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_state_machine_err_cnt),
+[C_PIO_WRITE_DATA_PARITY_ERR] = CNTR_ELEM("PioWriteDataParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_write_data_parity_err_cnt),
+[C_PIO_HOST_ADDR_MEM_COR_ERR] = CNTR_ELEM("PioHostAddrMemCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_host_addr_mem_cor_err_cnt),
+[C_PIO_HOST_ADDR_MEM_UNC_ERR] = CNTR_ELEM("PioHostAddrMemUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_host_addr_mem_unc_err_cnt),
+[C_PIO_PKT_EVICT_SM_OR_ARM_SM_ERR] = CNTR_ELEM("PioPktEvictSmOrArbSmErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_pkt_evict_sm_or_arb_sm_err_cnt),
+[C_PIO_INIT_SM_IN_ERR] = CNTR_ELEM("PioInitSmInErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_init_sm_in_err_cnt),
+[C_PIO_PPMC_PBL_FIFO_ERR] = CNTR_ELEM("PioPpmcPblFifoErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_ppmc_pbl_fifo_err_cnt),
+[C_PIO_CREDIT_RET_FIFO_PARITY_ERR] = CNTR_ELEM("PioCreditRetFifoParityErr", 0,
+                       0, CNTR_NORMAL,
+                       access_pio_credit_ret_fifo_parity_err_cnt),
+[C_PIO_V1_LEN_MEM_BANK1_COR_ERR] = CNTR_ELEM("PioVlLenMemBank1CorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_v1_len_mem_bank1_cor_err_cnt),
+[C_PIO_V1_LEN_MEM_BANK0_COR_ERR] = CNTR_ELEM("PioVlLenMemBank0CorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_v1_len_mem_bank0_cor_err_cnt),
+[C_PIO_V1_LEN_MEM_BANK1_UNC_ERR] = CNTR_ELEM("PioVlLenMemBank1UncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_v1_len_mem_bank1_unc_err_cnt),
+[C_PIO_V1_LEN_MEM_BANK0_UNC_ERR] = CNTR_ELEM("PioVlLenMemBank0UncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_v1_len_mem_bank0_unc_err_cnt),
+[C_PIO_SM_PKT_RESET_PARITY_ERR] = CNTR_ELEM("PioSmPktResetParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_sm_pkt_reset_parity_err_cnt),
+[C_PIO_PKT_EVICT_FIFO_PARITY_ERR] = CNTR_ELEM("PioPktEvictFifoParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_pkt_evict_fifo_parity_err_cnt),
+[C_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR] = CNTR_ELEM(
+                       "PioSbrdctrlCrrelFifoParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_sbrdctrl_crrel_fifo_parity_err_cnt),
+[C_PIO_SBRDCTL_CRREL_PARITY_ERR] = CNTR_ELEM("PioSbrdctlCrrelParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_sbrdctl_crrel_parity_err_cnt),
+[C_PIO_PEC_FIFO_PARITY_ERR] = CNTR_ELEM("PioPecFifoParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_pec_fifo_parity_err_cnt),
+[C_PIO_PCC_FIFO_PARITY_ERR] = CNTR_ELEM("PioPccFifoParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_pcc_fifo_parity_err_cnt),
+[C_PIO_SB_MEM_FIFO1_ERR] = CNTR_ELEM("PioSbMemFifo1Err", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_sb_mem_fifo1_err_cnt),
+[C_PIO_SB_MEM_FIFO0_ERR] = CNTR_ELEM("PioSbMemFifo0Err", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_sb_mem_fifo0_err_cnt),
+[C_PIO_CSR_PARITY_ERR] = CNTR_ELEM("PioCsrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_csr_parity_err_cnt),
+[C_PIO_WRITE_ADDR_PARITY_ERR] = CNTR_ELEM("PioWriteAddrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_write_addr_parity_err_cnt),
+[C_PIO_WRITE_BAD_CTXT_ERR] = CNTR_ELEM("PioWriteBadCtxtErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_write_bad_ctxt_err_cnt),
+/* SendDmaErrStatus */
+[C_SDMA_PCIE_REQ_TRACKING_COR_ERR] = CNTR_ELEM("SDmaPcieReqTrackingCorErr", 0,
+                       0, CNTR_NORMAL,
+                       access_sdma_pcie_req_tracking_cor_err_cnt),
+[C_SDMA_PCIE_REQ_TRACKING_UNC_ERR] = CNTR_ELEM("SDmaPcieReqTrackingUncErr", 0,
+                       0, CNTR_NORMAL,
+                       access_sdma_pcie_req_tracking_unc_err_cnt),
+[C_SDMA_CSR_PARITY_ERR] = CNTR_ELEM("SDmaCsrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_csr_parity_err_cnt),
+[C_SDMA_RPY_TAG_ERR] = CNTR_ELEM("SDmaRpyTagErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_rpy_tag_err_cnt),
+/* SendEgressErrStatus */
+[C_TX_READ_PIO_MEMORY_CSR_UNC_ERR] = CNTR_ELEM("TxReadPioMemoryCsrUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_read_pio_memory_csr_unc_err_cnt),
+[C_TX_READ_SDMA_MEMORY_CSR_UNC_ERR] = CNTR_ELEM("TxReadSdmaMemoryCsrUncErr", 0,
+                       0, CNTR_NORMAL,
+                       access_tx_read_sdma_memory_csr_err_cnt),
+[C_TX_EGRESS_FIFO_COR_ERR] = CNTR_ELEM("TxEgressFifoCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_egress_fifo_cor_err_cnt),
+[C_TX_READ_PIO_MEMORY_COR_ERR] = CNTR_ELEM("TxReadPioMemoryCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_read_pio_memory_cor_err_cnt),
+[C_TX_READ_SDMA_MEMORY_COR_ERR] = CNTR_ELEM("TxReadSdmaMemoryCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_read_sdma_memory_cor_err_cnt),
+[C_TX_SB_HDR_COR_ERR] = CNTR_ELEM("TxSbHdrCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_sb_hdr_cor_err_cnt),
+[C_TX_CREDIT_OVERRUN_ERR] = CNTR_ELEM("TxCreditOverrunErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_credit_overrun_err_cnt),
+[C_TX_LAUNCH_FIFO8_COR_ERR] = CNTR_ELEM("TxLaunchFifo8CorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_launch_fifo8_cor_err_cnt),
+[C_TX_LAUNCH_FIFO7_COR_ERR] = CNTR_ELEM("TxLaunchFifo7CorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_launch_fifo7_cor_err_cnt),
+[C_TX_LAUNCH_FIFO6_COR_ERR] = CNTR_ELEM("TxLaunchFifo6CorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_launch_fifo6_cor_err_cnt),
+[C_TX_LAUNCH_FIFO5_COR_ERR] = CNTR_ELEM("TxLaunchFifo5CorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_launch_fifo5_cor_err_cnt),
+[C_TX_LAUNCH_FIFO4_COR_ERR] = CNTR_ELEM("TxLaunchFifo4CorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_launch_fifo4_cor_err_cnt),
+[C_TX_LAUNCH_FIFO3_COR_ERR] = CNTR_ELEM("TxLaunchFifo3CorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_launch_fifo3_cor_err_cnt),
+[C_TX_LAUNCH_FIFO2_COR_ERR] = CNTR_ELEM("TxLaunchFifo2CorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_launch_fifo2_cor_err_cnt),
+[C_TX_LAUNCH_FIFO1_COR_ERR] = CNTR_ELEM("TxLaunchFifo1CorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_launch_fifo1_cor_err_cnt),
+[C_TX_LAUNCH_FIFO0_COR_ERR] = CNTR_ELEM("TxLaunchFifo0CorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_launch_fifo0_cor_err_cnt),
+[C_TX_CREDIT_RETURN_VL_ERR] = CNTR_ELEM("TxCreditReturnVLErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_credit_return_vl_err_cnt),
+[C_TX_HCRC_INSERTION_ERR] = CNTR_ELEM("TxHcrcInsertionErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_hcrc_insertion_err_cnt),
+[C_TX_EGRESS_FIFI_UNC_ERR] = CNTR_ELEM("TxEgressFifoUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_egress_fifo_unc_err_cnt),
+[C_TX_READ_PIO_MEMORY_UNC_ERR] = CNTR_ELEM("TxReadPioMemoryUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_read_pio_memory_unc_err_cnt),
+[C_TX_READ_SDMA_MEMORY_UNC_ERR] = CNTR_ELEM("TxReadSdmaMemoryUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_read_sdma_memory_unc_err_cnt),
+[C_TX_SB_HDR_UNC_ERR] = CNTR_ELEM("TxSbHdrUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_sb_hdr_unc_err_cnt),
+[C_TX_CREDIT_RETURN_PARITY_ERR] = CNTR_ELEM("TxCreditReturnParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_credit_return_partiy_err_cnt),
+[C_TX_LAUNCH_FIFO8_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo8UncOrParityErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_launch_fifo8_unc_or_parity_err_cnt),
+[C_TX_LAUNCH_FIFO7_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo7UncOrParityErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_launch_fifo7_unc_or_parity_err_cnt),
+[C_TX_LAUNCH_FIFO6_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo6UncOrParityErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_launch_fifo6_unc_or_parity_err_cnt),
+[C_TX_LAUNCH_FIFO5_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo5UncOrParityErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_launch_fifo5_unc_or_parity_err_cnt),
+[C_TX_LAUNCH_FIFO4_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo4UncOrParityErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_launch_fifo4_unc_or_parity_err_cnt),
+[C_TX_LAUNCH_FIFO3_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo3UncOrParityErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_launch_fifo3_unc_or_parity_err_cnt),
+[C_TX_LAUNCH_FIFO2_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo2UncOrParityErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_launch_fifo2_unc_or_parity_err_cnt),
+[C_TX_LAUNCH_FIFO1_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo1UncOrParityErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_launch_fifo1_unc_or_parity_err_cnt),
+[C_TX_LAUNCH_FIFO0_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo0UncOrParityErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_launch_fifo0_unc_or_parity_err_cnt),
+[C_TX_SDMA15_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma15DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma15_disallowed_packet_err_cnt),
+[C_TX_SDMA14_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma14DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma14_disallowed_packet_err_cnt),
+[C_TX_SDMA13_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma13DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma13_disallowed_packet_err_cnt),
+[C_TX_SDMA12_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma12DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma12_disallowed_packet_err_cnt),
+[C_TX_SDMA11_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma11DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma11_disallowed_packet_err_cnt),
+[C_TX_SDMA10_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma10DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma10_disallowed_packet_err_cnt),
+[C_TX_SDMA9_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma9DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma9_disallowed_packet_err_cnt),
+[C_TX_SDMA8_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma8DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma8_disallowed_packet_err_cnt),
+[C_TX_SDMA7_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma7DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma7_disallowed_packet_err_cnt),
+[C_TX_SDMA6_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma6DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma6_disallowed_packet_err_cnt),
+[C_TX_SDMA5_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma5DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma5_disallowed_packet_err_cnt),
+[C_TX_SDMA4_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma4DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma4_disallowed_packet_err_cnt),
+[C_TX_SDMA3_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma3DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma3_disallowed_packet_err_cnt),
+[C_TX_SDMA2_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma2DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma2_disallowed_packet_err_cnt),
+[C_TX_SDMA1_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma1DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma1_disallowed_packet_err_cnt),
+[C_TX_SDMA0_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma0DisallowedPacketErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma0_disallowed_packet_err_cnt),
+[C_TX_CONFIG_PARITY_ERR] = CNTR_ELEM("TxConfigParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_config_parity_err_cnt),
+[C_TX_SBRD_CTL_CSR_PARITY_ERR] = CNTR_ELEM("TxSbrdCtlCsrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_sbrd_ctl_csr_parity_err_cnt),
+[C_TX_LAUNCH_CSR_PARITY_ERR] = CNTR_ELEM("TxLaunchCsrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_launch_csr_parity_err_cnt),
+[C_TX_ILLEGAL_CL_ERR] = CNTR_ELEM("TxIllegalVLErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_illegal_vl_err_cnt),
+[C_TX_SBRD_CTL_STATE_MACHINE_PARITY_ERR] = CNTR_ELEM(
+                       "TxSbrdCtlStateMachineParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_sbrd_ctl_state_machine_parity_err_cnt),
+[C_TX_RESERVED_10] = CNTR_ELEM("Tx Egress Reserved 10", 0, 0,
+                       CNTR_NORMAL,
+                       access_egress_reserved_10_err_cnt),
+[C_TX_RESERVED_9] = CNTR_ELEM("Tx Egress Reserved 9", 0, 0,
+                       CNTR_NORMAL,
+                       access_egress_reserved_9_err_cnt),
+[C_TX_SDMA_LAUNCH_INTF_PARITY_ERR] = CNTR_ELEM("TxSdmaLaunchIntfParityErr",
+                       0, 0, CNTR_NORMAL,
+                       access_tx_sdma_launch_intf_parity_err_cnt),
+[C_TX_PIO_LAUNCH_INTF_PARITY_ERR] = CNTR_ELEM("TxPioLaunchIntfParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_pio_launch_intf_parity_err_cnt),
+[C_TX_RESERVED_6] = CNTR_ELEM("Tx Egress Reserved 6", 0, 0,
+                       CNTR_NORMAL,
+                       access_egress_reserved_6_err_cnt),
+[C_TX_INCORRECT_LINK_STATE_ERR] = CNTR_ELEM("TxIncorrectLinkStateErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_incorrect_link_state_err_cnt),
+[C_TX_LINK_DOWN_ERR] = CNTR_ELEM("TxLinkdownErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_linkdown_err_cnt),
+[C_TX_EGRESS_FIFO_UNDERRUN_OR_PARITY_ERR] = CNTR_ELEM(
+                       "EgressFifoUnderrunOrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_egress_fifi_underrun_or_parity_err_cnt),
+[C_TX_RESERVED_2] = CNTR_ELEM("Tx Egress Reserved 2", 0, 0,
+                       CNTR_NORMAL,
+                       access_egress_reserved_2_err_cnt),
+[C_TX_PKT_INTEGRITY_MEM_UNC_ERR] = CNTR_ELEM("TxPktIntegrityMemUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_pkt_integrity_mem_unc_err_cnt),
+[C_TX_PKT_INTEGRITY_MEM_COR_ERR] = CNTR_ELEM("TxPktIntegrityMemCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_tx_pkt_integrity_mem_cor_err_cnt),
+/* SendErrStatus */
+[C_SEND_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("SendCsrWriteBadAddrErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_send_csr_write_bad_addr_err_cnt),
+[C_SEND_CSR_READ_BAD_ADD_ERR] = CNTR_ELEM("SendCsrReadBadAddrErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_send_csr_read_bad_addr_err_cnt),
+[C_SEND_CSR_PARITY_ERR] = CNTR_ELEM("SendCsrParityErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_send_csr_parity_cnt),
+/* SendCtxtErrStatus */
+[C_PIO_WRITE_OUT_OF_BOUNDS_ERR] = CNTR_ELEM("PioWriteOutOfBoundsErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_write_out_of_bounds_err_cnt),
+[C_PIO_WRITE_OVERFLOW_ERR] = CNTR_ELEM("PioWriteOverflowErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_write_overflow_err_cnt),
+[C_PIO_WRITE_CROSSES_BOUNDARY_ERR] = CNTR_ELEM("PioWriteCrossesBoundaryErr",
+                       0, 0, CNTR_NORMAL,
+                       access_pio_write_crosses_boundary_err_cnt),
+[C_PIO_DISALLOWED_PACKET_ERR] = CNTR_ELEM("PioDisallowedPacketErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_disallowed_packet_err_cnt),
+[C_PIO_INCONSISTENT_SOP_ERR] = CNTR_ELEM("PioInconsistentSopErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_pio_inconsistent_sop_err_cnt),
+/* SendDmaEngErrStatus */
+[C_SDMA_HEADER_REQUEST_FIFO_COR_ERR] = CNTR_ELEM("SDmaHeaderRequestFifoCorErr",
+                       0, 0, CNTR_NORMAL,
+                       access_sdma_header_request_fifo_cor_err_cnt),
+[C_SDMA_HEADER_STORAGE_COR_ERR] = CNTR_ELEM("SDmaHeaderStorageCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_header_storage_cor_err_cnt),
+[C_SDMA_PACKET_TRACKING_COR_ERR] = CNTR_ELEM("SDmaPacketTrackingCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_packet_tracking_cor_err_cnt),
+[C_SDMA_ASSEMBLY_COR_ERR] = CNTR_ELEM("SDmaAssemblyCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_assembly_cor_err_cnt),
+[C_SDMA_DESC_TABLE_COR_ERR] = CNTR_ELEM("SDmaDescTableCorErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_desc_table_cor_err_cnt),
+[C_SDMA_HEADER_REQUEST_FIFO_UNC_ERR] = CNTR_ELEM("SDmaHeaderRequestFifoUncErr",
+                       0, 0, CNTR_NORMAL,
+                       access_sdma_header_request_fifo_unc_err_cnt),
+[C_SDMA_HEADER_STORAGE_UNC_ERR] = CNTR_ELEM("SDmaHeaderStorageUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_header_storage_unc_err_cnt),
+[C_SDMA_PACKET_TRACKING_UNC_ERR] = CNTR_ELEM("SDmaPacketTrackingUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_packet_tracking_unc_err_cnt),
+[C_SDMA_ASSEMBLY_UNC_ERR] = CNTR_ELEM("SDmaAssemblyUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_assembly_unc_err_cnt),
+[C_SDMA_DESC_TABLE_UNC_ERR] = CNTR_ELEM("SDmaDescTableUncErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_desc_table_unc_err_cnt),
+[C_SDMA_TIMEOUT_ERR] = CNTR_ELEM("SDmaTimeoutErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_timeout_err_cnt),
+[C_SDMA_HEADER_LENGTH_ERR] = CNTR_ELEM("SDmaHeaderLengthErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_header_length_err_cnt),
+[C_SDMA_HEADER_ADDRESS_ERR] = CNTR_ELEM("SDmaHeaderAddressErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_header_address_err_cnt),
+[C_SDMA_HEADER_SELECT_ERR] = CNTR_ELEM("SDmaHeaderSelectErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_header_select_err_cnt),
+[C_SMDA_RESERVED_9] = CNTR_ELEM("SDma Reserved 9", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_reserved_9_err_cnt),
+[C_SDMA_PACKET_DESC_OVERFLOW_ERR] = CNTR_ELEM("SDmaPacketDescOverflowErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_packet_desc_overflow_err_cnt),
+[C_SDMA_LENGTH_MISMATCH_ERR] = CNTR_ELEM("SDmaLengthMismatchErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_length_mismatch_err_cnt),
+[C_SDMA_HALT_ERR] = CNTR_ELEM("SDmaHaltErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_halt_err_cnt),
+[C_SDMA_MEM_READ_ERR] = CNTR_ELEM("SDmaMemReadErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_mem_read_err_cnt),
+[C_SDMA_FIRST_DESC_ERR] = CNTR_ELEM("SDmaFirstDescErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_first_desc_err_cnt),
+[C_SDMA_TAIL_OUT_OF_BOUNDS_ERR] = CNTR_ELEM("SDmaTailOutOfBoundsErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_tail_out_of_bounds_err_cnt),
+[C_SDMA_TOO_LONG_ERR] = CNTR_ELEM("SDmaTooLongErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_too_long_err_cnt),
+[C_SDMA_GEN_MISMATCH_ERR] = CNTR_ELEM("SDmaGenMismatchErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_gen_mismatch_err_cnt),
+[C_SDMA_WRONG_DW_ERR] = CNTR_ELEM("SDmaWrongDwErr", 0, 0,
+                       CNTR_NORMAL,
+                       access_sdma_wrong_dw_err_cnt),
+};
+
+static struct cntr_entry port_cntrs[PORT_CNTR_LAST] = {
+[C_TX_UNSUP_VL] = TXE32_PORT_CNTR_ELEM(TxUnVLErr, SEND_UNSUP_VL_ERR_CNT,
+                       CNTR_NORMAL),
+[C_TX_INVAL_LEN] = TXE32_PORT_CNTR_ELEM(TxInvalLen, SEND_LEN_ERR_CNT,
+                       CNTR_NORMAL),
+[C_TX_MM_LEN_ERR] = TXE32_PORT_CNTR_ELEM(TxMMLenErr, SEND_MAX_MIN_LEN_ERR_CNT,
+                       CNTR_NORMAL),
+[C_TX_UNDERRUN] = TXE32_PORT_CNTR_ELEM(TxUnderrun, SEND_UNDERRUN_CNT,
+                       CNTR_NORMAL),
+[C_TX_FLOW_STALL] = TXE32_PORT_CNTR_ELEM(TxFlowStall, SEND_FLOW_STALL_CNT,
+                       CNTR_NORMAL),
+[C_TX_DROPPED] = TXE32_PORT_CNTR_ELEM(TxDropped, SEND_DROPPED_PKT_CNT,
+                       CNTR_NORMAL),
+[C_TX_HDR_ERR] = TXE32_PORT_CNTR_ELEM(TxHdrErr, SEND_HEADERS_ERR_CNT,
+                       CNTR_NORMAL),
+[C_TX_PKT] = TXE64_PORT_CNTR_ELEM(TxPkt, SEND_DATA_PKT_CNT, CNTR_NORMAL),
+[C_TX_WORDS] = TXE64_PORT_CNTR_ELEM(TxWords, SEND_DWORD_CNT, CNTR_NORMAL),
+[C_TX_WAIT] = TXE64_PORT_CNTR_ELEM(TxWait, SEND_WAIT_CNT, CNTR_SYNTH),
+[C_TX_FLIT_VL] = TXE64_PORT_CNTR_ELEM(TxFlitVL, SEND_DATA_VL0_CNT,
+                                     CNTR_SYNTH | CNTR_VL),
+[C_TX_PKT_VL] = TXE64_PORT_CNTR_ELEM(TxPktVL, SEND_DATA_PKT_VL0_CNT,
+                                    CNTR_SYNTH | CNTR_VL),
+[C_TX_WAIT_VL] = TXE64_PORT_CNTR_ELEM(TxWaitVL, SEND_WAIT_VL0_CNT,
+                                     CNTR_SYNTH | CNTR_VL),
+[C_RX_PKT] = RXE64_PORT_CNTR_ELEM(RxPkt, RCV_DATA_PKT_CNT, CNTR_NORMAL),
+[C_RX_WORDS] = RXE64_PORT_CNTR_ELEM(RxWords, RCV_DWORD_CNT, CNTR_NORMAL),
+[C_SW_LINK_DOWN] = CNTR_ELEM("SwLinkDown", 0, 0, CNTR_SYNTH | CNTR_32BIT,
+                            access_sw_link_dn_cnt),
+[C_SW_LINK_UP] = CNTR_ELEM("SwLinkUp", 0, 0, CNTR_SYNTH | CNTR_32BIT,
+                          access_sw_link_up_cnt),
+[C_SW_UNKNOWN_FRAME] = CNTR_ELEM("UnknownFrame", 0, 0, CNTR_NORMAL,
+                                access_sw_unknown_frame_cnt),
+[C_SW_XMIT_DSCD] = CNTR_ELEM("XmitDscd", 0, 0, CNTR_SYNTH | CNTR_32BIT,
+                            access_sw_xmit_discards),
+[C_SW_XMIT_DSCD_VL] = CNTR_ELEM("XmitDscdVl", 0, 0,
+                               CNTR_SYNTH | CNTR_32BIT | CNTR_VL,
+                               access_sw_xmit_discards),
+[C_SW_XMIT_CSTR_ERR] = CNTR_ELEM("XmitCstrErr", 0, 0, CNTR_SYNTH,
+                                access_xmit_constraint_errs),
+[C_SW_RCV_CSTR_ERR] = CNTR_ELEM("RcvCstrErr", 0, 0, CNTR_SYNTH,
+                               access_rcv_constraint_errs),
+[C_SW_IBP_LOOP_PKTS] = SW_IBP_CNTR(LoopPkts, loop_pkts),
+[C_SW_IBP_RC_RESENDS] = SW_IBP_CNTR(RcResend, rc_resends),
+[C_SW_IBP_RNR_NAKS] = SW_IBP_CNTR(RnrNak, rnr_naks),
+[C_SW_IBP_OTHER_NAKS] = SW_IBP_CNTR(OtherNak, other_naks),
+[C_SW_IBP_RC_TIMEOUTS] = SW_IBP_CNTR(RcTimeOut, rc_timeouts),
+[C_SW_IBP_PKT_DROPS] = SW_IBP_CNTR(PktDrop, pkt_drops),
+[C_SW_IBP_DMA_WAIT] = SW_IBP_CNTR(DmaWait, dmawait),
+[C_SW_IBP_RC_SEQNAK] = SW_IBP_CNTR(RcSeqNak, rc_seqnak),
+[C_SW_IBP_RC_DUPREQ] = SW_IBP_CNTR(RcDupRew, rc_dupreq),
+[C_SW_IBP_RDMA_SEQ] = SW_IBP_CNTR(RdmaSeq, rdma_seq),
+[C_SW_IBP_UNALIGNED] = SW_IBP_CNTR(Unaligned, unaligned),
+[C_SW_IBP_SEQ_NAK] = SW_IBP_CNTR(SeqNak, seq_naks),
+[C_SW_CPU_RC_ACKS] = CNTR_ELEM("RcAcks", 0, 0, CNTR_NORMAL,
+                              access_sw_cpu_rc_acks),
+[C_SW_CPU_RC_QACKS] = CNTR_ELEM("RcQacks", 0, 0, CNTR_NORMAL,
+                               access_sw_cpu_rc_qacks),
+[C_SW_CPU_RC_DELAYED_COMP] = CNTR_ELEM("RcDelayComp", 0, 0, CNTR_NORMAL,
+                                      access_sw_cpu_rc_delayed_comp),
+[OVR_LBL(0)] = OVR_ELM(0), [OVR_LBL(1)] = OVR_ELM(1),
+[OVR_LBL(2)] = OVR_ELM(2), [OVR_LBL(3)] = OVR_ELM(3),
+[OVR_LBL(4)] = OVR_ELM(4), [OVR_LBL(5)] = OVR_ELM(5),
+[OVR_LBL(6)] = OVR_ELM(6), [OVR_LBL(7)] = OVR_ELM(7),
+[OVR_LBL(8)] = OVR_ELM(8), [OVR_LBL(9)] = OVR_ELM(9),
+[OVR_LBL(10)] = OVR_ELM(10), [OVR_LBL(11)] = OVR_ELM(11),
+[OVR_LBL(12)] = OVR_ELM(12), [OVR_LBL(13)] = OVR_ELM(13),
+[OVR_LBL(14)] = OVR_ELM(14), [OVR_LBL(15)] = OVR_ELM(15),
+[OVR_LBL(16)] = OVR_ELM(16), [OVR_LBL(17)] = OVR_ELM(17),
+[OVR_LBL(18)] = OVR_ELM(18), [OVR_LBL(19)] = OVR_ELM(19),
+[OVR_LBL(20)] = OVR_ELM(20), [OVR_LBL(21)] = OVR_ELM(21),
+[OVR_LBL(22)] = OVR_ELM(22), [OVR_LBL(23)] = OVR_ELM(23),
+[OVR_LBL(24)] = OVR_ELM(24), [OVR_LBL(25)] = OVR_ELM(25),
+[OVR_LBL(26)] = OVR_ELM(26), [OVR_LBL(27)] = OVR_ELM(27),
+[OVR_LBL(28)] = OVR_ELM(28), [OVR_LBL(29)] = OVR_ELM(29),
+[OVR_LBL(30)] = OVR_ELM(30), [OVR_LBL(31)] = OVR_ELM(31),
+[OVR_LBL(32)] = OVR_ELM(32), [OVR_LBL(33)] = OVR_ELM(33),
+[OVR_LBL(34)] = OVR_ELM(34), [OVR_LBL(35)] = OVR_ELM(35),
+[OVR_LBL(36)] = OVR_ELM(36), [OVR_LBL(37)] = OVR_ELM(37),
+[OVR_LBL(38)] = OVR_ELM(38), [OVR_LBL(39)] = OVR_ELM(39),
+[OVR_LBL(40)] = OVR_ELM(40), [OVR_LBL(41)] = OVR_ELM(41),
+[OVR_LBL(42)] = OVR_ELM(42), [OVR_LBL(43)] = OVR_ELM(43),
+[OVR_LBL(44)] = OVR_ELM(44), [OVR_LBL(45)] = OVR_ELM(45),
+[OVR_LBL(46)] = OVR_ELM(46), [OVR_LBL(47)] = OVR_ELM(47),
+[OVR_LBL(48)] = OVR_ELM(48), [OVR_LBL(49)] = OVR_ELM(49),
+[OVR_LBL(50)] = OVR_ELM(50), [OVR_LBL(51)] = OVR_ELM(51),
+[OVR_LBL(52)] = OVR_ELM(52), [OVR_LBL(53)] = OVR_ELM(53),
+[OVR_LBL(54)] = OVR_ELM(54), [OVR_LBL(55)] = OVR_ELM(55),
+[OVR_LBL(56)] = OVR_ELM(56), [OVR_LBL(57)] = OVR_ELM(57),
+[OVR_LBL(58)] = OVR_ELM(58), [OVR_LBL(59)] = OVR_ELM(59),
+[OVR_LBL(60)] = OVR_ELM(60), [OVR_LBL(61)] = OVR_ELM(61),
+[OVR_LBL(62)] = OVR_ELM(62), [OVR_LBL(63)] = OVR_ELM(63),
+[OVR_LBL(64)] = OVR_ELM(64), [OVR_LBL(65)] = OVR_ELM(65),
+[OVR_LBL(66)] = OVR_ELM(66), [OVR_LBL(67)] = OVR_ELM(67),
+[OVR_LBL(68)] = OVR_ELM(68), [OVR_LBL(69)] = OVR_ELM(69),
+[OVR_LBL(70)] = OVR_ELM(70), [OVR_LBL(71)] = OVR_ELM(71),
+[OVR_LBL(72)] = OVR_ELM(72), [OVR_LBL(73)] = OVR_ELM(73),
+[OVR_LBL(74)] = OVR_ELM(74), [OVR_LBL(75)] = OVR_ELM(75),
+[OVR_LBL(76)] = OVR_ELM(76), [OVR_LBL(77)] = OVR_ELM(77),
+[OVR_LBL(78)] = OVR_ELM(78), [OVR_LBL(79)] = OVR_ELM(79),
+[OVR_LBL(80)] = OVR_ELM(80), [OVR_LBL(81)] = OVR_ELM(81),
+[OVR_LBL(82)] = OVR_ELM(82), [OVR_LBL(83)] = OVR_ELM(83),
+[OVR_LBL(84)] = OVR_ELM(84), [OVR_LBL(85)] = OVR_ELM(85),
+[OVR_LBL(86)] = OVR_ELM(86), [OVR_LBL(87)] = OVR_ELM(87),
+[OVR_LBL(88)] = OVR_ELM(88), [OVR_LBL(89)] = OVR_ELM(89),
+[OVR_LBL(90)] = OVR_ELM(90), [OVR_LBL(91)] = OVR_ELM(91),
+[OVR_LBL(92)] = OVR_ELM(92), [OVR_LBL(93)] = OVR_ELM(93),
+[OVR_LBL(94)] = OVR_ELM(94), [OVR_LBL(95)] = OVR_ELM(95),
+[OVR_LBL(96)] = OVR_ELM(96), [OVR_LBL(97)] = OVR_ELM(97),
+[OVR_LBL(98)] = OVR_ELM(98), [OVR_LBL(99)] = OVR_ELM(99),
+[OVR_LBL(100)] = OVR_ELM(100), [OVR_LBL(101)] = OVR_ELM(101),
+[OVR_LBL(102)] = OVR_ELM(102), [OVR_LBL(103)] = OVR_ELM(103),
+[OVR_LBL(104)] = OVR_ELM(104), [OVR_LBL(105)] = OVR_ELM(105),
+[OVR_LBL(106)] = OVR_ELM(106), [OVR_LBL(107)] = OVR_ELM(107),
+[OVR_LBL(108)] = OVR_ELM(108), [OVR_LBL(109)] = OVR_ELM(109),
+[OVR_LBL(110)] = OVR_ELM(110), [OVR_LBL(111)] = OVR_ELM(111),
+[OVR_LBL(112)] = OVR_ELM(112), [OVR_LBL(113)] = OVR_ELM(113),
+[OVR_LBL(114)] = OVR_ELM(114), [OVR_LBL(115)] = OVR_ELM(115),
+[OVR_LBL(116)] = OVR_ELM(116), [OVR_LBL(117)] = OVR_ELM(117),
+[OVR_LBL(118)] = OVR_ELM(118), [OVR_LBL(119)] = OVR_ELM(119),
+[OVR_LBL(120)] = OVR_ELM(120), [OVR_LBL(121)] = OVR_ELM(121),
+[OVR_LBL(122)] = OVR_ELM(122), [OVR_LBL(123)] = OVR_ELM(123),
+[OVR_LBL(124)] = OVR_ELM(124), [OVR_LBL(125)] = OVR_ELM(125),
+[OVR_LBL(126)] = OVR_ELM(126), [OVR_LBL(127)] = OVR_ELM(127),
+[OVR_LBL(128)] = OVR_ELM(128), [OVR_LBL(129)] = OVR_ELM(129),
+[OVR_LBL(130)] = OVR_ELM(130), [OVR_LBL(131)] = OVR_ELM(131),
+[OVR_LBL(132)] = OVR_ELM(132), [OVR_LBL(133)] = OVR_ELM(133),
+[OVR_LBL(134)] = OVR_ELM(134), [OVR_LBL(135)] = OVR_ELM(135),
+[OVR_LBL(136)] = OVR_ELM(136), [OVR_LBL(137)] = OVR_ELM(137),
+[OVR_LBL(138)] = OVR_ELM(138), [OVR_LBL(139)] = OVR_ELM(139),
+[OVR_LBL(140)] = OVR_ELM(140), [OVR_LBL(141)] = OVR_ELM(141),
+[OVR_LBL(142)] = OVR_ELM(142), [OVR_LBL(143)] = OVR_ELM(143),
+[OVR_LBL(144)] = OVR_ELM(144), [OVR_LBL(145)] = OVR_ELM(145),
+[OVR_LBL(146)] = OVR_ELM(146), [OVR_LBL(147)] = OVR_ELM(147),
+[OVR_LBL(148)] = OVR_ELM(148), [OVR_LBL(149)] = OVR_ELM(149),
+[OVR_LBL(150)] = OVR_ELM(150), [OVR_LBL(151)] = OVR_ELM(151),
+[OVR_LBL(152)] = OVR_ELM(152), [OVR_LBL(153)] = OVR_ELM(153),
+[OVR_LBL(154)] = OVR_ELM(154), [OVR_LBL(155)] = OVR_ELM(155),
+[OVR_LBL(156)] = OVR_ELM(156), [OVR_LBL(157)] = OVR_ELM(157),
+[OVR_LBL(158)] = OVR_ELM(158), [OVR_LBL(159)] = OVR_ELM(159),
+};
+
+/* ======================================================================== */
+
+/*
+ * Return non-zero if this is chip revision A, i.e. the upper nibble of the
+ * minor revision field extracted from dd->revision is 0.
+ */
+int is_ax(struct hfi1_devdata *dd)
+{
+       u8 minor = (dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT) &
+                  CCE_REVISION_CHIP_REV_MINOR_MASK;
+
+       return (minor >> 4) == 0;
+}
+
+/*
+ * Return non-zero if this is chip revision B, i.e. the upper nibble of the
+ * minor revision field extracted from dd->revision is 1.
+ */
+int is_bx(struct hfi1_devdata *dd)
+{
+       u8 minor = (dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT) &
+                  CCE_REVISION_CHIP_REV_MINOR_MASK;
+
+       return (minor >> 4) == 1;
+}
+
+/*
+ * Append string s to buffer buf.  *curp is the current write position
+ * inside buf and *lenp is the number of bytes still available; both are
+ * updated on return, even when the string only partially fit.  No
+ * terminating nul is written.
+ *
+ * return 0 on success, 1 on out of room
+ */
+static int append_str(char *buf, char **curp, int *lenp, const char *s)
+{
+       char *out = *curp;
+       int room = *lenp;
+       int full = 0;
+
+       /* anything that is not first in the buffer is preceded by a comma */
+       if (out != buf) {
+               if (room == 0) {
+                       full = 1;
+               } else {
+                       *out++ = ',';
+                       room--;
+               }
+       }
+
+       /* copy characters until the string ends or the room runs out */
+       for (; !full && *s != 0; s++) {
+               if (room == 0) {
+                       full = 1;
+                       break;
+               }
+               *out++ = *s;
+               room--;
+       }
+
+       /* hand the updated position and remaining length back */
+       *curp = out;
+       *lenp = room;
+
+       return full;
+}
+
+/*
+ * Using the given flag table, print a comma separated list of the names of
+ * the bits set in flags into buf.  Bits not covered by the table are
+ * reported as a trailing "bits 0x..." entry.  The result ends in '*' if
+ * the buffer is too short.
+ *
+ * Returns buf, or an empty string literal when buf_len is less than 2.
+ */
+static char *flag_string(char *buf, int buf_len, u64 flags,
+                        struct flag_table *table, int table_size)
+{
+       char unknown[32];
+       char *cur = buf;
+       int room = buf_len;
+       int truncated = 0;
+       int idx;
+
+       /* need at least 2 bytes so "*" plus a nul can always be formed */
+       if (room < 2)
+               return "";
+
+       room--; /* reserve the final byte for the nul */
+       for (idx = 0; idx < table_size && !truncated; idx++) {
+               if (!(flags & table[idx].flag))
+                       continue;
+               truncated = append_str(buf, &cur, &room, table[idx].str);
+               /* only bits that were fully reported count as consumed */
+               if (!truncated)
+                       flags &= ~table[idx].flag;
+       }
+
+       /* report whatever bits the table did not describe */
+       if (!truncated && flags) {
+               snprintf(unknown, sizeof(unknown), "bits 0x%llx", flags);
+               truncated = append_str(buf, &cur, &room, unknown);
+       }
+
+       if (truncated) {
+               /* with no room left, overwrite the last byte with the '*' */
+               if (room == 0)
+                       cur--;
+               *cur++ = '*';
+       }
+
+       /* terminate - the byte was reserved above */
+       *cur = 0;
+       return buf;
+}
+
+/* first 8 CCE error interrupt source names */
+static const char * const cce_misc_names[] = {
+       "CceErrInt",            /* 0 */
+       "RxeErrInt",            /* 1 */
+       "MiscErrInt",           /* 2 */
+       "Reserved3",            /* 3 */
+       "PioErrInt",            /* 4 */
+       "SDmaErrInt",           /* 5 */
+       "EgressErrInt",         /* 6 */
+       "TxeErrInt"             /* 7 */
+};
+
+/*
+ * Return the miscellaneous error interrupt name.
+ *
+ * The name for source is written into buf (at most bsize bytes) and buf
+ * is returned.  Sources covered by cce_misc_names[] use the table entry;
+ * any other source is reported as "Reserved<n>" with n being
+ * source + IS_GENERAL_ERR_START.
+ */
+static char *is_misc_err_name(char *buf, size_t bsize, unsigned int source)
+{
+       /*
+        * Both branches go through snprintf() so the result is nul
+        * terminated even when the name does not fit; strncpy() would
+        * leave buf unterminated in that case.
+        */
+       if (source < ARRAY_SIZE(cce_misc_names))
+               snprintf(buf, bsize, "%s", cce_misc_names[source]);
+       else
+               snprintf(buf, bsize, "Reserved%u",
+                        source + IS_GENERAL_ERR_START);
+
+       return buf;
+}
+
+/*
+ * Return the SDMA engine error interrupt name.
+ *
+ * Formats "SDmaEngErrInt" followed by the decimal value of source into
+ * buf, limited to bsize bytes by snprintf(), and returns buf.
+ */
+static char *is_sdma_eng_err_name(char *buf, size_t bsize, unsigned int source)
+{
+       snprintf(buf, bsize, "SDmaEngErrInt%u", source);
+       return buf;
+}
+
+/*
+ * Return the send context error interrupt name.
+ *
+ * Formats "SendCtxtErrInt" followed by the decimal value of source into
+ * buf, limited to bsize bytes by snprintf(), and returns buf.
+ */
+static char *is_sendctxt_err_name(char *buf, size_t bsize, unsigned int source)
+{
+       snprintf(buf, bsize, "SendCtxtErrInt%u", source);
+       return buf;
+}
+
+/* names of the "various" interrupt sources, indexed by source */
+static const char * const various_names[] = {
+       [0] = "PbcInt",
+       [1] = "GpioAssertInt",
+       [2] = "Qsfp1Int",
+       [3] = "Qsfp2Int",
+       [4] = "TCritInt",
+};
+
+/*
+ * Return the various interrupt name.
+ *
+ * The name for source is written into buf (at most bsize bytes) and buf
+ * is returned.  Sources covered by various_names[] use the table entry;
+ * any other source is reported as "Reserved<n>" with n being
+ * source + IS_VARIOUS_START.
+ */
+static char *is_various_name(char *buf, size_t bsize, unsigned int source)
+{
+       /*
+        * Use snprintf() rather than strncpy() for the table entry so buf
+        * is nul terminated even when the name is truncated.
+        */
+       if (source < ARRAY_SIZE(various_names))
+               snprintf(buf, bsize, "%s", various_names[source]);
+       else
+               snprintf(buf, bsize, "Reserved%u", source + IS_VARIOUS_START);
+       return buf;
+}
+
+/*
+ * Return the DC interrupt name.
+ *
+ * Known sources are formatted as "dc_<block>_int"; anything else falls
+ * back to "DCInt<source>".  The result is written into buf (at most bsize
+ * bytes) and buf is returned.
+ */
+static char *is_dc_name(char *buf, size_t bsize, unsigned int source)
+{
+       static const char * const dc_int_names[] = {
+               "common",
+               "lcb",
+               "8051",
+               "lbm"   /* local block merge */
+       };
+
+       /* sources past the end of the table have no block name */
+       if (source >= ARRAY_SIZE(dc_int_names)) {
+               snprintf(buf, bsize, "DCInt%u", source);
+               return buf;
+       }
+
+       snprintf(buf, bsize, "dc_%s_int", dc_int_names[source]);
+       return buf;
+}
+
+/* SDMA engine interrupt name prefixes, indexed by interrupt type */
+static const char * const sdma_int_names[] = {
+       [0] = "SDmaInt",
+       [1] = "SdmaIdleInt",
+       [2] = "SdmaProgressInt",
+};
+
+/*
+ * Return the SDMA engine interrupt name.
+ *
+ * source is split into an interrupt type (source / TXE_NUM_SDMA_ENGINES)
+ * and an engine number (source % TXE_NUM_SDMA_ENGINES).  The name is the
+ * sdma_int_names[] entry for the type followed by the engine number; a
+ * type with no table entry is reported as an invalid SDMA interrupt.
+ * The result is written into buf (at most bsize bytes) and buf is returned.
+ */
+static char *is_sdma_eng_name(char *buf, size_t bsize, unsigned int source)
+{
+       /* what interrupt */
+       unsigned int what  = source / TXE_NUM_SDMA_ENGINES;
+       /* which engine */
+       unsigned int which = source % TXE_NUM_SDMA_ENGINES;
+
+       /* bound by the table itself so the check cannot drift from it */
+       if (likely(what < ARRAY_SIZE(sdma_int_names)))
+               snprintf(buf, bsize, "%s%u", sdma_int_names[what], which);
+       else
+               snprintf(buf, bsize, "Invalid SDMA interrupt %u", source);
+       return buf;
+}
+
+/*
+ * Return the receive available interrupt name.
+ *
+ * Formats "RcvAvailInt" followed by the decimal value of source into buf,
+ * limited to bsize bytes by snprintf(), and returns buf.
+ */
+static char *is_rcv_avail_name(char *buf, size_t bsize, unsigned int source)
+{
+       snprintf(buf, bsize, "RcvAvailInt%u", source);
+       return buf;
+}
+
+/*
+ * Return the receive urgent interrupt name.
+ *
+ * Formats "RcvUrgentInt" followed by the decimal value of source into buf,
+ * limited to bsize bytes by snprintf(), and returns buf.
+ */
+static char *is_rcv_urgent_name(char *buf, size_t bsize, unsigned int source)
+{
+       snprintf(buf, bsize, "RcvUrgentInt%u", source);
+       return buf;
+}
+
+/*
+ * Return the send credit interrupt name.
+ *
+ * Formats "SendCreditInt" followed by the decimal value of source into
+ * buf, limited to bsize bytes by snprintf(), and returns buf.
+ */
+static char *is_send_credit_name(char *buf, size_t bsize, unsigned int source)
+{
+       snprintf(buf, bsize, "SendCreditInt%u", source);
+       return buf;
+}
+
+/*
+ * Return the reserved interrupt name.
+ *
+ * Formats "Reserved" followed by the decimal value of
+ * source + IS_RESERVED_START into buf, limited to bsize bytes by
+ * snprintf(), and returns buf.
+ */
+static char *is_reserved_name(char *buf, size_t bsize, unsigned int source)
+{
+       snprintf(buf, bsize, "Reserved%u", source + IS_RESERVED_START);
+       return buf;
+}
+
+/* Decode flags against cce_err_status_flags into buf via flag_string(). */
+static char *cce_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       const int nflags = ARRAY_SIZE(cce_err_status_flags);
+
+       return flag_string(buf, buf_len, flags, cce_err_status_flags, nflags);
+}
+
+/* Decode flags against rxe_err_status_flags into buf via flag_string(). */
+static char *rxe_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       const int nflags = ARRAY_SIZE(rxe_err_status_flags);
+
+       return flag_string(buf, buf_len, flags, rxe_err_status_flags, nflags);
+}
+
+/* Decode flags against misc_err_status_flags into buf via flag_string(). */
+static char *misc_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       const int nflags = ARRAY_SIZE(misc_err_status_flags);
+
+       return flag_string(buf, buf_len, flags, misc_err_status_flags,
+                          nflags);
+}
+
+/* Decode flags against pio_err_status_flags into buf via flag_string(). */
+static char *pio_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       const int nflags = ARRAY_SIZE(pio_err_status_flags);
+
+       return flag_string(buf, buf_len, flags, pio_err_status_flags, nflags);
+}
+
+/* Decode flags against sdma_err_status_flags into buf via flag_string(). */
+static char *sdma_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       const int nflags = ARRAY_SIZE(sdma_err_status_flags);
+
+       return flag_string(buf, buf_len, flags, sdma_err_status_flags,
+                          nflags);
+}
+
+/* Decode flags against egress_err_status_flags into buf via flag_string(). */
+static char *egress_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       const int nflags = ARRAY_SIZE(egress_err_status_flags);
+
+       return flag_string(buf, buf_len, flags, egress_err_status_flags,
+                          nflags);
+}
+
+/* Decode flags against egress_err_info_flags into buf via flag_string(). */
+static char *egress_err_info_string(char *buf, int buf_len, u64 flags)
+{
+       const int nflags = ARRAY_SIZE(egress_err_info_flags);
+
+       return flag_string(buf, buf_len, flags, egress_err_info_flags,
+                          nflags);
+}
+
+/* Decode flags against send_err_status_flags into buf via flag_string(). */
+static char *send_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       const int nflags = ARRAY_SIZE(send_err_status_flags);
+
+       return flag_string(buf, buf_len, flags, send_err_status_flags,
+                          nflags);
+}
+
+/*
+ * Handle a CCE error: log the decoded status bits in reg, start freeze
+ * handling for the CLI2 async FIFO parity error where required, and bump
+ * the per-bit and aggregate software counters.
+ */
+static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       char msg[96];
+       int bit;
+
+       /*
+        * For most these errors, there is nothing that can be done except
+        * report or record it.
+        */
+       dd_dev_info(dd, "CCE Error: %s\n",
+                   cce_err_status_string(msg, sizeof(msg), reg));
+
+       /*
+        * This error requires a manual drop into SPC freeze mode, then a
+        * fix up.
+        */
+       if ((reg & CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK) &&
+           is_ax(dd) && (dd->icode != ICODE_FUNCTIONAL_SIMULATOR))
+               start_freeze_handling(dd->pport, FREEZE_SELF);
+
+       for (bit = 0; bit < NUM_CCE_ERR_STATUS_COUNTERS; bit++) {
+               if (!(reg & (1ull << bit)))
+                       continue;
+               incr_cntr64(&dd->cce_err_status_cnt[bit]);
+               /* maintain a counter over all cce_err_status errors */
+               incr_cntr64(&dd->sw_cce_err_status_aggregate);
+       }
+}
+
+/*
+ * Timer callback that checks counters for receive errors that do not have
+ * an interrupt associated with them.
+ *
+ * If the receive overflow counter grew since the last run and the port
+ * error action includes excessive buffer overrun, a link bounce is queued.
+ * The timer re-arms itself every HZ * RCVERR_CHECK_TIME jiffies.
+ */
+#define RCVERR_CHECK_TIME 10
+static void update_rcverr_timer(unsigned long opaque)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)opaque;
+       struct hfi1_pportdata *ppd = dd->pport;
+       u32 ovfl_now = read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL);
+
+       if (dd->rcv_ovfl_cnt < ovfl_now &&
+           (ppd->port_error_action & OPA_PI_MASK_EX_BUFFER_OVERRUN)) {
+               dd_dev_info(dd, "%s: PortErrorAction bounce\n", __func__);
+               set_link_down_reason(ppd,
+                                    OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN,
+                                    0,
+                                    OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN);
+               queue_work(ppd->hfi1_wq, &ppd->link_bounce_work);
+       }
+
+       /* remember the count for the next comparison */
+       dd->rcv_ovfl_cnt = ovfl_now;
+
+       mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME);
+}
+
+/*
+ * Set up the receive error polling timer with dd as its argument and arm
+ * it for HZ * RCVERR_CHECK_TIME jiffies from now.  Returns the value of
+ * mod_timer().
+ */
+static int init_rcverr(struct hfi1_devdata *dd)
+{
+       unsigned long data = (unsigned long)dd;
+
+       setup_timer(&dd->rcverr_timer, update_rcverr_timer, data);
+
+       /* Assume the hardware counter has been reset */
+       dd->rcv_ovfl_cnt = 0;
+
+       return mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME);
+}
+
+/*
+ * Stop the receive error polling timer.  The timer is only deleted when
+ * its data field is non-zero; the field is cleared afterwards either way.
+ */
+static void free_rcverr(struct hfi1_devdata *dd)
+{
+       /* presumably non-zero only after init_rcverr() set the timer up */
+       if (dd->rcverr_timer.data != 0)
+               del_timer_sync(&dd->rcverr_timer);
+
+       dd->rcverr_timer.data = 0;
+}
+
+/*
+ * Handle a receive (RXE) error: log the decoded status bits in reg, start
+ * freeze handling when any freeze error is set, and bump the per-bit
+ * software counters.
+ */
+static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       char msg[96];
+       int bit;
+
+       dd_dev_info(dd, "Receive Error: %s\n",
+                   rxe_err_status_string(msg, sizeof(msg), reg));
+
+       if (reg & ALL_RXE_FREEZE_ERR) {
+               /*
+                * Freeze mode recovery is disabled for the errors
+                * in RXE_FREEZE_ABORT_MASK
+                */
+               int abort = is_ax(dd) && (reg & RXE_FREEZE_ABORT_MASK);
+
+               start_freeze_handling(dd->pport, abort ? FREEZE_ABORT : 0);
+       }
+
+       for (bit = 0; bit < NUM_RCV_ERR_STATUS_COUNTERS; bit++) {
+               if (!(reg & (1ull << bit)))
+                       continue;
+               incr_cntr64(&dd->rcv_err_status_cnt[bit]);
+       }
+}
+
+/*
+ * Handle a misc error: log the decoded status bits in reg and bump the
+ * per-bit software counters.
+ */
+static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       char buf[96];
+       int i;
+
+       /* terminate the message with a newline like the other handlers */
+       dd_dev_info(dd, "Misc Error: %s\n",
+                   misc_err_status_string(buf, sizeof(buf), reg));
+       for (i = 0; i < NUM_MISC_ERR_STATUS_COUNTERS; i++) {
+               if (reg & (1ull << i))
+                       incr_cntr64(&dd->misc_err_status_cnt[i]);
+       }
+}
+
+/*
+ * Handle a PIO error: log the decoded status bits in reg, start freeze
+ * handling when any PIO freeze error is set, and bump the per-bit
+ * software counters.
+ */
+static void handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       char msg[96];
+       int bit;
+
+       dd_dev_info(dd, "PIO Error: %s\n",
+                   pio_err_status_string(msg, sizeof(msg), reg));
+
+       if (reg & ALL_PIO_FREEZE_ERR)
+               start_freeze_handling(dd->pport, 0);
+
+       for (bit = 0; bit < NUM_SEND_PIO_ERR_STATUS_COUNTERS; bit++) {
+               if (!(reg & (1ull << bit)))
+                       continue;
+               incr_cntr64(&dd->send_pio_err_status_cnt[bit]);
+       }
+}
+
+/*
+ * Handle an SDMA error: log the decoded status bits in reg, start freeze
+ * handling when any SDMA freeze error is set, and bump the per-bit
+ * software counters.
+ */
+static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       char msg[96];
+       int bit;
+
+       dd_dev_info(dd, "SDMA Error: %s\n",
+                   sdma_err_status_string(msg, sizeof(msg), reg));
+
+       if (reg & ALL_SDMA_FREEZE_ERR)
+               start_freeze_handling(dd->pport, 0);
+
+       for (bit = 0; bit < NUM_SEND_DMA_ERR_STATUS_COUNTERS; bit++) {
+               if (!(reg & (1ull << bit)))
+                       continue;
+               incr_cntr64(&dd->send_dma_err_status_cnt[bit]);
+       }
+}
+
+/* Record a transmit discard against the port's port_xmit_discards counter. */
+static inline void __count_port_discards(struct hfi1_pportdata *ppd)
+{
+       incr_cntr64(&ppd->port_xmit_discards);
+}
+
+static void count_port_inactive(struct hfi1_devdata *dd)
+{
+       __count_port_discards(dd->pport);
+}
+
+/*
+ * We have had a "disallowed packet" error during egress. Determine the
+ * integrity check which failed, and update relevant error counter, etc.
+ *
+ * Note that the SEND_EGRESS_ERR_INFO register has only a single
+ * bit of state per integrity check, and so we can miss the reason for an
+ * egress error if more than one packet fails the same integrity check
+ * since we cleared the corresponding bit in SEND_EGRESS_ERR_INFO.
+ *
+ * vl is the VL to charge the discards to; values outside the data VL
+ * range other than 15 only update the port-wide counter.
+ */
+static void handle_send_egress_err_info(struct hfi1_devdata *dd,
+                                       int vl)
+{
+       struct hfi1_pportdata *ppd = dd->pport;
+       u64 src = read_csr(dd, SEND_EGRESS_ERR_SOURCE); /* read first */
+       u64 info = read_csr(dd, SEND_EGRESS_ERR_INFO);
+       char buf[96];
+       int remaining;
+
+       /* clear down all observed info as quickly as possible after read */
+       write_csr(dd, SEND_EGRESS_ERR_INFO, info);
+
+       dd_dev_info(dd,
+                   "Egress Error Info: 0x%llx, %s Egress Error Src 0x%llx\n",
+                   info, egress_err_info_string(buf, sizeof(buf), info), src);
+
+       /* Eventually add other counters for each bit */
+       if (!(info & PORT_DISCARD_EGRESS_ERRS))
+               return;
+
+       /*
+        * Count all applicable bits as individual errors and attribute
+        * them to the packet that triggered this handler.  This may not
+        * be completely accurate due to limitations on the available
+        * hardware error information.  There is a single information
+        * register and any number of error packets may have occurred and
+        * contributed to it before this routine is called.  This means
+        * that:
+        * a) If multiple packets with the same error occur before this
+        *    routine is called, earlier packets are missed.  There is
+        *    only a single bit for each error type.
+        * b) Errors may not be attributed to the correct VL.  The driver
+        *    is attributing all bits in the info register to the packet
+        *    that triggered this call, but bits could be an accumulation
+        *    of different packets with different VLs.
+        * c) A single error packet may have multiple counts attached to
+        *    it.  There is no way for the driver to know if multiple bits
+        *    set in the info register are due to a single packet or
+        *    multiple packets.  The driver assumes multiple packets.
+        */
+       for (remaining = hweight64(info & PORT_DISCARD_EGRESS_ERRS);
+            remaining > 0; remaining--) {
+               __count_port_discards(ppd);
+               if (vl >= 0 && vl < TXE_NUM_DATA_VL)
+                       incr_cntr64(&ppd->port_xmit_discards_vl[vl]);
+               else if (vl == 15)
+                       incr_cntr64(&ppd->port_xmit_discards_vl[C_VL_15]);
+       }
+}
+
+/*
+ * Input value is a bit position within the SEND_EGRESS_ERR_STATUS
+ * register. Does it represent a 'port inactive' error?  That is the case
+ * for positions from SEES(TX_LINKDOWN) through
+ * SEES(TX_INCORRECT_LINK_STATE), inclusive.
+ */
+static inline int port_inactive_err(u64 posn)
+{
+       if (posn < SEES(TX_LINKDOWN))
+               return 0;
+
+       return posn <= SEES(TX_INCORRECT_LINK_STATE);
+}
+
+/*
+ * Input value is a bit position within the SEND_EGRESS_ERR_STATUS
+ * register. Does it represent a 'disallowed packet' error?  That is the
+ * case for positions from SEES(TX_SDMA0_DISALLOWED_PACKET) through
+ * SEES(TX_SDMA15_DISALLOWED_PACKET), inclusive.
+ */
+static inline int disallowed_pkt_err(int posn)
+{
+       if (posn < SEES(TX_SDMA0_DISALLOWED_PACKET))
+               return 0;
+
+       return posn <= SEES(TX_SDMA15_DISALLOWED_PACKET);
+}
+
+/*
+ * Input value is a bit position of one of the SDMA engine disallowed
+ * packet errors.  Return which engine.  Use of this must be guarded by
+ * disallowed_pkt_err().
+ *
+ * The engine is computed as the offset of posn from
+ * SEES(TX_SDMA0_DISALLOWED_PACKET); no range check is done here.
+ */
+static inline int disallowed_pkt_engine(int posn)
+{
+       return posn - SEES(TX_SDMA0_DISALLOWED_PACKET);
+}
+
+/*
+ * Translate an SDMA engine to a VL.  Return -1 if the translation cannot
+ * be done, which here means engine is outside
+ * [0, TXE_NUM_SDMA_ENGINES).
+ *
+ * The lookup reads dd->sdma_map under rcu_read_lock().
+ */
+static int engine_to_vl(struct hfi1_devdata *dd, int engine)
+{
+       int vl = -1;
+
+       if (engine >= 0 && engine < TXE_NUM_SDMA_ENGINES) {
+               rcu_read_lock();
+               vl = rcu_dereference(dd->sdma_map)->engine_to_vl[engine];
+               rcu_read_unlock();
+       }
+
+       return vl;
+}
+
+/*
+ * Translate the send context (software index) into a VL.  Return -1 if the
+ * translation cannot be done.
+ *
+ * sw_index is used to index dd->send_contexts[] without a range check.
+ */
+static int sc_to_vl(struct hfi1_devdata *dd, int sw_index)
+{
+       struct send_context_info *sci = &dd->send_contexts[sw_index];
+       struct send_context *sc;
+       int vl;
+
+       /* there is no information for user (PSM) and ack contexts */
+       if (!(sci->type == SC_KERNEL || sci->type == SC_VL15))
+               return -1;
+
+       sc = sci->sc;
+       if (!sc)
+               return -1;
+
+       /* VL15 is checked first, ahead of the data VLs */
+       if (sc == dd->vld[15].sc)
+               return 15;
+
+       for (vl = 0; vl < num_vls; vl++) {
+               if (sc == dd->vld[vl].sc)
+                       return vl;
+       }
+
+       return -1;
+}
+
+/*
+ * Handle an egress error.  Freeze handling is started for freeze errors
+ * (and, on revision A hardware outside the functional simulator, for the
+ * credit return VL error).  Port inactive and disallowed packet bits are
+ * handled individually, from the highest set bit down; whatever remains
+ * is logged and counted per bit.
+ */
+static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       u64 pending = reg;
+       u64 handled = 0;
+       char buf[96];
+       int bit;
+
+       if ((reg & ALL_TXE_EGRESS_FREEZE_ERR) ||
+           (is_ax(dd) &&
+            (reg & SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK) &&
+            (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)))
+               start_freeze_handling(dd->pport, 0);
+
+       while (pending) {
+               /* fls64() returns a 1-based offset, we want it zero based */
+               int shift = fls64(pending) - 1;
+               u64 mask = 1ULL << shift;
+
+               pending &= ~mask;
+
+               if (port_inactive_err(shift)) {
+                       count_port_inactive(dd);
+               } else if (disallowed_pkt_err(shift)) {
+                       int engine = disallowed_pkt_engine(shift);
+
+                       handle_send_egress_err_info(dd,
+                                                   engine_to_vl(dd, engine));
+               } else {
+                       /* not handled here; leave it for the report below */
+                       continue;
+               }
+               handled |= mask;
+       }
+
+       /* only report and count the bits that were not handled above */
+       reg &= ~handled;
+
+       if (reg)
+               dd_dev_info(dd, "Egress Error: %s\n",
+                           egress_err_status_string(buf, sizeof(buf), reg));
+
+       for (bit = 0; bit < NUM_SEND_EGRESS_ERR_STATUS_COUNTERS; bit++) {
+               if (!(reg & (1ull << bit)))
+                       continue;
+               incr_cntr64(&dd->send_egress_err_status_cnt[bit]);
+       }
+}
+
+/*
+ * Handle a send (TXE) error: log the decoded status bits in reg and bump
+ * the per-bit software counters.
+ */
+static void handle_txe_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       char msg[96];
+       int bit;
+
+       dd_dev_info(dd, "Send Error: %s\n",
+                   send_err_status_string(msg, sizeof(msg), reg));
+
+       for (bit = 0; bit < NUM_SEND_ERR_STATUS_COUNTERS; bit++) {
+               if (!(reg & (1ull << bit)))
+                       continue;
+               incr_cntr64(&dd->send_err_status_cnt[bit]);
+       }
+}
+
+/*
+ * The maximum number of times the error clear down will loop before
+ * blocking a repeating error.  This value is arbitrary.
+ */
+#define MAX_CLEAR_COUNT 20
+
+/*
+ * Clear and handle an error register.  All error interrupts are funneled
+ * through here to have a central location to correctly handle single-
+ * or multi-shot errors.
+ *
+ * For non per-context registers, call this routine with a context value
+ * of 0 so the per-context offset is zero.
+ *
+ * If the handler loops too many times, assume that something is wrong
+ * and can't be fixed, so mask the error bits.
+ *
+ * eri supplies the status, clear and mask CSRs, the optional handler
+ * called with each non-zero status value, and the description used in
+ * the "Repeating ... - masking" message.  Each pass writes the value just
+ * read from the status CSR to the clear CSR before the handler runs.
+ * Once more than MAX_CLEAR_COUNT passes have been made, the bits of the
+ * last status value are removed from the mask CSR and the loop ends.
+ */
+static void interrupt_clear_down(struct hfi1_devdata *dd,
+                                u32 context,
+                                const struct err_reg_info *eri)
+{
+       u64 reg;
+       u32 count;
+
+       /* read in a loop until no more errors are seen */
+       count = 0;
+       while (1) {
+               reg = read_kctxt_csr(dd, context, eri->status);
+               if (reg == 0)
+                       break;
+               write_kctxt_csr(dd, context, eri->clear, reg);
+               if (likely(eri->handler))
+                       eri->handler(dd, context, reg);
+               count++;
+               if (count > MAX_CLEAR_COUNT) {
+                       u64 mask;
+
+                       dd_dev_err(dd, "Repeating %s bits 0x%llx - masking\n",
+                                  eri->desc, reg);
+                       /*
+                        * Read-modify-write so any other masked bits
+                        * remain masked.
+                        */
+                       mask = read_kctxt_csr(dd, context, eri->mask);
+                       mask &= ~reg;
+                       write_kctxt_csr(dd, context, eri->mask, mask);
+                       break;
+               }
+       }
+}
+
+/*
+ * CCE block "misc" interrupt.  Source is < 16.
+ *
+ * Sources with a handler in misc_errs[] are run through
+ * interrupt_clear_down(); any other source is reported as unexpected.
+ */
+static void is_misc_err_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       const struct err_reg_info *eri = &misc_errs[source];
+
+       if (!eri->handler) {
+               dd_dev_err(dd, "Unexpected misc interrupt (%u) - reserved\n",
+                          source);
+               return;
+       }
+
+       interrupt_clear_down(dd, 0, eri);
+}
+
+/* Decode flags against sc_err_status_flags into buf via flag_string(). */
+static char *send_context_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       const int nflags = ARRAY_SIZE(sc_err_status_flags);
+
+       return flag_string(buf, buf_len, flags, sc_err_status_flags, nflags);
+}
+
+/*
+ * Send context error interrupt.  Source (hw_context) is < 160.
+ *
+ * All send context errors cause the send context to halt.  The normal
+ * clear-down mechanism cannot be used because we cannot clear the
+ * error bits until several other long-running items are done first.
+ * This is OK because with the context halted, nothing else is going
+ * to happen on it anyway.
+ *
+ * The hardware context is translated to a software index through
+ * dd->hw_to_sw[].  The interrupt is logged and dropped if that index is
+ * out of range or if no send context is attached to it.  Otherwise the
+ * context is marked halted, SEND_CTXT_ERR_STATUS is read and logged,
+ * egress error info is handled for a PIO disallowed packet error, and
+ * the per-bit software counters are updated.
+ */
+static void is_sendctxt_err_int(struct hfi1_devdata *dd,
+                               unsigned int hw_context)
+{
+       struct send_context_info *sci;
+       struct send_context *sc;
+       char flags[96];
+       u64 status;
+       u32 sw_index;
+       int i = 0;
+
+       sw_index = dd->hw_to_sw[hw_context];
+       if (sw_index >= dd->num_send_contexts) {
+               dd_dev_err(dd,
+                          "out of range sw index %u for send context %u\n",
+                          sw_index, hw_context);
+               return;
+       }
+       sci = &dd->send_contexts[sw_index];
+       sc = sci->sc;
+       if (!sc) {
+               dd_dev_err(dd, "%s: context %u(%u): no sc?\n", __func__,
+                          sw_index, hw_context);
+               return;
+       }
+
+       /* tell the software that a halt has begun */
+       sc_stop(sc, SCF_HALTED);
+
+       status = read_kctxt_csr(dd, hw_context, SEND_CTXT_ERR_STATUS);
+
+       dd_dev_info(dd, "Send Context %u(%u) Error: %s\n", sw_index, hw_context,
+                   send_context_err_status_string(flags, sizeof(flags),
+                                                  status));
+
+       if (status & SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK)
+               handle_send_egress_err_info(dd, sc_to_vl(dd, sw_index));
+
+       /*
+        * Automatically restart halted kernel contexts out of interrupt
+        * context.  User contexts must ask the driver to restart the context.
+        */
+       if (sc->type != SC_USER)
+               queue_work(dd->pport->hfi1_wq, &sc->halt_work);
+
+       /*
+        * Update the counters for the corresponding status bits.
+        * Note that these particular counters are aggregated over all
+        * 160 contexts.
+        */
+       for (i = 0; i < NUM_SEND_CTXT_ERR_STATUS_COUNTERS; i++) {
+               if (status & (1ull << i))
+                       incr_cntr64(&dd->sw_ctxt_err_status_cnt[i]);
+       }
+}
+
+static void handle_sdma_eng_err(struct hfi1_devdata *dd,
+                               unsigned int source, u64 status)
+{
+       struct sdma_engine *sde;
+       int i = 0;
+
+       sde = &dd->per_sdma[source];
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
+                  slashstrip(__FILE__), __LINE__, __func__);
+       dd_dev_err(sde->dd, "CONFIG SDMA(%u) source: %u status 0x%llx\n",
+                  sde->this_idx, source, (unsigned long long)status);
+#endif
+       sde->err_cnt++;
+       sdma_engine_error(sde, status);
+
+       /*
+       * Update the counters for the corresponding status bits.
+       * Note that these particular counters are aggregated over
+       * all 16 DMA engines.
+       */
+       for (i = 0; i < NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS; i++) {
+               if (status & (1ull << i))
+                       incr_cntr64(&dd->sw_send_dma_eng_err_status_cnt[i]);
+       }
+}
+
+/*
+ * CCE block SDMA error interrupt.  Source is < 16.
+ *
+ * The interrupt is run through interrupt_clear_down() with source as the
+ * context and sdma_eng_err as the register description.  When
+ * CONFIG_SDMA_VERBOSITY is defined, the engine is logged and its state
+ * dumped first.
+ */
+static void is_sdma_eng_err_int(struct hfi1_devdata *dd, unsigned int source)
+{
+#ifdef CONFIG_SDMA_VERBOSITY
+       struct sdma_engine *sde = &dd->per_sdma[source];
+
+       dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
+                  slashstrip(__FILE__), __LINE__, __func__);
+       dd_dev_err(dd, "CONFIG SDMA(%u) source: %u\n", sde->this_idx,
+                  source);
+       sdma_dumpstate(sde);
+#endif
+       interrupt_clear_down(dd, source, &sdma_eng_err);
+}
+
+/*
+ * CCE block "various" interrupt.  Source is < 8.
+ *
+ * @dd: device data
+ * @source: index of the interrupt source in various_err[]
+ *
+ * Sources with a registered handler go through interrupt_clear_down();
+ * sources without one are only reported.
+ */
+static void is_various_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       const struct err_reg_info *eri = &various_err[source];
+
+       /*
+        * TCritInt cannot go through interrupt_clear_down()
+        * because it is not a second tier interrupt. The handler
+        * should be called directly.
+        */
+       if (source == TCRIT_INT_SOURCE)
+               handle_temp_err(dd);
+       else if (eri->handler)
+               interrupt_clear_down(dd, 0, eri);
+       else
+               /* source is unsigned, so print it with %u */
+               dd_dev_info(dd,
+                           "%s: Unimplemented/reserved interrupt %u\n",
+                           __func__, source);
+}
+
+/*
+ * Handle a QSFP interrupt: a module presence change and/or an interrupt
+ * raised by the module itself.
+ *
+ * @dd: device data
+ * @src_ctx: not used by this routine
+ * @reg: status bits; QSFP_HFI0_MODPRST_N and QSFP_HFI0_INT_N are examined
+ *
+ * On a presence change the cached QSFP state is invalidated under
+ * qsfp_lock and the ModPresent inversion is reprogrammed so that the
+ * opposite transition will be detected next.  The QSFP work item is
+ * queued only while a module is present.
+ */
+static void handle_qsfp_int(struct hfi1_devdata *dd, u32 src_ctx, u64 reg)
+{
+       /* src_ctx is always zero */
+       struct hfi1_pportdata *ppd = dd->pport;
+       unsigned long flags;
+       u64 qsfp_int_mgmt = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N);
+
+       if (reg & QSFP_HFI0_MODPRST_N) {
+               if (!qsfp_mod_present(ppd)) {
+                       dd_dev_info(dd, "%s: QSFP module removed\n",
+                                   __func__);
+
+                       ppd->driver_link_ready = 0;
+                       /*
+                        * Cable removed, reset all our information about the
+                        * cache and cable capabilities
+                        */
+
+                       spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+                       /*
+                        * We don't set cache_refresh_required here as we expect
+                        * an interrupt when a cable is inserted
+                        */
+                       ppd->qsfp_info.cache_valid = 0;
+                       ppd->qsfp_info.reset_needed = 0;
+                       ppd->qsfp_info.limiting_active = 0;
+                       spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
+                                              flags);
+                       /* Invert the ModPresent pin now to detect plug-in */
+                       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_INVERT :
+                                 ASIC_QSFP1_INVERT, qsfp_int_mgmt);
+
+                       /*
+                        * Record "media not installed" unless a reason that
+                        * compares lower than it is already recorded.
+                        */
+                       if ((ppd->offline_disabled_reason >
+                         HFI1_ODR_MASK(
+                         OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED)) ||
+                         (ppd->offline_disabled_reason ==
+                         HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE)))
+                               ppd->offline_disabled_reason =
+                               HFI1_ODR_MASK(
+                               OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED);
+
+                       if (ppd->host_link_state == HLS_DN_POLL) {
+                               /*
+                                * The link is still in POLL. This means
+                                * that the normal link down processing
+                                * will not happen. We have to do it here
+                                * before turning the DC off.
+                                */
+                               queue_work(ppd->hfi1_wq, &ppd->link_down_work);
+                       }
+               } else {
+                       dd_dev_info(dd, "%s: QSFP module inserted\n",
+                                   __func__);
+
+                       /* force the cache to be re-read for the new module */
+                       spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+                       ppd->qsfp_info.cache_valid = 0;
+                       ppd->qsfp_info.cache_refresh_required = 1;
+                       spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
+                                              flags);
+
+                       /*
+                        * Stop inversion of ModPresent pin to detect
+                        * removal of the cable
+                        */
+                       qsfp_int_mgmt &= ~(u64)QSFP_HFI0_MODPRST_N;
+                       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_INVERT :
+                                 ASIC_QSFP1_INVERT, qsfp_int_mgmt);
+
+                       ppd->offline_disabled_reason =
+                               HFI1_ODR_MASK(OPA_LINKDOWN_REASON_TRANSIENT);
+               }
+       }
+
+       if (reg & QSFP_HFI0_INT_N) {
+               dd_dev_info(dd, "%s: Interrupt received from QSFP module\n",
+                           __func__);
+               /* flag the interrupt under the lock */
+               spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+               ppd->qsfp_info.check_interrupt_flags = 1;
+               spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags);
+       }
+
+       /* Schedule the QSFP work only if there is a cable attached. */
+       if (qsfp_mod_present(ppd))
+               queue_work(ppd->hfi1_wq, &ppd->qsfp_info.qsfp_work);
+}
+
+/*
+ * Ask the 8051 to hand LCB access to the host.
+ *
+ * Issues HCMD_MISC with the HCMD_MISC_REQUEST_LCB_ACCESS sub-command.
+ *
+ * Returns 0 if the command reports HCMD_SUCCESS; otherwise logs the
+ * command result and returns -EBUSY.
+ */
+static int request_host_lcb_access(struct hfi1_devdata *dd)
+{
+       int status;
+
+       status = do_8051_command(dd, HCMD_MISC,
+                                (u64)HCMD_MISC_REQUEST_LCB_ACCESS <<
+                                LOAD_DATA_FIELD_ID_SHIFT, NULL);
+       if (status == HCMD_SUCCESS)
+               return 0;
+
+       dd_dev_err(dd, "%s: command failed with error %d\n",
+                  __func__, status);
+       return -EBUSY;
+}
+
+/*
+ * Tell the 8051 that it is being granted LCB access.
+ *
+ * Issues HCMD_MISC with the HCMD_MISC_GRANT_LCB_ACCESS sub-command.
+ *
+ * Returns 0 if the command reports HCMD_SUCCESS; otherwise logs the
+ * command result and returns -EBUSY.
+ */
+static int request_8051_lcb_access(struct hfi1_devdata *dd)
+{
+       int status;
+
+       status = do_8051_command(dd, HCMD_MISC,
+                                (u64)HCMD_MISC_GRANT_LCB_ACCESS <<
+                                LOAD_DATA_FIELD_ID_SHIFT, NULL);
+       if (status == HCMD_SUCCESS)
+               return 0;
+
+       dd_dev_err(dd, "%s: command failed with error %d\n",
+                  __func__, status);
+       return -EBUSY;
+}
+
+/*
+ * Point the LCB selector at the host.  The DCC selector bit is written
+ * as set in either mode, i.e. it always points to the host.
+ */
+static inline void set_host_lcb_access(struct hfi1_devdata *dd)
+{
+       u64 sel = DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK |
+                 DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK;
+
+       write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL, sel);
+}
+
+/*
+ * Point the LCB selector at the 8051 by writing the selector register
+ * with only the DCC bit set.  The DCC selector always points to the host.
+ */
+static inline void set_8051_lcb_access(struct hfi1_devdata *dd)
+{
+       u64 sel = DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK;
+
+       write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL, sel);
+}
+
+/*
+ * Acquire LCB access for the host.
+ *
+ * @dd: device data
+ * @sleep_ok: non-zero if the caller may sleep waiting for the link state
+ *            mutex; zero to poll for it with a 1us delay between tries
+ *
+ * The first holder asks the 8051 to give up the LCB and then switches the
+ * selector to the host.  Later holders only increment the use count.
+ *
+ * Returns:
+ *     0 on success
+ *     -EBUSY if the link is in a down state, or if the request to the
+ *     8051 failed
+ */
+int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok)
+{
+       struct hfi1_pportdata *ppd = dd->pport;
+       int ret = 0;
+
+       /*
+        * Hold the host link state lock so that { link state check,
+        * selector change, count increment } happens as a unit against
+        * a link state change.  Without it the state change could race
+        * with the count increment.
+        */
+       if (!sleep_ok) {
+               while (!mutex_trylock(&ppd->hls_lock))
+                       udelay(1);
+       } else {
+               mutex_lock(&ppd->hls_lock);
+       }
+
+       /* host LCB access is only valid while the link is up */
+       if (ppd->host_link_state & HLS_DOWN) {
+               dd_dev_info(dd, "%s: link state %s not up\n",
+                           __func__, link_state_name(ppd->host_link_state));
+               ret = -EBUSY;
+               goto unlock;
+       }
+
+       /* only the first holder has to talk to the 8051 */
+       if (dd->lcb_access_count == 0) {
+               ret = request_host_lcb_access(dd);
+               if (ret) {
+                       dd_dev_err(dd,
+                                  "%s: unable to acquire LCB access, err %d\n",
+                                  __func__, ret);
+                       goto unlock;
+               }
+               set_host_lcb_access(dd);
+       }
+       dd->lcb_access_count++;
+unlock:
+       mutex_unlock(&ppd->hls_lock);
+       return ret;
+}
+
+/*
+ * Release LCB access by dropping the use count.  When the count would go
+ * from 1 to 0, control of the LCB is handed back to the 8051.
+ *
+ * @dd: device data
+ * @sleep_ok: non-zero if the caller may sleep waiting for the link state
+ *            mutex; zero to poll for it with a 1us delay between tries
+ *
+ * Returns:
+ *     0 on success, or when the count was already zero (logged, no change)
+ *     -EBUSY if the grant to the 8051 failed; host access is restored and
+ *     the count is left untouched
+ */
+int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok)
+{
+       struct hfi1_pportdata *ppd = dd->pport;
+       int ret = 0;
+
+       /*
+        * Take the host link state lock because the acquire side uses it.
+        * Here it only keeps { selector change, count decrement } together.
+        */
+       if (!sleep_ok) {
+               while (!mutex_trylock(&ppd->hls_lock))
+                       udelay(1);
+       } else {
+               mutex_lock(&ppd->hls_lock);
+       }
+
+       if (dd->lcb_access_count == 0) {
+               dd_dev_err(dd, "%s: LCB access count is zero.  Skipping.\n",
+                          __func__);
+               goto unlock;
+       }
+
+       /* last holder: switch the selector, then notify the 8051 */
+       if (dd->lcb_access_count == 1) {
+               set_8051_lcb_access(dd);
+               ret = request_8051_lcb_access(dd);
+               if (ret) {
+                       dd_dev_err(dd,
+                                  "%s: unable to release LCB access, err %d\n",
+                                  __func__, ret);
+                       /* the grant failed, so put the host back in charge */
+                       set_host_lcb_access(dd);
+                       goto unlock;
+               }
+       }
+       dd->lcb_access_count--;
+unlock:
+       mutex_unlock(&ppd->hls_lock);
+       return ret;
+}
+
+/*
+ * Initialize LCB access variables and state.  Called during driver load,
+ * after most of the initialization is finished.
+ *
+ * The DC default is LCB access on for the host.  The driver defaults to
+ * leaving access to the 8051.  Assign access now - this constrains the call
+ * to this routine to be after all LCB set-up is done.  In particular, after
+ * hf1_init_dd() -> set_up_interrupts() -> clear_all_interrupts()
+ *
+ * NOTE(review): the only state touched by this routine is the software
+ * use count, which is reset to zero; no selector register is written here.
+ */
+static void init_lcb_access(struct hfi1_devdata *dd)
+{
+       dd->lcb_access_count = 0;
+}
+
+/*
+ * Write a response back to a 8051 request.
+ *
+ * @dd: device data
+ * @return_code: value placed in the RETURN_CODE field
+ * @rsp_data: value placed in the RSP_DATA field
+ *
+ * The COMPLETED bit is set in the same write to DC_DC8051_CFG_EXT_DEV_0.
+ */
+static void hreq_response(struct hfi1_devdata *dd, u8 return_code, u16 rsp_data)
+{
+       u64 rsp = DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK;
+
+       rsp |= (u64)return_code << DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT;
+       rsp |= (u64)rsp_data << DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT;
+
+       write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, rsp);
+}
+
+/*
+ * Handle host requests from the 8051.
+ *
+ * Reads DC_DC8051_CFG_EXT_DEV_1 and returns immediately if no new request
+ * is flagged.  Otherwise the request type and data are decoded and a
+ * response is written back with hreq_response():
+ *   - HREQ_CONFIG_DONE: success, no data
+ *   - HREQ_INTERFACE_TEST: success, request data returned unchanged
+ *   - everything else: not supported (logged)
+ */
+static void handle_8051_request(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 reg;
+       u16 data;
+       u16 rsp_data = 0;
+       u8 type;
+       u8 rc = HREQ_NOT_SUPPORTED;
+
+       reg = read_csr(dd, DC_DC8051_CFG_EXT_DEV_1);
+       if (!(reg & DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK))
+               return; /* nothing pending */
+
+       /* clear COMPLETED first so the response written below is seen */
+       write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, 0);
+
+       /* pull the request fields out of the register */
+       type = (reg >> DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_SHIFT)
+                       & DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_MASK;
+       data = (reg >> DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT)
+                       & DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_MASK;
+
+       switch (type) {
+       case HREQ_CONFIG_DONE:
+               rc = HREQ_SUCCESS;
+               break;
+       case HREQ_INTERFACE_TEST:
+               /* echo the request data back */
+               rc = HREQ_SUCCESS;
+               rsp_data = data;
+               break;
+       case HREQ_LOAD_CONFIG:
+       case HREQ_SAVE_CONFIG:
+       case HREQ_READ_CONFIG:
+       case HREQ_SET_TX_EQ_ABS:
+       case HREQ_SET_TX_EQ_REL:
+       case HREQ_ENABLE:
+               dd_dev_info(dd, "8051 request: request 0x%x not supported\n",
+                           type);
+               break;
+       default:
+               dd_dev_err(dd, "8051 request: unknown request 0x%x\n", type);
+               break;
+       }
+
+       hreq_response(dd, rc, rsp_data);
+}
+
+/*
+ * Program SEND_CM_GLOBAL_CREDIT.
+ *
+ * @dd: device data
+ * @vau: value for the AU field
+ * @total: value for the TOTAL_CREDIT_LIMIT field
+ * @shared: value for the SHARED_LIMIT field
+ */
+static void write_global_credit(struct hfi1_devdata *dd,
+                               u8 vau, u16 total, u16 shared)
+{
+       u64 credit;
+
+       credit = (u64)total << SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT;
+       credit |= (u64)shared << SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT;
+       credit |= (u64)vau << SEND_CM_GLOBAL_CREDIT_AU_SHIFT;
+
+       write_csr(dd, SEND_CM_GLOBAL_CREDIT, credit);
+}
+
+/*
+ * Set up initial VL15 credits of the remote.  Assumes the rest of
+ * the CM credit registers are zero from a previous global or credit reset.
+ *
+ * @dd: device data
+ * @vau: AU value written to the global credit register
+ * @vl15buf: number of credits; used as the global total and as the VL15
+ *           dedicated limit (split with VL0 in snoop mode)
+ */
+void set_up_vl15(struct hfi1_devdata *dd, u8 vau, u16 vl15buf)
+{
+       /* the shared count stays at zero for both global and VL15 */
+       write_global_credit(dd, vau, vl15buf, 0);
+
+       if (unlikely(dd->hfi1_snoop.mode_flag == HFI1_PORT_SNOOP_MODE)) {
+               /*
+                * The snoop interface may need credits on another VL when
+                * sending packets, so give half to VL15 and half to VL0.
+                */
+               u64 half = (u64)(vl15buf >> 1);
+
+               write_csr(dd, SEND_CM_CREDIT_VL15,
+                         half << SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT);
+               write_csr(dd, SEND_CM_CREDIT_VL,
+                         half << SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT);
+               return;
+       }
+
+       write_csr(dd, SEND_CM_CREDIT_VL15,
+                 (u64)vl15buf << SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT);
+}
+
+/*
+ * Zero all credit details from the previous connection and
+ * reset the CM manager's internal counters.
+ *
+ * Clears each data VL credit register, the VL15 credit register and the
+ * global credit register, then issues PSC_CM_RESET.
+ */
+void reset_link_credits(struct hfi1_devdata *dd)
+{
+       int vl;
+
+       /* the per-VL credit registers are spaced 8 bytes apart */
+       for (vl = 0; vl < TXE_NUM_DATA_VL; vl++)
+               write_csr(dd, SEND_CM_CREDIT_VL + (8 * vl), 0);
+
+       write_csr(dd, SEND_CM_CREDIT_VL15, 0);
+       write_global_credit(dd, 0, 0, 0);
+
+       /* finally, reset the CM block itself */
+       pio_send_control(dd, PSC_CM_RESET);
+}
+
+/*
+ * Convert a vCU to a CU: CU = 2^vCU.
+ *
+ * The shift is done on an unsigned constant so that a vcu of 31 does not
+ * shift into the sign bit of a signed int.
+ */
+static u32 vcu_to_cu(u8 vcu)
+{
+       return 1u << vcu;
+}
+
+/*
+ * Convert a CU to a vCU.
+ *
+ * Returns ilog2() of @cu, narrowed to a u8.
+ */
+static u8 cu_to_vcu(u32 cu)
+{
+       return ilog2(cu);
+}
+
+/*
+ * Convert a vAU to an AU: AU = 8 * 2^vAU.
+ *
+ * Computed with unsigned arithmetic so that large vau values cannot
+ * overflow a signed int.
+ */
+static u32 vau_to_au(u8 vau)
+{
+       return 8u << vau;
+}
+
+/*
+ * (Re)set the per-port values that take a default on link up:
+ * sa_qp becomes 0x1 and sm_trap_qp becomes 0x0.
+ */
+static void set_linkup_defaults(struct hfi1_pportdata *ppd)
+{
+       ppd->sa_qp = 0x1;
+       ppd->sm_trap_qp = 0x0;
+}
+
+/*
+ * Graceful LCB shutdown.  This leaves the LCB FIFOs in reset.
+ *
+ * @dd: device data
+ * @abort: when zero, the DCC reset bits are released again after a short
+ *         hold and DC_LCB_ERR_EN is restored; when non-zero, the LCB and
+ *         RX FPE reset bits are left set and DC_LCB_ERR_EN is not restored
+ *
+ * The value of DC_LCB_ERR_EN read here is kept in dd->lcb_err_en.
+ */
+static void lcb_shutdown(struct hfi1_devdata *dd, int abort)
+{
+       u64 reg;
+
+       /* clear lcb run: LCB_CFG_RUN.EN = 0 */
+       write_csr(dd, DC_LCB_CFG_RUN, 0);
+       /* set tx fifo reset: LCB_CFG_TX_FIFOS_RESET.VAL = 1 */
+       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET,
+                 1ull << DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT);
+       /* set dcc reset csr: DCC_CFG_RESET.{reset_lcb,reset_rx_fpe} = 1 */
+       /* save the error enables before the reset is asserted */
+       dd->lcb_err_en = read_csr(dd, DC_LCB_ERR_EN);
+       reg = read_csr(dd, DCC_CFG_RESET);
+       write_csr(dd, DCC_CFG_RESET, reg |
+                 (1ull << DCC_CFG_RESET_RESET_LCB_SHIFT) |
+                 (1ull << DCC_CFG_RESET_RESET_RX_FPE_SHIFT));
+       (void)read_csr(dd, DCC_CFG_RESET); /* make sure the write completed */
+       if (!abort) {
+               udelay(1);    /* must hold for the longer of 16cclks or 20ns */
+               /* put back the original reset value and the error enables */
+               write_csr(dd, DCC_CFG_RESET, reg);
+               write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en);
+       }
+}
+
+/*
+ * This routine should be called after the link has been transitioned to
+ * OFFLINE (OFFLINE state has the side effect of putting the SerDes into
+ * reset).
+ *
+ * The expectation is that the caller of this routine would have taken
+ * care of properly transitioning the link into the correct state.
+ *
+ * The dd->dc_shutdown flag, tested and set under dc8051_lock, makes a
+ * second call a no-op until the flag is cleared again.
+ */
+static void dc_shutdown(struct hfi1_devdata *dd)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&dd->dc8051_lock, flags);
+       if (dd->dc_shutdown) {
+               /* already shut down, nothing to do */
+               spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+               return;
+       }
+       dd->dc_shutdown = 1;
+       spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+       /* Shutdown the LCB */
+       lcb_shutdown(dd, 1);
+       /*
+        * Going to OFFLINE would have caused the 8051 to put the
+        * SerDes into reset already. Just need to shut down the 8051,
+        * itself.
+        */
+       write_csr(dd, DC_DC8051_CFG_RST, 0x1);
+}
+
+/*
+ * Calling this after the DC has been brought out of reset should not
+ * do any damage.
+ *
+ * Does nothing unless dd->dc_shutdown is set.  Otherwise the 8051 is
+ * taken out of reset, the firmware is given TIMEOUT_8051_START to become
+ * ready (a timeout is logged but otherwise ignored), the DCC reset
+ * register and DC_LCB_ERR_EN are rewritten, and dd->dc_shutdown is
+ * cleared.
+ *
+ * NOTE(review): dc8051_lock is dropped between the check of dc_shutdown
+ * and the point where it is cleared, so the register writes in between
+ * are not covered by the lock - confirm callers cannot race here.
+ */
+static void dc_start(struct hfi1_devdata *dd)
+{
+       unsigned long flags;
+       int ret;
+
+       spin_lock_irqsave(&dd->dc8051_lock, flags);
+       if (!dd->dc_shutdown)
+               goto done;
+       spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+       /* Take the 8051 out of reset */
+       write_csr(dd, DC_DC8051_CFG_RST, 0ull);
+       /* Wait until 8051 is ready */
+       ret = wait_fm_ready(dd, TIMEOUT_8051_START);
+       if (ret) {
+               dd_dev_err(dd, "%s: timeout starting 8051 firmware\n",
+                          __func__);
+       }
+       /* Take away reset for LCB and RX FPE (set in lcb_shutdown). */
+       write_csr(dd, DCC_CFG_RESET, 0x10);
+       /* lcb_shutdown() with abort=1 does not restore these */
+       write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en);
+       spin_lock_irqsave(&dd->dc8051_lock, flags);
+       dd->dc_shutdown = 0;
+done:
+       spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+}
+
+/*
+ * These LCB adjustments are for the Aurora SerDes core in the FPGA.
+ *
+ * Nothing is done unless dd->icode is ICODE_FPGA_EMULATION and the
+ * emulator is not an "_s" variant.  Otherwise the RX and TX FIFO RADR
+ * values are picked from the emulator revision (forced to 0x2d when the
+ * chip is not an Ax step) and written together with
+ * LCB_CFG_IGNORE_LOST_RCLK.EN = 1.
+ */
+static void adjust_lcb_for_fpga_serdes(struct hfi1_devdata *dd)
+{
+       u64 rx_radr, tx_radr;
+       u32 version;
+
+       if (dd->icode != ICODE_FPGA_EMULATION)
+               return;
+
+       /*
+        * These LCB defaults on emulator _s are good, nothing to do here:
+        *      LCB_CFG_TX_FIFOS_RADR
+        *      LCB_CFG_RX_FIFOS_RADR
+        *      LCB_CFG_LN_DCLK
+        *      LCB_CFG_IGNORE_LOST_RCLK
+        */
+       if (is_emulator_s(dd))
+               return;
+       /* else this is _p */
+
+       version = emulator_rev(dd);
+       if (!is_ax(dd))
+               version = 0x2d; /* all B0 use 0x2d or higher settings */
+
+       if (version <= 0x12) {
+               /* release 0x12 and below */
+
+               /*
+                * LCB_CFG_RX_FIFOS_RADR.RST_VAL = 0x9
+                * LCB_CFG_RX_FIFOS_RADR.OK_TO_JUMP_VAL = 0x9
+                * LCB_CFG_RX_FIFOS_RADR.DO_NOT_JUMP_VAL = 0xa
+                */
+               rx_radr =
+                     0xaull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+                   | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+                   | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+               /*
+                * LCB_CFG_TX_FIFOS_RADR.ON_REINIT = 0 (default)
+                * LCB_CFG_TX_FIFOS_RADR.RST_VAL = 6
+                */
+               tx_radr = 6ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+       } else if (version <= 0x18) {
+               /* release 0x13 up to 0x18 */
+               /* LCB_CFG_RX_FIFOS_RADR = 0x988 */
+               rx_radr =
+                     0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+                   | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+                   | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+               tx_radr = 7ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+       } else if (version == 0x19) {
+               /* release 0x19 */
+               /* LCB_CFG_RX_FIFOS_RADR = 0xa99 */
+               rx_radr =
+                     0xAull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+                   | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+                   | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+               tx_radr = 3ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+       } else if (version == 0x1a) {
+               /* release 0x1a */
+               /* LCB_CFG_RX_FIFOS_RADR = 0x988 */
+               rx_radr =
+                     0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+                   | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+                   | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+               tx_radr = 7ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+               /* this release is the only one that also writes LN_DCLK */
+               write_csr(dd, DC_LCB_CFG_LN_DCLK, 1ull);
+       } else {
+               /* release 0x1b and higher */
+               /* LCB_CFG_RX_FIFOS_RADR = 0x877 */
+               rx_radr =
+                     0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+                   | 0x7ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+                   | 0x7ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+               tx_radr = 3ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+       }
+
+       /* apply the values selected above */
+       write_csr(dd, DC_LCB_CFG_RX_FIFOS_RADR, rx_radr);
+       /* LCB_CFG_IGNORE_LOST_RCLK.EN = 1 */
+       write_csr(dd, DC_LCB_CFG_IGNORE_LOST_RCLK,
+                 DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK);
+       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RADR, tx_radr);
+}
+
+/*
+ * Handle a SMA idle message
+ *
+ * This is a work-queue function outside of the interrupt.
+ *
+ * @work: the sma_message_work member embedded in the port data
+ *
+ * Reads the pending idle message and acts on its low byte.  If the read
+ * fails the message is silently dropped.
+ */
+void handle_sma_message(struct work_struct *work)
+{
+       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+                                                       sma_message_work);
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 msg;
+
+       /*
+        * msg is bytes 1-4 of the 40-bit idle message - the command code
+        * is stripped off
+        */
+       if (read_idle_sma(dd, &msg))
+               return;
+       dd_dev_info(dd, "%s: SMA message 0x%llx\n", __func__, msg);
+
+       /* Byte[1] of the message (byte 0 of msg) selects the action */
+       switch (msg & 0xff) {
+       case SMA_IDLE_ARM:
+               /*
+                * See OPAv1 table 9-14 - HFI and External Switch Ports Key
+                * State Transitions
+                *
+                * Only expected in INIT or ARMED, discard otherwise.
+                */
+               if (ppd->host_link_state & (HLS_UP_INIT | HLS_UP_ARMED))
+                       ppd->neighbor_normal = 1;
+               break;
+       case SMA_IDLE_ACTIVE:
+               /*
+                * See OPAv1 table 9-14 - HFI and External Switch Ports Key
+                * State Transitions
+                *
+                * Can activate the node.  Discard otherwise.
+                */
+               if (ppd->host_link_state != HLS_UP_ARMED ||
+                   !ppd->is_active_optimize_enabled)
+                       break;
+
+               ppd->neighbor_normal = 1;
+               if (set_link_state(ppd, HLS_UP_ACTIVE))
+                       dd_dev_err(
+                               dd,
+                               "%s: received Active SMA idle message, couldn't set link to Active\n",
+                               __func__);
+               break;
+       default:
+               dd_dev_err(dd,
+                          "%s: received unexpected SMA idle message 0x%llx\n",
+                          __func__, msg);
+               break;
+       }
+}
+
+/*
+ * Read-modify-write RCV_CTRL under rcvctrl_lock.
+ *
+ * @dd: device data
+ * @add: bits to set
+ * @clear: bits to clear; applied after @add, so a bit present in both
+ *         ends up cleared
+ */
+static void adjust_rcvctrl(struct hfi1_devdata *dd, u64 add, u64 clear)
+{
+       unsigned long flags;
+       u64 val;
+
+       spin_lock_irqsave(&dd->rcvctrl_lock, flags);
+       val = (read_csr(dd, RCV_CTRL) | add) & ~clear;
+       write_csr(dd, RCV_CTRL, val);
+       spin_unlock_irqrestore(&dd->rcvctrl_lock, flags);
+}
+
+/* Set the bits in @add in RCV_CTRL; no bits are cleared. */
+static inline void add_rcvctrl(struct hfi1_devdata *dd, u64 add)
+{
+       adjust_rcvctrl(dd, add, 0);
+}
+
+/* Clear the bits in @clear in RCV_CTRL; no bits are set. */
+static inline void clear_rcvctrl(struct hfi1_devdata *dd, u64 clear)
+{
+       adjust_rcvctrl(dd, 0, clear);
+}
+
+/*
+ * Called from all interrupt handlers to start handling an SPC freeze.
+ *
+ * @ppd: port data
+ * @flags: FREEZE_* flags
+ *         FREEZE_SELF - write the SPC freeze bit to CCE_CTRL first
+ *         FREEZE_LINK_DOWN - passed on to sdma_freeze_notify()
+ *         FREEZE_ABORT - log and return without queueing freeze_work
+ */
+void start_freeze_handling(struct hfi1_pportdata *ppd, int flags)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       int idx;
+
+       if (flags & FREEZE_SELF)
+               write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK);
+
+       /* the device is now in frozen mode */
+       dd->flags |= HFI1_FROZEN;
+
+       /* tell every SDMA engine that a freeze is starting */
+       sdma_freeze_notify(dd, !!(flags & FREEZE_LINK_DOWN));
+
+       /* halt pre-handling for each send context that is enabled */
+       for (idx = 0; idx < dd->num_send_contexts; idx++) {
+               struct send_context *sc = dd->send_contexts[idx].sc;
+
+               if (!sc || !(sc->flags & SCF_ENABLED))
+                       continue;
+               sc_stop(sc, SCF_FROZEN | SCF_HALTED);
+       }
+
+       /* Send context are frozen. Notify user space */
+       hfi1_set_uevent_bits(ppd, _HFI1_EVENT_FROZEN_BIT);
+
+       if (flags & FREEZE_ABORT) {
+               dd_dev_err(dd,
+                          "Aborted freeze recovery. Please REBOOT system\n");
+               return;
+       }
+
+       /* the rest happens outside of interrupt context */
+       queue_work(ppd->hfi1_wq, &ppd->freeze_work);
+}
+
+/*
+ * Wait until all 4 sub-blocks indicate that they have frozen or unfrozen,
+ * depending on the "freeze" parameter.
+ *
+ * @dd: device data
+ * @freeze: non-zero to wait for every ALL_FROZE bit in CCE_STATUS to be
+ *          set, zero to wait for every one of them to be clear
+ *
+ * No need to return an error if it times out, our only option
+ * is to proceed anyway.  A timeout (FREEZE_STATUS_TIMEOUT ms) is logged.
+ * Sleeps between polls, so this must be called from a context that may
+ * sleep.
+ */
+static void wait_for_freeze_status(struct hfi1_devdata *dd, int freeze)
+{
+       unsigned long timeout;
+       u64 reg;
+
+       timeout = jiffies + msecs_to_jiffies(FREEZE_STATUS_TIMEOUT);
+       while (1) {
+               reg = read_csr(dd, CCE_STATUS);
+               if (freeze) {
+                       /* waiting until all indicators are set */
+                       if ((reg & ALL_FROZE) == ALL_FROZE)
+                               return; /* all done */
+               } else {
+                       /* waiting until all indicators are clear */
+                       if ((reg & ALL_FROZE) == 0)
+                               return; /* all done */
+               }
+
+               if (time_after(jiffies, timeout)) {
+                       /* terminate the message so the log line is complete */
+                       dd_dev_err(dd,
+                                  "Time out waiting for SPC %sfreeze, bits 0x%llx, expecting 0x%llx, continuing\n",
+                                  freeze ? "" : "un", reg & ALL_FROZE,
+                                  freeze ? ALL_FROZE : 0ull);
+                       return;
+               }
+               usleep_range(80, 120);
+       }
+}
+
+/*
+ * Do all freeze handling for the RXE block: disable the receive port,
+ * then disable each of the dd->num_rcv_contexts receive contexts.
+ */
+static void rxe_freeze(struct hfi1_devdata *dd)
+{
+       int ctxt;
+
+       /* port first */
+       clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+
+       /* then every receive context */
+       for (ctxt = 0; ctxt < dd->num_rcv_contexts; ctxt++)
+               hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS, ctxt);
+}
+
+/*
+ * Unfreeze handling for the RXE block - kernel contexts only.
+ * This will also enable the port.  User contexts will do unfreeze
+ * handling on a per-context basis as they call into the driver.
+ *
+ * Only the first dd->n_krcv_queues contexts are re-enabled here.
+ */
+static void rxe_kernel_unfreeze(struct hfi1_devdata *dd)
+{
+       u32 rcvmask;
+       int ctxt;
+
+       for (ctxt = 0; ctxt < dd->n_krcv_queues; ctxt++) {
+               rcvmask = HFI1_RCVCTRL_CTXT_ENB;
+               /* tail update has to be requested one way or the other */
+               if (HFI1_CAP_KGET_MASK(dd->rcd[ctxt]->flags, DMA_RTAIL))
+                       rcvmask |= HFI1_RCVCTRL_TAILUPD_ENB;
+               else
+                       rcvmask |= HFI1_RCVCTRL_TAILUPD_DIS;
+               hfi1_rcvctrl(dd, rcvmask, ctxt);
+       }
+
+       /* contexts are back, now turn the port on */
+       add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+}
+
+/*
+ * Non-interrupt SPC freeze handling.
+ *
+ * This is a work-queue function outside of the triggering interrupt.
+ *
+ * @work: the freeze_work member embedded in the port data
+ *
+ * Sequence: wait for the freeze to take effect, run the per-block freeze
+ * steps (PIO, SDMA, RXE), unfreeze the hardware, run the kernel-side
+ * unfreeze steps, then clear HFI1_FROZEN and wake dd->event_queue.
+ */
+void handle_freeze(struct work_struct *work)
+{
+       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+                                                               freeze_work);
+       struct hfi1_devdata *dd = ppd->dd;
+
+       /* wait for freeze indicators on all affected blocks */
+       wait_for_freeze_status(dd, 1);
+
+       /* SPC is now frozen */
+
+       /* do send PIO freeze steps */
+       pio_freeze(dd);
+
+       /* do send DMA freeze steps */
+       sdma_freeze(dd);
+
+       /* do send egress freeze steps - nothing to do */
+
+       /* do receive freeze steps */
+       rxe_freeze(dd);
+
+       /*
+        * Unfreeze the hardware - clear the freeze, wait for each
+        * block's frozen bit to clear, then clear the frozen flag.
+        */
+       write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK);
+       wait_for_freeze_status(dd, 0);
+
+       /* on Ax parts, run one more freeze/unfreeze cycle */
+       if (is_ax(dd)) {
+               write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK);
+               wait_for_freeze_status(dd, 1);
+               write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK);
+               wait_for_freeze_status(dd, 0);
+       }
+
+       /* do send PIO unfreeze steps for kernel contexts */
+       pio_kernel_unfreeze(dd);
+
+       /* do send DMA unfreeze steps */
+       sdma_unfreeze(dd);
+
+       /* do send egress unfreeze steps - nothing to do */
+
+       /* do receive unfreeze steps for kernel contexts */
+       rxe_kernel_unfreeze(dd);
+
+       /*
+        * The unfreeze procedure touches global device registers when
+        * it disables and re-enables RXE. Mark the device unfrozen
+        * after all that is done so other parts of the driver waiting
+        * for the device to unfreeze don't do things out of order.
+        *
+        * The above implies that the meaning of HFI1_FROZEN flag is
+        * "Device has gone into freeze mode and freeze mode handling
+        * is still in progress."
+        *
+        * The flag will be removed when freeze mode processing has
+        * completed.
+        */
+       dd->flags &= ~HFI1_FROZEN;
+       wake_up(&dd->event_queue);
+
+       /* no longer frozen */
+}
+
+/*
+ * Handle a link up interrupt from the 8051.
+ *
+ * This is a work-queue function outside of the interrupt.
+ *
+ * @work: the link_up_work member embedded in the port data
+ *
+ * Moves the link to HLS_UP_INIT, refreshes the link-up related cached
+ * values and defaults, and bounces the link if the active speed is not
+ * one of the enabled speeds.
+ */
+void handle_link_up(struct work_struct *work)
+{
+       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+                                                 link_up_work);
+       struct hfi1_devdata *dd = ppd->dd;
+
+       set_link_state(ppd, HLS_UP_INIT);
+
+       /* cache the read of DC_LCB_STS_ROUND_TRIP_LTP_CNT */
+       read_ltp_rtt(dd);
+
+       /* OPA requires certain counters to be cleared on link up */
+       clear_linkup_counters(dd);
+
+       /* and the link up defaults are (re)applied */
+       set_linkup_defaults(ppd);
+
+       /* nothing more to do when the active speed is an enabled one */
+       if (ppd->link_speed_active & ppd->link_speed_enabled)
+               return;
+
+       /* oops - current speed is not enabled, bounce */
+       dd_dev_err(dd,
+                  "Link speed active 0x%x is outside enabled 0x%x, downing link\n",
+                  ppd->link_speed_active, ppd->link_speed_enabled);
+       set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SPEED_POLICY, 0,
+                            OPA_LINKDOWN_REASON_SPEED_POLICY);
+       set_link_state(ppd, HLS_DN_OFFLINE);
+       tune_serdes(ppd);
+       start_link(ppd);
+}
+
+/*
+ * Forget everything learned about the neighbor during LNI.
+ *
+ * These ppd fields are cached for the SMA; zero all four of them when the
+ * link goes down.
+ */
+static void reset_neighbor_info(struct hfi1_pportdata *ppd)
+{
+       ppd->neighbor_fm_security = 0;
+       ppd->neighbor_type = 0;
+       ppd->neighbor_port_number = 0;
+       ppd->neighbor_guid = 0;
+}
+
+/*
+ * Printable names for the OPA_LINKDOWN_REASON_* codes, indexed by code.
+ * Codes without an entry are left NULL by the designated initializers;
+ * link_down_reason_str() turns those, and out-of-range codes, into
+ * "(invalid)".
+ */
+static const char * const link_down_reason_strs[] = {
+       [OPA_LINKDOWN_REASON_NONE] = "None",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_0] = "Receive error 0",
+       [OPA_LINKDOWN_REASON_BAD_PKT_LEN] = "Bad packet length",
+       [OPA_LINKDOWN_REASON_PKT_TOO_LONG] = "Packet too long",
+       [OPA_LINKDOWN_REASON_PKT_TOO_SHORT] = "Packet too short",
+       [OPA_LINKDOWN_REASON_BAD_SLID] = "Bad SLID",
+       [OPA_LINKDOWN_REASON_BAD_DLID] = "Bad DLID",
+       [OPA_LINKDOWN_REASON_BAD_L2] = "Bad L2",
+       [OPA_LINKDOWN_REASON_BAD_SC] = "Bad SC",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_8] = "Receive error 8",
+       [OPA_LINKDOWN_REASON_BAD_MID_TAIL] = "Bad mid tail",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_10] = "Receive error 10",
+       [OPA_LINKDOWN_REASON_PREEMPT_ERROR] = "Preempt error",
+       [OPA_LINKDOWN_REASON_PREEMPT_VL15] = "Preempt vl15",
+       [OPA_LINKDOWN_REASON_BAD_VL_MARKER] = "Bad VL marker",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_14] = "Receive error 14",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_15] = "Receive error 15",
+       [OPA_LINKDOWN_REASON_BAD_HEAD_DIST] = "Bad head distance",
+       [OPA_LINKDOWN_REASON_BAD_TAIL_DIST] = "Bad tail distance",
+       [OPA_LINKDOWN_REASON_BAD_CTRL_DIST] = "Bad control distance",
+       [OPA_LINKDOWN_REASON_BAD_CREDIT_ACK] = "Bad credit ack",
+       [OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER] = "Unsupported VL marker",
+       [OPA_LINKDOWN_REASON_BAD_PREEMPT] = "Bad preempt",
+       [OPA_LINKDOWN_REASON_BAD_CONTROL_FLIT] = "Bad control flit",
+       [OPA_LINKDOWN_REASON_EXCEED_MULTICAST_LIMIT] = "Exceed multicast limit",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_24] = "Receive error 24",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_25] = "Receive error 25",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_26] = "Receive error 26",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_27] = "Receive error 27",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_28] = "Receive error 28",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_29] = "Receive error 29",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_30] = "Receive error 30",
+       [OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN] =
+                                       "Excessive buffer overrun",
+       [OPA_LINKDOWN_REASON_UNKNOWN] = "Unknown",
+       [OPA_LINKDOWN_REASON_REBOOT] = "Reboot",
+       [OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN] = "Neighbor unknown",
+       [OPA_LINKDOWN_REASON_FM_BOUNCE] = "FM bounce",
+       [OPA_LINKDOWN_REASON_SPEED_POLICY] = "Speed policy",
+       [OPA_LINKDOWN_REASON_WIDTH_POLICY] = "Width policy",
+       [OPA_LINKDOWN_REASON_DISCONNECTED] = "Disconnected",
+       [OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED] =
+                                       "Local media not installed",
+       [OPA_LINKDOWN_REASON_NOT_INSTALLED] = "Not installed",
+       [OPA_LINKDOWN_REASON_CHASSIS_CONFIG] = "Chassis config",
+       [OPA_LINKDOWN_REASON_END_TO_END_NOT_INSTALLED] =
+                                       "End to end not installed",
+       [OPA_LINKDOWN_REASON_POWER_POLICY] = "Power policy",
+       [OPA_LINKDOWN_REASON_LINKSPEED_POLICY] = "Link speed policy",
+       [OPA_LINKDOWN_REASON_LINKWIDTH_POLICY] = "Link width policy",
+       [OPA_LINKDOWN_REASON_SWITCH_MGMT] = "Switch management",
+       [OPA_LINKDOWN_REASON_SMA_DISABLED] = "SMA disabled",
+       [OPA_LINKDOWN_REASON_TRANSIENT] = "Transient"
+};
+
+/*
+ * Map a neighbor link down reason code to its printable name.
+ *
+ * Returns "(invalid)" when the code lies beyond the end of
+ * link_down_reason_strs[] or selects an entry with no string.
+ */
+static const char *link_down_reason_str(u8 reason)
+{
+       if (reason >= ARRAY_SIZE(link_down_reason_strs))
+               return "(invalid)";
+
+       return link_down_reason_strs[reason] ?
+               link_down_reason_strs[reason] : "(invalid)";
+}
+
+/*
+ * Handle a link down interrupt from the 8051.
+ *
+ * This is a work-queue function outside of the interrupt.
+ *
+ * Sequence, as implemented below:
+ *   - if the link was still in HLS_DN_POLL, HLS_VERIFY_CAP or HLS_GOING_UP
+ *     on a PORT_TYPE_FIXED port, record "not installed" as the offline
+ *     disabled reason
+ *   - move to HLS_DN_OFFLINE, remembering whether the link had been up
+ *   - work out and record the local and neighbor link down reasons
+ *   - copy the latest reasons into the .sma fields when the link had been
+ *     up and both .sma fields are still zero
+ *   - clear the cached neighbor information, drop the full management
+ *     pkey if ppd->mgmt_allowed is set, and disable the port
+ *   - call dc_shutdown() when a QSFP port has no module present,
+ *     otherwise call tune_serdes() and start_link()
+ */
+void handle_link_down(struct work_struct *work)
+{
+       u8 lcl_reason, neigh_reason = 0;
+       u8 link_down_reason;
+       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+                                                 link_down_work);
+       int was_up;
+       static const char ldr_str[] = "Link down reason: "; /* log prefix */
+
+       if ((ppd->host_link_state &
+            (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) &&
+            ppd->port_type == PORT_TYPE_FIXED)
+               ppd->offline_disabled_reason =
+                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NOT_INSTALLED);
+
+       /*
+        * Go offline first, then deal with reading/writing through 8051.
+        * was_up must be sampled before the state change overwrites it.
+        */
+       was_up = !!(ppd->host_link_state & HLS_UP);
+       set_link_state(ppd, HLS_DN_OFFLINE);
+
+       if (was_up) {
+               lcl_reason = 0;
+               /* link down reason is only valid if the link was up */
+               read_link_down_reason(ppd->dd, &link_down_reason);
+               switch (link_down_reason) {
+               case LDR_LINK_TRANSFER_ACTIVE_LOW:
+                       /* the link went down, no idle message reason */
+                       dd_dev_info(ppd->dd, "%sUnexpected link down\n",
+                                   ldr_str);
+                       break;
+               case LDR_RECEIVED_LINKDOWN_IDLE_MSG:
+                       /*
+                        * The neighbor reason is only valid if an idle message
+                        * was received for it.  This is the only case that
+                        * changes neigh_reason from its initial 0.
+                        */
+                       read_planned_down_reason_code(ppd->dd, &neigh_reason);
+                       dd_dev_info(ppd->dd,
+                                   "%sNeighbor link down message %d, %s\n",
+                                   ldr_str, neigh_reason,
+                                   link_down_reason_str(neigh_reason));
+                       break;
+               case LDR_RECEIVED_HOST_OFFLINE_REQ:
+                       dd_dev_info(ppd->dd,
+                                   "%sHost requested link to go offline\n",
+                                   ldr_str);
+                       break;
+               default:
+                       dd_dev_info(ppd->dd, "%sUnknown reason 0x%x\n",
+                                   ldr_str, link_down_reason);
+                       break;
+               }
+
+               /*
+                * If no reason, assume peer-initiated but missed
+                * LinkGoingDown idle flits.  Note this applies to every
+                * case above that left neigh_reason at 0.
+                */
+               if (neigh_reason == 0)
+                       lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN;
+       } else {
+               /* went down while polling or going up */
+               lcl_reason = OPA_LINKDOWN_REASON_TRANSIENT;
+       }
+
+       set_link_down_reason(ppd, lcl_reason, neigh_reason, 0);
+
+       /*
+        * inform the SMA when the link transitions from up to down, but
+        * only when both .sma values are still 0, so values already stored
+        * there are not overwritten
+        */
+       if (was_up && ppd->local_link_down_reason.sma == 0 &&
+           ppd->neigh_link_down_reason.sma == 0) {
+               ppd->local_link_down_reason.sma =
+                                       ppd->local_link_down_reason.latest;
+               ppd->neigh_link_down_reason.sma =
+                                       ppd->neigh_link_down_reason.latest;
+       }
+
+       reset_neighbor_info(ppd);
+       if (ppd->mgmt_allowed)
+               remove_full_mgmt_pkey(ppd);
+
+       /* disable the port */
+       clear_rcvctrl(ppd->dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+
+       /*
+        * If there is no cable attached, turn the DC off. Otherwise,
+        * start the link bring up.  The cable check is only made for
+        * PORT_TYPE_QSFP ports; every other port type restarts the link.
+        */
+       if (ppd->port_type == PORT_TYPE_QSFP && !qsfp_mod_present(ppd)) {
+               dc_shutdown(ppd->dd);
+       } else {
+               tune_serdes(ppd);
+               start_link(ppd);
+       }
+}
+
+/*
+ * Work-queue handler that bounces the link: take it offline, then call
+ * tune_serdes() and start_link() to bring it back.
+ *
+ * A bounce is only performed when the link is currently up; in any other
+ * state the request is logged and dropped.
+ */
+void handle_link_bounce(struct work_struct *work)
+{
+       struct hfi1_pportdata *ppd;
+
+       ppd = container_of(work, struct hfi1_pportdata, link_bounce_work);
+
+       if (!(ppd->host_link_state & HLS_UP)) {
+               dd_dev_info(ppd->dd, "%s: link not up (%s), nothing to do\n",
+                           __func__, link_state_name(ppd->host_link_state));
+               return;
+       }
+
+       set_link_state(ppd, HLS_DN_OFFLINE);
+       tune_serdes(ppd);
+       start_link(ppd);
+}
+
+/*
+ * Translate a capability-exchange CRC mask (CAP_CRC_*) into the Port LTP
+ * CRC mode mask (PORT_LTP_CRC_MODE_*).
+ *
+ * The 16b CRC is implicit and mandatory in the capability exchange, so
+ * PORT_LTP_CRC_MODE_16 is always part of the result.
+ */
+static int cap_to_port_ltp(int cap)
+{
+       int ltp_mask = PORT_LTP_CRC_MODE_16;    /* mandatory mode */
+
+       ltp_mask |= (cap & CAP_CRC_14B) ? PORT_LTP_CRC_MODE_14 : 0;
+       ltp_mask |= (cap & CAP_CRC_48B) ? PORT_LTP_CRC_MODE_48 : 0;
+       ltp_mask |= (cap & CAP_CRC_12B_16B_PER_LANE) ?
+                       PORT_LTP_CRC_MODE_PER_LANE : 0;
+
+       return ltp_mask;
+}
+
+/*
+ * Translate an OPA Port LTP CRC mode mask (PORT_LTP_CRC_MODE_*) into the
+ * capability mask (CAP_CRC_*).
+ *
+ * Only the 14b, 48b and per-lane modes are translated; any other bit in
+ * port_ltp contributes nothing to the result.
+ */
+int port_ltp_to_cap(int port_ltp)
+{
+       int caps = 0;
+
+       caps |= (port_ltp & PORT_LTP_CRC_MODE_14) ? CAP_CRC_14B : 0;
+       caps |= (port_ltp & PORT_LTP_CRC_MODE_48) ? CAP_CRC_48B : 0;
+       caps |= (port_ltp & PORT_LTP_CRC_MODE_PER_LANE) ?
+                       CAP_CRC_12B_16B_PER_LANE : 0;
+
+       return caps;
+}
+
+/*
+ * Translate one DC LCB CRC mode value (LCB_CRC_*) into the matching
+ * single-bit OPA Port LTP mask.
+ *
+ * Any value other than the per-lane, 48b and 14b modes is reported as
+ * PORT_LTP_CRC_MODE_16.
+ */
+static int lcb_to_port_ltp(int lcb_crc)
+{
+       if (lcb_crc == LCB_CRC_12B_16B_PER_LANE)
+               return PORT_LTP_CRC_MODE_PER_LANE;
+       if (lcb_crc == LCB_CRC_48B)
+               return PORT_LTP_CRC_MODE_48;
+       if (lcb_crc == LCB_CRC_14B)
+               return PORT_LTP_CRC_MODE_14;
+
+       return PORT_LTP_CRC_MODE_16;
+}
+
+/*
+ * The neighbor has told us we may act as a fabric manager, so install the
+ * full management partition key in pkey array element 2 (see OPAv1,
+ * section 20.2.2.6.8).  The limited management partition key should
+ * already occupy element 1, and the port is not yet up when this is
+ * called.
+ *
+ * The return value of hfi1_set_ib_cfg() is deliberately ignored.
+ */
+static void add_full_mgmt_pkey(struct hfi1_pportdata *ppd)
+{
+       /* Sanity check - element 2 should be 0 or already the full key */
+       if (ppd->pkeys[2] != 0 && ppd->pkeys[2] != FULL_MGMT_P_KEY)
+               dd_dev_warn(ppd->dd,
+                           "%s pkey[2] already set to 0x%x, resetting it to 0x%x\n",
+                           __func__, ppd->pkeys[2], FULL_MGMT_P_KEY);
+
+       ppd->pkeys[2] = FULL_MGMT_P_KEY;
+       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0);
+}
+
+/*
+ * Clear pkey array element 2, the slot add_full_mgmt_pkey() fills with
+ * the full management partition key, then re-apply HFI1_IB_CFG_PKEYS
+ * through hfi1_set_ib_cfg().  The return value of that call is
+ * deliberately ignored.
+ */
+static void remove_full_mgmt_pkey(struct hfi1_pportdata *ppd)
+{
+       ppd->pkeys[2] = 0;
+
+       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0);
+}
+
+/*
+ * Convert a lane count into the corresponding OPA link width bit.
+ *
+ * A count of 0 normally yields 0 (no lanes up).  The simulator and quick
+ * linkup never set the width, so in those cases 0 is quietly reported as
+ * 4x.  Counts above 4 are logged and also reported as 4x.
+ */
+static u16 link_width_to_bits(struct hfi1_devdata *dd, u16 width)
+{
+       if (width == 0) {
+               if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR || quick_linkup)
+                       return OPA_LINK_WIDTH_4X;
+               return 0; /* no lanes up */
+       }
+       if (width == 1)
+               return OPA_LINK_WIDTH_1X;
+       if (width == 2)
+               return OPA_LINK_WIDTH_2X;
+       if (width == 3)
+               return OPA_LINK_WIDTH_3X;
+
+       /* anything that is not exactly 4 here is out of range: complain */
+       if (width != 4)
+               dd_dev_info(dd, "%s: invalid width %d, using 4\n",
+                           __func__, width);
+
+       return OPA_LINK_WIDTH_4X;
+}
+
+/*
+ * Population count lookup for a 4-bit value: bit_counts[v] is the number
+ * of bits set in v, for v in 0..15.
+ */
+static const u8 bit_counts[16] = {
+       0, 1, 1, 2,     /* 0x0 - 0x3 */
+       1, 2, 2, 3,     /* 0x4 - 0x7 */
+       1, 2, 2, 3,     /* 0x8 - 0xb */
+       2, 3, 3, 4      /* 0xc - 0xf */
+};
+
+/* Count the bits set in the bottom nibble; the upper bits are ignored. */
+static inline u8 nibble_to_count(u8 nibble)
+{
+       u8 low = nibble & 0xf;
+
+       return bit_counts[low];
+}
+
+/*
+ * Read the active lane masks from the 8051 and return them as OPA link
+ * width bits through *tx_width and *rx_width.
+ *
+ * The masks come from these 8051 registers:
+ *     enable_lane_tx
+ *     enable_lane_rx
+ *
+ * Side effect: on RTL silicon with 8051 firmware older than v0.19 this
+ * also overwrites dd->pport[0].link_speed_active from the max_rate field.
+ */
+static void get_link_widths(struct hfi1_devdata *dd, u16 *tx_width,
+                           u16 *rx_width)
+{
+       u8 tx_lane_mask;
+       u8 rx_lane_mask;
+       u8 tx_pol_inv;
+       u8 rx_pol_inv;
+       u8 max_rate;
+       u16 tx_lanes, rx_lanes;
+
+       /* read the active lanes */
+       read_tx_settings(dd, &tx_lane_mask, &tx_pol_inv, &rx_pol_inv,
+                        &max_rate);
+       read_local_lni(dd, &rx_lane_mask);
+
+       /* turn the lane masks into lane counts */
+       tx_lanes = nibble_to_count(tx_lane_mask);
+       rx_lanes = nibble_to_count(rx_lane_mask);
+
+       /*
+        * Set link_speed_active here, overriding what was set in
+        * handle_verify_cap().  The ASIC 8051 firmware does not correctly
+        * set the max_rate field in handle_verify_cap until v0.19.
+        */
+       if ((dd->icode == ICODE_RTL_SILICON) &&
+           (dd->dc8051_ver < dc8051_ver(0, 19))) {
+               /* max_rate: 0 = 12.5G, 1 = 25G, anything else treated as 25G */
+               if (max_rate == 0) {
+                       dd->pport[0].link_speed_active = OPA_LINK_SPEED_12_5G;
+               } else {
+                       if (max_rate != 1)
+                               dd_dev_err(dd,
+                                          "%s: unexpected max rate %d, using 25Gb\n",
+                                          __func__, (int)max_rate);
+                       dd->pport[0].link_speed_active = OPA_LINK_SPEED_25G;
+               }
+       }
+
+       dd_dev_info(dd,
+                   "Fabric active lanes (width): tx 0x%x (%d), rx 0x%x (%d)\n",
+                   tx_lane_mask, tx_lanes, rx_lane_mask, rx_lanes);
+       *tx_width = link_width_to_bits(dd, tx_lanes);
+       *rx_width = link_width_to_bits(dd, rx_lanes);
+}
+
+/*
+ * Obtain the link widths from verify_cap_local_fm_link_width[1].
+ * The value is valid after the end of VerifyCap and during LinkUp, and
+ * does not change after link up, so downgrade information must be taken
+ * from elsewhere.
+ *
+ * Layout of that byte:
+ *     + bits [7:4] contain the number of active transmitters
+ *     + bits [3:0] contain the number of active receivers
+ * Each is a number 1 through 4; the two can differ if the link is
+ * asymmetric.
+ *
+ * verify_cap_local_fm_link_width[0] retains its original value.
+ */
+static void get_linkup_widths(struct hfi1_devdata *dd, u16 *tx_width,
+                             u16 *rx_width)
+{
+       u8 misc_bits, local_flags;
+       u16 widths;
+       u16 upper_byte;
+       u16 unused_tx, unused_rx;
+
+       read_vc_local_link_width(dd, &misc_bits, &local_flags, &widths);
+
+       /* the counts live in the upper byte of the 16-bit value */
+       upper_byte = widths >> 8;
+       *tx_width = link_width_to_bits(dd, upper_byte >> 4);
+       *rx_width = link_width_to_bits(dd, upper_byte & 0xf);
+
+       /* called only to print the active widths; results are discarded */
+       get_link_widths(dd, &unused_tx, &unused_rx);
+}
+
+/*
+ * Initialize ppd->link_width_active and the link width downgrade fields
+ * from hardware information when the link first comes up.
+ *
+ * The link width is not available until after
+ * VerifyCap.AllFramesReceived (the trigger for handle_verify_cap), so
+ * this lives outside that routine and should be called when the 8051
+ * signals linkup.
+ */
+void get_linkup_link_widths(struct hfi1_pportdata *ppd)
+{
+       u16 tx_bits, rx_bits;
+
+       /* get end-of-LNI link widths */
+       get_linkup_widths(ppd->dd, &tx_bits, &rx_bits);
+
+       /* per OPA spec, on link up LWD.E resets to LWD.S */
+       ppd->link_width_downgrade_enabled = ppd->link_width_downgrade_supported;
+
+       /* the link is supposed to be symmetric on link up, so tx decides */
+       ppd->link_width_active = tx_bits;
+
+       /* link width downgrade active (LWD.A) starts out matching LW.A */
+       ppd->link_width_downgrade_tx_active = ppd->link_width_active;
+       ppd->link_width_downgrade_rx_active = ppd->link_width_active;
+
+       /* cache the active egress rate (units [10^6 bits/sec]) */
+       ppd->current_egress_rate = active_egress_rate(ppd);
+}
+
+/*
+ * Handle a verify capabilities interrupt from the 8051.
+ *
+ * This is a work-queue function outside of the interrupt.
+ *
+ * In order, this routine:
+ *   - moves the port to HLS_VERIFY_CAP and calls lcb_shutdown() and
+ *     adjust_lcb_for_fpga_serdes()
+ *   - reads and logs the peer's VerifyCap values (PHY, fabric, link
+ *     width, device id) and MgmtAllowed
+ *   - sets up VL15 from the peer vAU and vl15 credit values
+ *   - picks the LCB CRC mode from the modes enabled locally and supported
+ *     by the partner, and writes it together with the forced credit mode
+ *     bit
+ *   - derives ppd->link_speed_active from the exchanged tx rates
+ *   - caches the LTP CRC modes, sets up the remote credit return table,
+ *     adjusts the link kill enables when is_ax(dd) is true, pulls the
+ *     LCB fifos out of reset and gives the 8051 access to the LCB CSRs
+ *   - caches the neighbor GUID, port number, node type and FM security
+ *     values, adds the full management pkey when MgmtAllowed is set,
+ *     and finally requests HLS_GOING_UP
+ */
+void handle_verify_cap(struct work_struct *work)
+{
+       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+                                                               link_vc_work);
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 reg;
+       u8 power_management;
+       u8 continious; /* logged below as the peer "continuous updates" */
+       u8 vcu;
+       u8 vau;
+       u8 z;
+       u16 vl15buf;
+       u16 link_widths;
+       u16 crc_mask;
+       u16 crc_val;
+       u16 device_id;
+       u16 active_tx, active_rx; /* filled by get_link_widths(), not used */
+       u8 partner_supported_crc;
+       u8 remote_tx_rate;
+       u8 device_rev;
+
+       set_link_state(ppd, HLS_VERIFY_CAP);
+
+       lcb_shutdown(dd, 0);
+       adjust_lcb_for_fpga_serdes(dd);
+
+       /*
+        * These are now valid:
+        *      remote VerifyCap fields in the general LNI config
+        *      CSR DC8051_STS_REMOTE_GUID
+        *      CSR DC8051_STS_REMOTE_NODE_TYPE
+        *      CSR DC8051_STS_REMOTE_FM_SECURITY
+        *      CSR DC8051_STS_REMOTE_PORT_NO
+        */
+
+       read_vc_remote_phy(dd, &power_management, &continious);
+       read_vc_remote_fabric(dd, &vau, &z, &vcu, &vl15buf,
+                             &partner_supported_crc);
+       read_vc_remote_link_width(dd, &remote_tx_rate, &link_widths);
+       read_remote_device_id(dd, &device_id, &device_rev);
+       /*
+        * And the 'MgmtAllowed' information, which is exchanged during
+        * LNI, is also available at this point.
+        */
+       read_mgmt_allowed(dd, &ppd->mgmt_allowed);
+       /* print the active widths */
+       get_link_widths(dd, &active_tx, &active_rx);
+       dd_dev_info(dd,
+                   "Peer PHY: power management 0x%x, continuous updates 0x%x\n",
+                   (int)power_management, (int)continious);
+       dd_dev_info(dd,
+                   "Peer Fabric: vAU %d, Z %d, vCU %d, vl15 credits 0x%x, CRC sizes 0x%x\n",
+                   (int)vau, (int)z, (int)vcu, (int)vl15buf,
+                   (int)partner_supported_crc);
+       dd_dev_info(dd, "Peer Link Width: tx rate 0x%x, widths 0x%x\n",
+                   (u32)remote_tx_rate, (u32)link_widths);
+       dd_dev_info(dd, "Peer Device ID: 0x%04x, Revision 0x%02x\n",
+                   (u32)device_id, (u32)device_rev);
+       /*
+        * The peer vAU value just read is the peer receiver value.  HFI does
+        * not support a transmit vAU of 0 (AU == 8).  We advertised that
+        * with Z=1 in the fabric capabilities sent to the peer.  The peer
+        * will see our Z=1, and, if it advertised a vAU of 0, will move its
+        * receive to vAU of 1 (AU == 16).  Do the same here.  We do not care
+        * about the peer Z value - our sent vAU is 3 (hardwired) and is not
+        * subject to the Z value exception.
+        */
+       if (vau == 0)
+               vau = 1;
+       set_up_vl15(dd, vau, vl15buf);
+
+       /*
+        * set up the LCB CRC mode: candidates are the modes enabled on this
+        * port that the partner also reported as supported
+        */
+       crc_mask = ppd->port_crc_mode_enabled & partner_supported_crc;
+
+       /*
+        * order is important: use the lowest bit in common; with nothing
+        * in common fall back to the 16b CRC
+        */
+       if (crc_mask & CAP_CRC_14B)
+               crc_val = LCB_CRC_14B;
+       else if (crc_mask & CAP_CRC_48B)
+               crc_val = LCB_CRC_48B;
+       else if (crc_mask & CAP_CRC_12B_16B_PER_LANE)
+               crc_val = LCB_CRC_12B_16B_PER_LANE;
+       else
+               crc_val = LCB_CRC_16B;
+
+       dd_dev_info(dd, "Final LCB CRC mode: %d\n", (int)crc_val);
+       write_csr(dd, DC_LCB_CFG_CRC_MODE,
+                 (u64)crc_val << DC_LCB_CFG_CRC_MODE_TX_VAL_SHIFT);
+
+       /*
+        * set (14b only) or clear sideband credit; setting additionally
+        * requires crc_14b_sideband to be non-zero
+        */
+       reg = read_csr(dd, SEND_CM_CTRL);
+       if (crc_val == LCB_CRC_14B && crc_14b_sideband) {
+               write_csr(dd, SEND_CM_CTRL,
+                         reg | SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK);
+       } else {
+               write_csr(dd, SEND_CM_CTRL,
+                         reg & ~SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK);
+       }
+
+       ppd->link_speed_active = 0;     /* invalid value */
+       if (dd->dc8051_ver < dc8051_ver(0, 20)) {
+               /*
+                * remote_tx_rate: 0 = 12.5G, 1 = 25G; any other value
+                * leaves link_speed_active at 0 for the check further down
+                */
+               switch (remote_tx_rate) {
+               case 0:
+                       ppd->link_speed_active = OPA_LINK_SPEED_12_5G;
+                       break;
+               case 1:
+                       ppd->link_speed_active = OPA_LINK_SPEED_25G;
+                       break;
+               }
+       } else {
+               /* actual rate is highest bit of the ANDed rates */
+               u8 rate = remote_tx_rate & ppd->local_tx_rate;
+
+               if (rate & 2)
+                       ppd->link_speed_active = OPA_LINK_SPEED_25G;
+               else if (rate & 1)
+                       ppd->link_speed_active = OPA_LINK_SPEED_12_5G;
+       }
+       if (ppd->link_speed_active == 0) { /* neither path found a rate */
+               dd_dev_err(dd, "%s: unexpected remote tx rate %d, using 25Gb\n",
+                          __func__, (int)remote_tx_rate);
+               ppd->link_speed_active = OPA_LINK_SPEED_25G;
+       }
+
+       /*
+        * Cache the values of the supported, enabled, and active
+        * LTP CRC modes to return in 'portinfo' queries. But the bit
+        * flags that are returned in the portinfo query differ from
+        * what's in the link_crc_mask, ppd->port_crc_mode_enabled, and
+        * crc_val values. Convert these here.  The result is packed as
+        * supported << 8 | enabled << 4 | active.
+        */
+       ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8;
+               /* supported crc modes */
+       ppd->port_ltp_crc_mode |=
+               cap_to_port_ltp(ppd->port_crc_mode_enabled) << 4;
+               /* enabled crc modes */
+       ppd->port_ltp_crc_mode |= lcb_to_port_ltp(crc_val);
+               /* active crc mode */
+
+       /* set up the remote credit return table */
+       assign_remote_cm_au_table(dd, vcu);
+
+       /*
+        * The LCB is reset on entry to handle_verify_cap(), so this must
+        * be applied on every link up.
+        *
+        * Adjust LCB error kill enable to kill the link if
+        * these RBUF errors are seen:
+        *      REPLAY_BUF_MBE_SMASK
+        *      FLIT_INPUT_BUF_MBE_SMASK
+        */
+       if (is_ax(dd)) {                        /* fixed in B0 */
+               reg = read_csr(dd, DC_LCB_CFG_LINK_KILL_EN);
+               reg |= DC_LCB_CFG_LINK_KILL_EN_REPLAY_BUF_MBE_SMASK
+                       | DC_LCB_CFG_LINK_KILL_EN_FLIT_INPUT_BUF_MBE_SMASK;
+               write_csr(dd, DC_LCB_CFG_LINK_KILL_EN, reg);
+       }
+
+       /* pull LCB fifos out of reset - all fifo clocks must be stable */
+       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0);
+
+       /* give 8051 access to the LCB CSRs */
+       write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */
+       set_8051_lcb_access(dd);
+
+       ppd->neighbor_guid =
+               read_csr(dd, DC_DC8051_STS_REMOTE_GUID);
+       ppd->neighbor_port_number = read_csr(dd, DC_DC8051_STS_REMOTE_PORT_NO) &
+                                       DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK;
+       ppd->neighbor_type =
+               read_csr(dd, DC_DC8051_STS_REMOTE_NODE_TYPE) &
+               DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK;
+       ppd->neighbor_fm_security =
+               read_csr(dd, DC_DC8051_STS_REMOTE_FM_SECURITY) &
+               DC_DC8051_STS_LOCAL_FM_SECURITY_DISABLED_MASK;
+       dd_dev_info(dd,
+                   "Neighbor Guid: %llx Neighbor type %d MgmtAllowed %d FM security bypass %d\n",
+                   ppd->neighbor_guid, ppd->neighbor_type,
+                   ppd->mgmt_allowed, ppd->neighbor_fm_security);
+       if (ppd->mgmt_allowed)
+               add_full_mgmt_pkey(ppd);
+
+       /* tell the 8051 to go to LinkUp */
+       set_link_state(ppd, HLS_GOING_UP);
+}
+
+/*
+ * Apply the link width downgrade enabled policy against the current active
+ * link widths.
+ *
+ * Called when the enabled policy changes or the active link widths change.
+ *
+ * @ppd: the port to check
+ * @refresh_widths: when non-zero, re-read the active tx/rx widths with
+ *     get_link_widths() and store them in
+ *     ppd->link_width_downgrade_{tx,rx}_active before applying the policy
+ *
+ * The policy is evaluated with ppd->hls_lock held.  If it decides the
+ * link must be bounced, the bounce itself (offline, tune_serdes(),
+ * start_link()) is carried out after the lock has been released.
+ */
+void apply_link_downgrade_policy(struct hfi1_pportdata *ppd, int refresh_widths)
+{
+       int do_bounce = 0;
+       int tries;
+       u16 lwde; /* snapshot of ppd->link_width_downgrade_enabled */
+       u16 tx, rx;
+
+       /* use the hls lock to avoid a race with actual link up */
+       tries = 0;
+retry:
+       mutex_lock(&ppd->hls_lock);
+       /* only apply if the link is up */
+       if (ppd->host_link_state & HLS_DOWN) {
+               /*
+                * still going up..wait and retry; the lock is dropped
+                * around the sleep, and after 1000 tries this gives up
+                * with an error
+                */
+               if (ppd->host_link_state & HLS_GOING_UP) {
+                       if (++tries < 1000) {
+                               mutex_unlock(&ppd->hls_lock);
+                               usleep_range(100, 120); /* arbitrary */
+                               goto retry;
+                       }
+                       dd_dev_err(ppd->dd,
+                                  "%s: giving up waiting for link state change\n",
+                                  __func__);
+               }
+               goto done;
+       }
+
+       lwde = ppd->link_width_downgrade_enabled;
+
+       if (refresh_widths) {
+               get_link_widths(ppd->dd, &tx, &rx);
+               ppd->link_width_downgrade_tx_active = tx;
+               ppd->link_width_downgrade_rx_active = rx;
+       }
+
+       if (ppd->link_width_downgrade_tx_active == 0 ||
+           ppd->link_width_downgrade_rx_active == 0) {
+               /* the 8051 reported a dead link as a downgrade */
+               dd_dev_err(ppd->dd, "Link downgrade is really a link down, ignoring\n");
+       } else if (lwde == 0) {
+               /* downgrade is disabled */
+
+               /* bounce if not at starting active width */
+               if ((ppd->link_width_active !=
+                    ppd->link_width_downgrade_tx_active) ||
+                   (ppd->link_width_active !=
+                    ppd->link_width_downgrade_rx_active)) {
+                       dd_dev_err(ppd->dd,
+                                  "Link downgrade is disabled and link has downgraded, downing link\n");
+                       dd_dev_err(ppd->dd,
+                                  "  original 0x%x, tx active 0x%x, rx active 0x%x\n",
+                                  ppd->link_width_active,
+                                  ppd->link_width_downgrade_tx_active,
+                                  ppd->link_width_downgrade_rx_active);
+                       do_bounce = 1;
+               }
+       } else if ((lwde & ppd->link_width_downgrade_tx_active) == 0 ||
+                  (lwde & ppd->link_width_downgrade_rx_active) == 0) {
+               /* Tx or Rx is outside the enabled policy */
+               dd_dev_err(ppd->dd,
+                          "Link is outside of downgrade allowed, downing link\n");
+               dd_dev_err(ppd->dd,
+                          "  enabled 0x%x, tx active 0x%x, rx active 0x%x\n",
+                          lwde, ppd->link_width_downgrade_tx_active,
+                          ppd->link_width_downgrade_rx_active);
+               do_bounce = 1;
+       }
+
+done:
+       mutex_unlock(&ppd->hls_lock);
+
+       if (do_bounce) { /* performed with hls_lock already released */
+               set_link_down_reason(ppd, OPA_LINKDOWN_REASON_WIDTH_POLICY, 0,
+                                    OPA_LINKDOWN_REASON_WIDTH_POLICY);
+               set_link_state(ppd, HLS_DN_OFFLINE);
+               tune_serdes(ppd);
+               start_link(ppd);
+       }
+}
+
+/*
+ * Work-queue handler for the 8051 "link width downgrade" interrupt.
+ *
+ * Runs from a work queue, not from the interrupt itself.  Logs the event
+ * and re-applies the downgrade policy, asking it to refresh the active
+ * widths first.
+ */
+void handle_link_downgrade(struct work_struct *work)
+{
+       struct hfi1_pportdata *ppd;
+
+       ppd = container_of(work, struct hfi1_pportdata, link_downgrade_work);
+
+       dd_dev_info(ppd->dd, "8051: Link width downgrade\n");
+       apply_link_downgrade_policy(ppd, 1);
+}
+
+/*
+ * Thin wrapper: hands buf, buf_len and flags to flag_string() together
+ * with the dcc_err_flags table and its element count, and returns
+ * whatever flag_string() returns.
+ */
+static char *dcc_err_string(char *buf, int buf_len, u64 flags)
+{
+       char *text;
+
+       text = flag_string(buf, buf_len, flags, dcc_err_flags,
+                          ARRAY_SIZE(dcc_err_flags));
+       return text;
+}
+
+/*
+ * Thin wrapper: hands buf, buf_len and flags to flag_string() together
+ * with the lcb_err_flags table and its element count, and returns
+ * whatever flag_string() returns.
+ */
+static char *lcb_err_string(char *buf, int buf_len, u64 flags)
+{
+       char *text;
+
+       text = flag_string(buf, buf_len, flags, lcb_err_flags,
+                          ARRAY_SIZE(lcb_err_flags));
+       return text;
+}
+
+/*
+ * Thin wrapper: hands buf, buf_len and flags to flag_string() together
+ * with the dc8051_err_flags table and its element count, and returns
+ * whatever flag_string() returns.
+ */
+static char *dc8051_err_string(char *buf, int buf_len, u64 flags)
+{
+       char *text;
+
+       text = flag_string(buf, buf_len, flags, dc8051_err_flags,
+                          ARRAY_SIZE(dc8051_err_flags));
+       return text;
+}
+
+/*
+ * Thin wrapper: hands buf, buf_len and flags to flag_string() together
+ * with the dc8051_info_err_flags table and its element count, and
+ * returns whatever flag_string() returns.
+ */
+static char *dc8051_info_err_string(char *buf, int buf_len, u64 flags)
+{
+       char *text;
+
+       text = flag_string(buf, buf_len, flags, dc8051_info_err_flags,
+                          ARRAY_SIZE(dc8051_info_err_flags));
+       return text;
+}
+
+/*
+ * Thin wrapper: hands buf, buf_len and flags to flag_string() together
+ * with the dc8051_info_host_msg_flags table and its element count, and
+ * returns whatever flag_string() returns.
+ */
+static char *dc8051_info_host_msg_string(char *buf, int buf_len, u64 flags)
+{
+       char *text;
+
+       text = flag_string(buf, buf_len, flags, dc8051_info_host_msg_flags,
+                          ARRAY_SIZE(dc8051_info_host_msg_flags));
+       return text;
+}
+
+static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       struct hfi1_pportdata *ppd = dd->pport;
+       u64 info, err, host_msg;
+       int queue_link_down = 0;
+       char buf[96];
+
+       /* look at the flags */
+       if (reg & DC_DC8051_ERR_FLG_SET_BY_8051_SMASK) {
+               /* 8051 information set by firmware */
+               /* read DC8051_DBG_ERR_INFO_SET_BY_8051 for details */
+               info = read_csr(dd, DC_DC8051_DBG_ERR_INFO_SET_BY_8051);
+               err = (info >> DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_SHIFT)
+                       & DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_MASK;
+               host_msg = (info >>
+                       DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_SHIFT)
+                       & DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_MASK;
+
+               /*
+                * Handle error flags.
+                */
+               if (err & FAILED_LNI) {
+                       /*
+                        * LNI error indications are cleared by the 8051
+                        * only when starting polling.  Only pay attention
+                        * to them when in the states that occur during
+                        * LNI.
+                        */
+                       if (ppd->host_link_state
+                           & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) {
+                               queue_link_down = 1;
+                               dd_dev_info(dd, "Link error: %s\n",
+                                           dc8051_info_err_string(buf,
+                                                                  sizeof(buf),
+                                                                  err &
+                                                                  FAILED_LNI));
+                       }
+                       err &= ~(u64)FAILED_LNI;
+               }
+               /* unknown frames can happen during LNI, just count */
+               if (err & UNKNOWN_FRAME) {
+                       ppd->unknown_frame_count++;
+                       err &= ~(u64)UNKNOWN_FRAME;
+               }
+               if (err) {
+                       /* report remaining errors, but do not do anything */
+                       dd_dev_err(dd, "8051 info error: %s\n",
+                                  dc8051_info_err_string(buf, sizeof(buf),
+                                                         err));
+               }
+
+               /*
+                * Handle host message flags.
+                */
+               if (host_msg & HOST_REQ_DONE) {
+                       /*
+                        * Presently, the driver does a busy wait for
+                        * host requests to complete.  This is only an
+                        * informational message.
+                        * NOTE: The 8051 clears the host message
+                        * information *on the next 8051 command*.
+                        * Therefore, when linkup is achieved,
+                        * this flag will still be set.
+                        */
+                       host_msg &= ~(u64)HOST_REQ_DONE;
+               }
+               if (host_msg & BC_SMA_MSG) {
+                       queue_work(ppd->hfi1_wq, &ppd->sma_message_work);
+                       host_msg &= ~(u64)BC_SMA_MSG;
+               }
+               if (host_msg & LINKUP_ACHIEVED) {
+                       dd_dev_info(dd, "8051: Link up\n");
+                       queue_work(ppd->hfi1_wq, &ppd->link_up_work);
+                       host_msg &= ~(u64)LINKUP_ACHIEVED;
+               }
+               if (host_msg & EXT_DEVICE_CFG_REQ) {
+                       handle_8051_request(ppd);
+                       host_msg &= ~(u64)EXT_DEVICE_CFG_REQ;
+               }
+               if (host_msg & VERIFY_CAP_FRAME) {
+                       queue_work(ppd->hfi1_wq, &ppd->link_vc_work);
+                       host_msg &= ~(u64)VERIFY_CAP_FRAME;
+               }
+               if (host_msg & LINK_GOING_DOWN) {
+                       const char *extra = "";
+                       /* no downgrade action needed if going down */
+                       if (host_msg & LINK_WIDTH_DOWNGRADED) {
+                               host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED;
+                               extra = " (ignoring downgrade)";
+                       }
+                       dd_dev_info(dd, "8051: Link down%s\n", extra);
+                       queue_link_down = 1;
+                       host_msg &= ~(u64)LINK_GOING_DOWN;
+               }
+               if (host_msg & LINK_WIDTH_DOWNGRADED) {
+                       queue_work(ppd->hfi1_wq, &ppd->link_downgrade_work);
+                       host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED;
+               }
+               if (host_msg) {
+                       /* report remaining messages, but do not do anything */
+                       dd_dev_info(dd, "8051 info host message: %s\n",
+                                   dc8051_info_host_msg_string(buf,
+                                                               sizeof(buf),
+                                                               host_msg));
+               }
+
+               reg &= ~DC_DC8051_ERR_FLG_SET_BY_8051_SMASK;
+       }
+       if (reg & DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK) {
+               /*
+                * Lost the 8051 heartbeat.  If this happens, we
+                * receive constant interrupts about it.  Disable
+                * the interrupt after the first.
+                */
+               dd_dev_err(dd, "Lost 8051 heartbeat\n");
+               write_csr(dd, DC_DC8051_ERR_EN,
+                         read_csr(dd, DC_DC8051_ERR_EN) &
+                         ~DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK);
+
+               reg &= ~DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK;
+       }
+       if (reg) {
+               /* report the error, but do not do anything */
+               dd_dev_err(dd, "8051 error: %s\n",
+                          dc8051_err_string(buf, sizeof(buf), reg));
+       }
+
+       if (queue_link_down) {
+               /*
+                * if the link is already going down or disabled, do not
+                * queue another
+                */
+               if ((ppd->host_link_state &
+                   (HLS_GOING_OFFLINE | HLS_LINK_COOLDOWN)) ||
+                   ppd->link_enabled == 0) {
+                       dd_dev_info(dd, "%s: not queuing link down\n",
+                                   __func__);
+               } else {
+                       queue_work(ppd->hfi1_wq, &ppd->link_down_work);
+               }
+       }
+}
+
+/*
+ * Descriptions of the FMConfig error codes, indexed by code.
+ * Code 7 has no description, so that slot is left NULL.
+ */
+static const char * const fm_config_txt[] = {
+       [0] = "BadHeadDist: Distance violation between two head flits",
+       [1] = "BadTailDist: Distance violation between two tail flits",
+       [2] = "BadCtrlDist: Distance violation between two credit control flits",
+       [3] = "BadCrdAck: Credits return for unsupported VL",
+       [4] = "UnsupportedVLMarker: Received VL Marker",
+       [5] = "BadPreempt: Exceeded the preemption nesting level",
+       [6] = "BadControlFlit: Received unsupported control flit",
+       /* no 7 */
+       [8] = "UnsupportedVLMarker: Received VL Marker for unconfigured or disabled VL",
+};
+
+/*
+ * Descriptions of the PortRcv error codes, indexed by code.
+ * Codes without an entry (0, 8, 10) are left NULL.
+ */
+static const char * const port_rcv_txt[] = {
+       [1]  = "BadPktLen: Illegal PktLen",
+       [2]  = "PktLenTooLong: Packet longer than PktLen",
+       [3]  = "PktLenTooShort: Packet shorter than PktLen",
+       [4]  = "BadSLID: Illegal SLID (0, using multicast as SLID, does not include security validation of SLID)",
+       [5]  = "BadDLID: Illegal DLID (0, doesn't match HFI)",
+       [6]  = "BadL2: Illegal L2 opcode",
+       [7]  = "BadSC: Unsupported SC",
+       [9]  = "BadRC: Illegal RC",
+       [11] = "PreemptError: Preempting with same VL",
+       [12] = "PreemptVL15: Preempting a VL15 packet",
+};
+
+/*
+ * Bit offsets into ppd->port_error_action: the action bit tested for an
+ * error is (offset + error code).
+ */
+#define OPA_LDR_FMCONFIG_OFFSET 16
+#define OPA_LDR_PORTRCV_OFFSET 0
+/*
+ * DCC error handler.
+ *
+ * Each known flag in @reg is handled in turn and then cleared from @reg;
+ * whatever remains at the end is reported as a generic DCC error.
+ *
+ * FMConfig and PortRcv errors are logged and, when the corresponding bit
+ * is set in ppd->port_error_action, cause the link to be bounced with a
+ * link down reason derived from the error code.
+ *
+ * @dd: device data
+ * @unused: not used by this handler
+ * @reg: DCC error flags to process
+ */
+static void handle_dcc_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       u64 info, hdr0, hdr1;
+       const char *extra;
+       char buf[96];
+       struct hfi1_pportdata *ppd = dd->pport;
+       u8 lcl_reason = 0;
+       int do_bounce = 0;
+
+       if (reg & DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK) {
+               /* only latch the first error; status bit marks it valid */
+               if (!(dd->err_info_uncorrectable & OPA_EI_STATUS_SMASK)) {
+                       info = read_csr(dd, DCC_ERR_INFO_UNCORRECTABLE);
+                       dd->err_info_uncorrectable = info & OPA_EI_CODE_SMASK;
+                       /* set status bit */
+                       dd->err_info_uncorrectable |= OPA_EI_STATUS_SMASK;
+               }
+               reg &= ~DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK;
+       }
+
+       if (reg & DCC_ERR_FLG_LINK_ERR_SMASK) {
+               /* this counter saturates at (2^32) - 1 */
+               if (ppd->link_downed < (u32)UINT_MAX)
+                       ppd->link_downed++;
+               reg &= ~DCC_ERR_FLG_LINK_ERR_SMASK;
+       }
+
+       if (reg & DCC_ERR_FLG_FMCONFIG_ERR_SMASK) {
+               u8 reason_valid = 1;
+
+               info = read_csr(dd, DCC_ERR_INFO_FMCONFIG);
+               /* only latch the first error; status bit marks it valid */
+               if (!(dd->err_info_fmconfig & OPA_EI_STATUS_SMASK)) {
+                       dd->err_info_fmconfig = info & OPA_EI_CODE_SMASK;
+                       /* set status bit */
+                       dd->err_info_fmconfig |= OPA_EI_STATUS_SMASK;
+               }
+               switch (info) {
+               case 0:
+               case 1:
+               case 2:
+               case 3:
+               case 4:
+               case 5:
+               case 6:
+                       extra = fm_config_txt[info];
+                       break;
+               case 8:
+                       extra = fm_config_txt[info];
+                       if (ppd->port_error_action &
+                           OPA_PI_MASK_FM_CFG_UNSUPPORTED_VL_MARKER) {
+                               do_bounce = 1;
+                               /*
+                                * lcl_reason cannot be derived from info
+                                * for this error
+                                */
+                               lcl_reason =
+                                 OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER;
+                       }
+                       break;
+               default:
+                       reason_valid = 0;
+                       /* info is unsigned 64-bit: print it as such */
+                       snprintf(buf, sizeof(buf), "reserved%llu", info);
+                       extra = buf;
+                       break;
+               }
+
+               if (reason_valid && !do_bounce) {
+                       do_bounce = ppd->port_error_action &
+                                       (1 << (OPA_LDR_FMCONFIG_OFFSET + info));
+                       lcl_reason = info + OPA_LINKDOWN_REASON_BAD_HEAD_DIST;
+               }
+
+               /* just report this */
+               dd_dev_info(dd, "DCC Error: fmconfig error: %s\n", extra);
+               reg &= ~DCC_ERR_FLG_FMCONFIG_ERR_SMASK;
+       }
+
+       if (reg & DCC_ERR_FLG_RCVPORT_ERR_SMASK) {
+               u8 reason_valid = 1;
+
+               info = read_csr(dd, DCC_ERR_INFO_PORTRCV);
+               hdr0 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR0);
+               hdr1 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR1);
+               /* only latch the first error; status bit marks it valid */
+               if (!(dd->err_info_rcvport.status_and_code &
+                     OPA_EI_STATUS_SMASK)) {
+                       dd->err_info_rcvport.status_and_code =
+                               info & OPA_EI_CODE_SMASK;
+                       /* set status bit */
+                       dd->err_info_rcvport.status_and_code |=
+                               OPA_EI_STATUS_SMASK;
+                       /*
+                        * save first 2 flits in the packet that caused
+                        * the error
+                        */
+                       dd->err_info_rcvport.packet_flit1 = hdr0;
+                       dd->err_info_rcvport.packet_flit2 = hdr1;
+               }
+               switch (info) {
+               case 1:
+               case 2:
+               case 3:
+               case 4:
+               case 5:
+               case 6:
+               case 7:
+               case 9:
+               case 11:
+               case 12:
+                       extra = port_rcv_txt[info];
+                       break;
+               default:
+                       reason_valid = 0;
+                       /* info is unsigned 64-bit: print it as such */
+                       snprintf(buf, sizeof(buf), "reserved%llu", info);
+                       extra = buf;
+                       break;
+               }
+
+               if (reason_valid && !do_bounce) {
+                       do_bounce = ppd->port_error_action &
+                                       (1 << (OPA_LDR_PORTRCV_OFFSET + info));
+                       lcl_reason = info + OPA_LINKDOWN_REASON_RCV_ERROR_0;
+               }
+
+               /* just report this */
+               dd_dev_info(dd, "DCC Error: PortRcv error: %s\n", extra);
+               dd_dev_info(dd, "           hdr0 0x%llx, hdr1 0x%llx\n",
+                           hdr0, hdr1);
+
+               reg &= ~DCC_ERR_FLG_RCVPORT_ERR_SMASK;
+       }
+
+       if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK) {
+               /* informative only */
+               dd_dev_info(dd, "8051 access to LCB blocked\n");
+               reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK;
+       }
+       if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK) {
+               /* informative only */
+               dd_dev_info(dd, "host access to LCB blocked\n");
+               reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK;
+       }
+
+       /* report any remaining errors */
+       if (reg)
+               dd_dev_info(dd, "DCC Error: %s\n",
+                           dcc_err_string(buf, sizeof(buf), reg));
+
+       /* no error above supplied a reason: fall back to "unknown" */
+       if (lcl_reason == 0)
+               lcl_reason = OPA_LINKDOWN_REASON_UNKNOWN;
+
+       if (do_bounce) {
+               dd_dev_info(dd, "%s: PortErrorAction bounce\n", __func__);
+               set_link_down_reason(ppd, lcl_reason, 0, lcl_reason);
+               queue_work(ppd->hfi1_wq, &ppd->link_bounce_work);
+       }
+}
+
+/*
+ * LCB error handler: format @reg with lcb_err_string() and log the result.
+ * The @unused argument is not referenced.
+ */
+static void handle_lcb_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       char msg[96];
+       const char *text = lcb_err_string(msg, sizeof(msg), reg);
+
+       dd_dev_info(dd, "LCB Error: %s\n", text);
+}
+
+/*
+ * CCE block DC interrupt.  Source is < 8.
+ *
+ * Sources with a handler registered in dc_errs[] are cleared down through
+ * interrupt_clear_down(); source 3 (dc_lbm_int) and anything else are only
+ * logged.
+ */
+static void is_dc_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       const struct err_reg_info *info = &dc_errs[source];
+
+       if (info->handler) {
+               interrupt_clear_down(dd, 0, info);
+               return;
+       }
+
+       if (source != 3 /* dc_lbm_int */) {
+               dd_dev_err(dd, "Invalid DC interrupt %u\n", source);
+               return;
+       }
+
+       /*
+        * A parity error has occurred on the address/control lines
+        * presented to the LBM.  The error is a single pulse with no
+        * associated error flag, and it is non-maskable: a request that
+        * takes a parity error is simply dropped.  This should never
+        * occur, but it is nice to know if it ever does.
+        */
+       dd_dev_err(dd, "Parity error in DC LBM block\n");
+}
+
+/*
+ * TX block send credit interrupt.  Source is < 160.
+ *
+ * Simply forwards @source to sc_group_release_update().
+ */
+static void is_send_credit_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       sc_group_release_update(dd, source);
+}
+
+/*
+ * TX block SDMA interrupt.  Source is < 48.
+ *
+ * SDMA interrupts are grouped by type:
+ *
+ *      0 -  N-1 = SDma
+ *      N - 2N-1 = SDmaProgress
+ *     2N - 3N-1 = SDmaIdle
+ *
+ * The source is split into an interrupt type (source / N) and an engine
+ * index (source % N), with N == TXE_NUM_SDMA_ENGINES.  The engine is only
+ * touched when its index is below dd->num_sdma.
+ */
+static void is_sdma_eng_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       /* what interrupt */
+       unsigned int what  = source / TXE_NUM_SDMA_ENGINES;
+       /* which engine */
+       unsigned int which = source % TXE_NUM_SDMA_ENGINES;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", which,
+                  slashstrip(__FILE__), __LINE__, __func__);
+       /*
+        * "which" has not been validated yet; apply the same bound the
+        * normal path below uses before indexing per_sdma[].
+        */
+       if (which < dd->num_sdma)
+               sdma_dumpstate(&dd->per_sdma[which]);
+#endif
+
+       if (likely(what < 3 && which < dd->num_sdma)) {
+               sdma_engine_interrupt(&dd->per_sdma[which], 1ull << source);
+       } else {
+               /* should not happen */
+               dd_dev_err(dd, "Invalid SDMA interrupt 0x%x\n", source);
+       }
+}
+
+/*
+ * RX block receive available interrupt.  Source is < 160.
+ *
+ * Contexts below dd->first_user_ctxt are serviced through the context's
+ * do_interrupt() hook; the rest go to handle_user_interrupt().  An
+ * interrupt for an unused or unallocated context is logged.
+ */
+static void is_rcv_avail_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       /* assume the context is not in use until proven otherwise */
+       const char *problem = "out of range";
+
+       if (likely(source < dd->num_rcv_contexts)) {
+               struct hfi1_ctxtdata *ctxt = dd->rcd[source];
+
+               if (ctxt) {
+                       if (source >= dd->first_user_ctxt)
+                               handle_user_interrupt(ctxt);
+                       else
+                               ctxt->do_interrupt(ctxt, 0);
+                       return; /* OK */
+               }
+               /* received an interrupt, but no rcd */
+               problem = "dataless";
+       }
+
+       dd_dev_err(dd, "unexpected %s receive available context interrupt %u\n",
+                  problem, source);
+}
+
+/*
+ * RX block receive urgent interrupt.  Source is < 160.
+ *
+ * Only user contexts (source >= dd->first_user_ctxt) act on urgent
+ * interrupts; for other valid contexts the interrupt is silently accepted.
+ * An interrupt for an unused or unallocated context is logged.
+ */
+static void is_rcv_urgent_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       /* assume the context is not in use until proven otherwise */
+       const char *problem = "out of range";
+
+       if (likely(source < dd->num_rcv_contexts)) {
+               struct hfi1_ctxtdata *ctxt = dd->rcd[source];
+
+               if (ctxt) {
+                       /* only pay attention to user urgent interrupts */
+                       if (source >= dd->first_user_ctxt)
+                               handle_user_interrupt(ctxt);
+                       return; /* OK */
+               }
+               /* received an interrupt, but no rcd */
+               problem = "dataless";
+       }
+
+       dd_dev_err(dd, "unexpected %s receive urgent context interrupt %u\n",
+                  problem, source);
+}
+
+/*
+ * Reserved range interrupt.  Should not be called in normal operation;
+ * the interrupt is only logged, using the name from is_reserved_name().
+ */
+static void is_reserved_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       char label[64];
+       const char *text = is_reserved_name(label, sizeof(label), source);
+
+       dd_dev_err(dd, "unexpected %s interrupt\n", text);
+}
+
+/*
+ * Interrupt source dispatch table.
+ *
+ * Each entry covers the source range [start, end) and supplies a name
+ * function and an interrupt function for it.  is_interrupt() walks the
+ * table in order and picks the first entry whose end is above the source,
+ * so entries must stay sorted by ascending range.
+ *
+ * NOTE: the table has no terminating (NULL name) entry.
+ */
+static const struct is_table is_table[] = {
+/*
+ * start                end
+ *                             name func               interrupt func
+ */
+{ IS_GENERAL_ERR_START,  IS_GENERAL_ERR_END,
+                               is_misc_err_name,       is_misc_err_int },
+{ IS_SDMAENG_ERR_START,  IS_SDMAENG_ERR_END,
+                               is_sdma_eng_err_name,   is_sdma_eng_err_int },
+{ IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END,
+                               is_sendctxt_err_name,   is_sendctxt_err_int },
+{ IS_SDMA_START,            IS_SDMA_END,
+                               is_sdma_eng_name,       is_sdma_eng_int },
+{ IS_VARIOUS_START,         IS_VARIOUS_END,
+                               is_various_name,        is_various_int },
+{ IS_DC_START,      IS_DC_END,
+                               is_dc_name,             is_dc_int },
+{ IS_RCVAVAIL_START,     IS_RCVAVAIL_END,
+                               is_rcv_avail_name,      is_rcv_avail_int },
+{ IS_RCVURGENT_START,    IS_RCVURGENT_END,
+                               is_rcv_urgent_name,     is_rcv_urgent_int },
+{ IS_SENDCREDIT_START,   IS_SENDCREDIT_END,
+                               is_send_credit_name,    is_send_credit_int},
+{ IS_RESERVED_START,     IS_RESERVED_END,
+                               is_reserved_name,       is_reserved_int},
+};
+
+/*
+ * Interrupt source interrupt - called when the given source has an interrupt.
+ * Source is a bit index into an array of 64-bit integers.
+ *
+ * Dispatches to the handler of the first is_table[] entry whose range ends
+ * above @source, passing the source relative to that entry's start.  A
+ * source beyond every table entry is logged as invalid.
+ */
+static void is_interrupt(struct hfi1_devdata *dd, unsigned int source)
+{
+       const struct is_table *entry = &is_table[0];
+       /*
+        * is_table[] has no NULL terminator, so bound the walk by the
+        * array size rather than relying on a sentinel entry.
+        */
+       const struct is_table *const table_end =
+               entry + sizeof(is_table) / sizeof(is_table[0]);
+
+       /* avoids a double compare by walking the table in-order */
+       for (; entry < table_end && entry->is_name; entry++) {
+               if (source < entry->end) {
+                       trace_hfi1_interrupt(dd, entry, source);
+                       entry->is_int(dd, source - entry->start);
+                       return;
+               }
+       }
+       /* fell off the end */
+       dd_dev_err(dd, "invalid interrupt source %u\n", source);
+}
+
+/*
+ * General interrupt handler.  This is able to correctly handle
+ * all interrupts in case INTx is used.
+ *
+ * Works in two passes: first snapshot and clear every pending source
+ * selected by dd->gi_mask[], then dispatch each snapshotted source through
+ * is_interrupt().
+ */
+static irqreturn_t general_interrupt(int irq, void *data)
+{
+       struct hfi1_devdata *dd = data;
+       u64 pending[CCE_NUM_INT_CSRS];
+       u32 src;
+       int csr;
+
+       this_cpu_inc(*dd->int_counter);
+
+       /* phase 1: scan and clear all handled interrupts */
+       for (csr = 0; csr < CCE_NUM_INT_CSRS; csr++) {
+               u64 active = 0;
+
+               /* skip the CSR read entirely when nothing is enabled */
+               if (dd->gi_mask[csr] != 0)
+                       active = read_csr(dd, CCE_INT_STATUS + (8 * csr)) &
+                                       dd->gi_mask[csr];
+
+               pending[csr] = active;  /* used later */
+
+               /* only clear if anything is set */
+               if (active)
+                       write_csr(dd, CCE_INT_CLEAR + (8 * csr), active);
+       }
+
+       /* phase 2: call the appropriate handler */
+       for_each_set_bit(src, (unsigned long *)&pending[0],
+                        CCE_NUM_INT_CSRS * 64) {
+               is_interrupt(dd, src);
+       }
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * Per-engine SDMA interrupt handler.
+ *
+ * Reads the interrupt status CSR that holds the SDMA sources, keeps only
+ * this engine's bits (sde->imask), clears them and passes them on to
+ * sdma_engine_interrupt().  An interrupt with none of the engine's bits
+ * set is logged.
+ */
+static irqreturn_t sdma_interrupt(int irq, void *data)
+{
+       struct sdma_engine *sde = data;
+       struct hfi1_devdata *dd = sde->dd;
+       u64 pending;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
+                  slashstrip(__FILE__), __LINE__, __func__);
+       sdma_dumpstate(sde);
+#endif
+
+       this_cpu_inc(*dd->int_counter);
+
+       /* This read_csr is really bad in the hot path */
+       pending = read_csr(dd, CCE_INT_STATUS + (8 * (IS_SDMA_START / 64)));
+       pending &= sde->imask;
+
+       if (likely(pending)) {
+               /* clear the interrupt(s) */
+               write_csr(dd, CCE_INT_CLEAR + (8 * (IS_SDMA_START / 64)),
+                         pending);
+
+               /* handle the interrupt(s) */
+               sdma_engine_interrupt(sde, pending);
+               return IRQ_HANDLED;
+       }
+
+       dd_dev_err(dd, "SDMA engine %u interrupt, but no status bits set\n",
+                  sde->this_idx);
+       return IRQ_HANDLED;
+}
+
+/*
+ * Clear the receive interrupt for a context.
+ *
+ * The interrupt clear CSR is read back after the write to ensure that the
+ * write completed.  This does NOT guarantee that queued DMA writes to
+ * memory from the chip are pushed.
+ */
+static inline void clear_recv_intr(struct hfi1_ctxtdata *rcd)
+{
+       const u32 clear_csr = CCE_INT_CLEAR + (8 * rcd->ireg);
+
+       /* make sure everything before is written */
+       mmiowb();
+       write_csr(rcd->dd, clear_csr, rcd->imask);
+       /* force the above write on the chip and get a value back */
+       (void)read_csr(rcd->dd, clear_csr);
+}
+
+/*
+ * Force the receive interrupt for a context by writing the context's
+ * interrupt mask to its interrupt force CSR.
+ */
+void force_recv_intr(struct hfi1_ctxtdata *rcd)
+{
+       struct hfi1_devdata *dd = rcd->dd;
+       u32 force_csr = CCE_INT_FORCE + (8 * rcd->ireg);
+
+       write_csr(dd, force_csr, rcd->imask);
+}
+
+/*
+ * Return non-zero if a packet is present.
+ *
+ * Called when rechecking for packets after the RcvAvail interrupt has been
+ * cleared down.  Memory is checked first since that is cheap.  If nothing
+ * is found there, an expensive CSR read of the context tail gives the
+ * actual tail: there is no way to push pending DMAs to memory other than
+ * an interrupt, and the caller is deciding whether to force one.
+ */
+static inline int check_packet_present(struct hfi1_ctxtdata *rcd)
+{
+       u32 hw_tail;
+
+       if (HFI1_CAP_IS_KSET(DMA_RTAIL)) {
+               /* is RDMA rtail */
+               if (rcd->head != get_rcvhdrtail(rcd))
+                       return 1;
+       } else if (rcd->seq_cnt ==
+                  rhf_rcv_seq(rhf_to_cpu(get_rhf_addr(rcd)))) {
+               return 1;
+       }
+
+       /* fall back to a CSR read, correct independent of DMA_RTAIL */
+       hw_tail = (u32)read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL);
+       return rcd->head != hw_tail;
+}
+
+/*
+ * Receive packet IRQ handler.  This routine expects to be on its own IRQ.
+ *
+ * Packets are handled immediately where possible (latency); if too many
+ * are found, the thread handler is invoked instead (bandwidth).  The chip
+ * receive interrupt is *not* cleared down until this handler or the thread
+ * (if invoked) is finished, to avoid extra interrupts while packets are
+ * being processed anyway.
+ */
+static irqreturn_t receive_context_interrupt(int irq, void *data)
+{
+       struct hfi1_ctxtdata *rcd = data;
+       struct hfi1_devdata *dd = rcd->dd;
+
+       trace_hfi1_receive_interrupt(dd, rcd->ctxt);
+       this_cpu_inc(*dd->int_counter);
+       aspm_ctx_disable(rcd);
+
+       /*
+        * The receive interrupt remains blocked while processing packets.
+        * If too many packets were seen, hand off to the handler thread
+        * with the interrupt still blocked.
+        */
+       if (rcd->do_interrupt(rcd, 0) == RCV_PKT_LIMIT)
+               return IRQ_WAKE_THREAD;
+
+       /*
+        * The packet processor detected no more packets.  Clear the receive
+        * interrupt and recheck for a packet that may have arrived after
+        * the previous check and interrupt clear.  If a packet arrived,
+        * force another interrupt.
+        */
+       clear_recv_intr(rcd);
+       if (check_packet_present(rcd))
+               force_recv_intr(rcd);
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * Receive packet thread handler.  This expects to be invoked with the
+ * receive interrupt still blocked (left that way by the IRQ handler).
+ */
+static irqreturn_t receive_context_thread(int irq, void *data)
+{
+       struct hfi1_ctxtdata *rcd = data;
+
+       /* receive interrupt is still blocked from the IRQ handler */
+       (void)rcd->do_interrupt(rcd, 1);
+
+       /*
+        * The packet processor will only return if it detected no more
+        * packets.  With local IRQs held off, clear the interrupt and
+        * recheck for a packet that may have arrived after the previous
+        * check and the interrupt clear.  If one did, force another
+        * interrupt.
+        */
+       local_irq_disable();
+       clear_recv_intr(rcd);
+       if (check_packet_present(rcd))
+               force_recv_intr(rcd);
+       local_irq_enable();
+
+       return IRQ_HANDLED;
+}
+
+/* ========================================================================= */
+
+/* Return the PORT field of the DC_DC8051_STS_CUR_STATE CSR. */
+u32 read_physical_state(struct hfi1_devdata *dd)
+{
+       u64 cur_state = read_csr(dd, DC_DC8051_STS_CUR_STATE);
+
+       cur_state >>= DC_DC8051_STS_CUR_STATE_PORT_SHIFT;
+       return cur_state & DC_DC8051_STS_CUR_STATE_PORT_MASK;
+}
+
+/* Return the LINK_STATE field of the DCC_CFG_PORT_CONFIG CSR. */
+u32 read_logical_state(struct hfi1_devdata *dd)
+{
+       u64 port_cfg = read_csr(dd, DCC_CFG_PORT_CONFIG);
+
+       port_cfg >>= DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT;
+       return port_cfg & DCC_CFG_PORT_CONFIG_LINK_STATE_MASK;
+}
+
+/*
+ * Read-modify-write the DCC_CFG_PORT_CONFIG CSR, replacing its LINK_STATE
+ * field with @chip_lstate and leaving every other bit as read.
+ */
+static void set_logical_state(struct hfi1_devdata *dd, u32 chip_lstate)
+{
+       u64 port_cfg = read_csr(dd, DCC_CFG_PORT_CONFIG);
+       u64 new_state = (u64)chip_lstate << DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT;
+
+       /* clear current state, set new state */
+       port_cfg = (port_cfg & ~DCC_CFG_PORT_CONFIG_LINK_STATE_SMASK) |
+                       new_state;
+       write_csr(dd, DCC_CFG_PORT_CONFIG, port_cfg);
+}
+
+/*
+ * Use the 8051 to read a LCB CSR.
+ *
+ * On the functional simulator the CSR is instead read directly, bracketed
+ * by acquire_lcb_access()/release_lcb_access().
+ *
+ * Return 0 on success with the value in *data, -EBUSY on failure.
+ */
+static int read_lcb_via_8051(struct hfi1_devdata *dd, u32 addr, u64 *data)
+{
+       u32 lcb_index;
+
+       if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+               if (acquire_lcb_access(dd, 0) != 0)
+                       return -EBUSY;
+
+               *data = read_csr(dd, addr);
+               release_lcb_access(dd, 0);
+               return 0;
+       }
+
+       /* register is an index of LCB registers: (offset - base) / 8 */
+       lcb_index = (addr - DC_LCB_CFG_RUN) >> 3;
+       if (do_8051_command(dd, HCMD_READ_LCB_CSR, lcb_index, data) !=
+           HCMD_SUCCESS)
+               return -EBUSY;
+
+       return 0;
+}
+
+/*
+ * Read an LCB CSR.  Access may not be in host control, so check the host
+ * link state first.
+ * Return 0 on success, -EBUSY on failure.
+ */
+int read_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 *data)
+{
+       struct hfi1_pportdata *port = dd->pport;
+
+       /* if up, go through the 8051 for the value */
+       if (port->host_link_state & HLS_UP)
+               return read_lcb_via_8051(dd, addr, data);
+
+       /* unless going up or down, the host has access */
+       if (!(port->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE))) {
+               *data = read_csr(dd, addr);
+               return 0;
+       }
+
+       /* link is in transition: no access */
+       return -EBUSY;
+}
+
+/*
+ * Use the 8051 to write a LCB CSR.
+ *
+ * On the functional simulator, or when the 8051 firmware version is below
+ * 0.20, the CSR is instead written directly, bracketed by
+ * acquire_lcb_access()/release_lcb_access().
+ *
+ * Return 0 on success, -EBUSY on failure.
+ */
+static int write_lcb_via_8051(struct hfi1_devdata *dd, u32 addr, u64 data)
+{
+       u32 lcb_index;
+
+       if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR ||
+           (dd->dc8051_ver < dc8051_ver(0, 20))) {
+               if (acquire_lcb_access(dd, 0) != 0)
+                       return -EBUSY;
+
+               write_csr(dd, addr, data);
+               release_lcb_access(dd, 0);
+               return 0;
+       }
+
+       /* register is an index of LCB registers: (offset - base) / 8 */
+       lcb_index = (addr - DC_LCB_CFG_RUN) >> 3;
+       if (do_8051_command(dd, HCMD_WRITE_LCB_CSR, lcb_index, &data) !=
+           HCMD_SUCCESS)
+               return -EBUSY;
+
+       return 0;
+}
+
+/*
+ * Write an LCB CSR.  Access may not be in host control, so check the host
+ * link state first.
+ * Return 0 on success, -EBUSY on failure.
+ */
+int write_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 data)
+{
+       struct hfi1_pportdata *port = dd->pport;
+
+       /* if up, go through the 8051 for the value */
+       if (port->host_link_state & HLS_UP)
+               return write_lcb_via_8051(dd, addr, data);
+
+       /* unless going up or down, the host has access */
+       if (!(port->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE))) {
+               write_csr(dd, addr, data);
+               return 0;
+       }
+
+       /* link is in transition: no access */
+       return -EBUSY;
+}
+
+/*
+ * Returns:
+ *     < 0 = Linux error, not able to get access
+ *     > 0 = 8051 command RETURN_CODE
+ */
+static int do_8051_command(
+       struct hfi1_devdata *dd,
+       u32 type,
+       u64 in_data,
+       u64 *out_data)
+{
+       u64 reg, completed;
+       int return_code;
+       unsigned long flags;
+       unsigned long timeout;
+
+       hfi1_cdbg(DC8051, "type %d, data 0x%012llx", type, in_data);
+
+       /*
+        * Alternative to holding the lock for a long time:
+        * - keep busy wait - have other users bounce off
+        */
+       spin_lock_irqsave(&dd->dc8051_lock, flags);
+
+       /* We can't send any commands to the 8051 if it's in reset */
+       if (dd->dc_shutdown) {
+               return_code = -ENODEV;
+               goto fail;
+       }
+
+       /*
+        * If an 8051 host command timed out previously, then the 8051 is
+        * stuck.
+        *
+        * On first timeout, attempt to reset and restart the entire DC
+        * block (including 8051). (Is this too big of a hammer?)
+        *
+        * If the 8051 times out a second time, the reset did not bring it
+        * back to healthy life. In that case, fail any subsequent commands.
+        */
+       if (dd->dc8051_timed_out) {
+               if (dd->dc8051_timed_out > 1) {
+                       dd_dev_err(dd,
+                                  "Previous 8051 host command timed out, skipping command %u\n",
+                                  type);
+                       return_code = -ENXIO;
+                       goto fail;
+               }
+               spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+               dc_shutdown(dd);
+               dc_start(dd);
+               spin_lock_irqsave(&dd->dc8051_lock, flags);
+       }
+
+       /*
+        * If there is no timeout, then the 8051 command interface is
+        * waiting for a command.
+        */
+
+       /*
+        * When writing a LCB CSR, out_data contains the full value
+        * to be written, while in_data contains the relative LCB
+        * address in 7:0.  Do the work here, rather than the caller,
+        * of distributing the write data to where it needs to go:
+        *
+        * Write data
+        *   39:00 -> in_data[47:8]
+        *   47:40 -> DC8051_CFG_EXT_DEV_0.RETURN_CODE
+        *   63:48 -> DC8051_CFG_EXT_DEV_0.RSP_DATA
+        */
+       if (type == HCMD_WRITE_LCB_CSR) {
+               in_data |= ((*out_data) & 0xffffffffffull) << 8;
+               reg = ((((*out_data) >> 40) & 0xff) <<
+                               DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT)
+                     | ((((*out_data) >> 48) & 0xffff) <<
+                               DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT);
+               write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, reg);
+       }
+
+       /*
+        * Do two writes: the first to stabilize the type and req_data, the
+        * second to activate.
+        */
+       reg = ((u64)type & DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_MASK)
+                       << DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_SHIFT
+               | (in_data & DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_MASK)
+                       << DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_SHIFT;
+       write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg);
+       reg |= DC_DC8051_CFG_HOST_CMD_0_REQ_NEW_SMASK;
+       write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg);
+
+       /* wait for completion, alternate: interrupt */
+       timeout = jiffies + msecs_to_jiffies(DC8051_COMMAND_TIMEOUT);
+       while (1) {
+               reg = read_csr(dd, DC_DC8051_CFG_HOST_CMD_1);
+               completed = reg & DC_DC8051_CFG_HOST_CMD_1_COMPLETED_SMASK;
+               if (completed)
+                       break;
+               if (time_after(jiffies, timeout)) {
+                       dd->dc8051_timed_out++;
+                       dd_dev_err(dd, "8051 host command %u timeout\n", type);
+                       if (out_data)
+                               *out_data = 0;
+                       return_code = -ETIMEDOUT;
+                       goto fail;
+               }
+               udelay(2);
+       }
+
+       if (out_data) {
+               *out_data = (reg >> DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_SHIFT)
+                               & DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_MASK;
+               if (type == HCMD_READ_LCB_CSR) {
+                       /* top 16 bits are in a different register */
+                       *out_data |= (read_csr(dd, DC_DC8051_CFG_EXT_DEV_1)
+                               & DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SMASK)
+                               << (48
+                                   - DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT);
+               }
+       }
+       return_code = (reg >> DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_SHIFT)
+                               & DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_MASK;
+       dd->dc8051_timed_out = 0;
+       /*
+        * Clear command for next user.
+        */
+       write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, 0);
+
+fail:
+       spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+
+       return return_code;
+}
+
+/*
+ * Issue an HCMD_CHANGE_PHY_STATE host command to the 8051 with @state as
+ * the request data.  No response data is requested.
+ *
+ * Returns the do_8051_command() result unchanged.
+ */
+static int set_physical_link_state(struct hfi1_devdata *dd, u64 state)
+{
+       int cmd_ret;
+
+       cmd_ret = do_8051_command(dd, HCMD_CHANGE_PHY_STATE, state, NULL);
+       return cmd_ret;
+}
+
+/*
+ * Pack @field_id, @lane_id and @config_data into one 64-bit request word
+ * and hand it to the 8051 with an HCMD_LOAD_CONFIG_DATA command.
+ *
+ * Returns the do_8051_command() result; anything other than HCMD_SUCCESS
+ * is also logged.
+ */
+int load_8051_config(struct hfi1_devdata *dd, u8 field_id,
+                    u8 lane_id, u32 config_data)
+{
+       u64 request = 0;
+       int rc;
+
+       request |= (u64)field_id << LOAD_DATA_FIELD_ID_SHIFT;
+       request |= (u64)lane_id << LOAD_DATA_LANE_ID_SHIFT;
+       request |= (u64)config_data << LOAD_DATA_DATA_SHIFT;
+
+       rc = do_8051_command(dd, HCMD_LOAD_CONFIG_DATA, request, NULL);
+       if (rc == HCMD_SUCCESS)
+               return rc;
+
+       dd_dev_err(dd,
+                  "load 8051 config: field id %d, lane %d, err %d\n",
+                  (int)field_id, (int)lane_id, rc);
+       return rc;
+}
+
+/*
+ * Fetch one 32-bit 8051 firmware "register" straight from 8051 RAM.
+ *
+ * Lane ids 0-3 select a per-lane field area located after the general
+ * fields; any other lane id addresses the general area at offset 0.
+ * *result is always written: the field value on success, 0 on failure.
+ *
+ * Return 0 on success, -errno on failure
+ */
+int read_8051_config(struct hfi1_devdata *dd, u8 field_id, u8 lane_id,
+                    u32 *result)
+{
+       u64 qword;
+       u32 offset = 0;
+       int rc;
+
+       /* per-lane fields live past the general fields */
+       if (lane_id < 4)
+               offset = (4 * NUM_GENERAL_FIELDS)
+                       + (lane_id * 4 * NUM_LANE_FIELDS);
+       offset += field_id * 4;
+
+       /* read is in 8-byte chunks, hardware will truncate the address down */
+       rc = read_8051_data(dd, offset, 8, &qword);
+       if (rc != 0) {
+               *result = 0;
+               dd_dev_err(dd, "%s: direct read failed, lane %d, field %d!\n",
+                          __func__, lane_id, field_id);
+               return rc;
+       }
+
+       /* bit 2 of the offset picks the upper or lower half of the chunk */
+       *result = (offset & 0x4) ? (u32)(qword >> 32) : (u32)qword;
+       return rc;
+}
+
+/*
+ * Build the local PHY verify-capability frame from @power_management and
+ * @continuous and load it into the 8051 as the VERIFY_CAP_LOCAL_PHY
+ * general config field.
+ *
+ * Returns the load_8051_config() result.
+ */
+static int write_vc_local_phy(struct hfi1_devdata *dd, u8 power_management,
+                             u8 continuous)
+{
+       u32 frame;
+
+       /*
+        * Widen to u32 before shifting, as write_vc_local_fabric() and
+        * write_vc_local_link_width() do: an uncast u8 is promoted to
+        * signed int, and shifting a set bit into the sign position is
+        * undefined.
+        */
+       frame = (u32)continuous << CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT
+               | (u32)power_management << POWER_MANAGEMENT_SHIFT;
+       return load_8051_config(dd, VERIFY_CAP_LOCAL_PHY,
+                               GENERAL_CONFIG, frame);
+}
+
+/*
+ * Assemble the local fabric verify-capability frame (vAU, Z, vCU, VL15
+ * buffer count and CRC sizes) and load it into the 8051 as the
+ * VERIFY_CAP_LOCAL_FABRIC general config field.
+ *
+ * Returns the load_8051_config() result.
+ */
+static int write_vc_local_fabric(struct hfi1_devdata *dd, u8 vau, u8 z, u8 vcu,
+                                u16 vl15buf, u8 crc_sizes)
+{
+       u32 fabric = 0;
+
+       fabric |= (u32)vau << VAU_SHIFT;
+       fabric |= (u32)z << Z_SHIFT;
+       fabric |= (u32)vcu << VCU_SHIFT;
+       fabric |= (u32)vl15buf << VL15BUF_SHIFT;
+       fabric |= (u32)crc_sizes << CRC_SIZES_SHIFT;
+
+       return load_8051_config(dd, VERIFY_CAP_LOCAL_FABRIC,
+                               GENERAL_CONFIG, fabric);
+}
+
+/*
+ * Read the VERIFY_CAP_LOCAL_LINK_WIDTH general config word and split it
+ * into its misc, flag and link-width fields.  The read status is not
+ * checked; read_8051_config() zeroes the word on failure.
+ */
+static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits,
+                                    u8 *flag_bits, u16 *link_widths)
+{
+       u32 word;
+
+       read_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG,
+                        &word);
+
+       *misc_bits = (word >> MISC_CONFIG_BITS_SHIFT) & MISC_CONFIG_BITS_MASK;
+       *flag_bits = (word >> LOCAL_FLAG_BITS_SHIFT) & LOCAL_FLAG_BITS_MASK;
+       *link_widths = (word >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK;
+}
+
+/*
+ * Pack the misc bits, flag bits and link widths into one frame and load
+ * it into the 8051 as the VERIFY_CAP_LOCAL_LINK_WIDTH general config
+ * field.
+ *
+ * Returns the load_8051_config() result.
+ */
+static int write_vc_local_link_width(struct hfi1_devdata *dd,
+                                    u8 misc_bits,
+                                    u8 flag_bits,
+                                    u16 link_widths)
+{
+       u32 word = 0;
+
+       word |= (u32)misc_bits << MISC_CONFIG_BITS_SHIFT;
+       word |= (u32)flag_bits << LOCAL_FLAG_BITS_SHIFT;
+       word |= (u32)link_widths << LINK_WIDTH_SHIFT;
+
+       return load_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG,
+                               word);
+}
+
+/*
+ * Load our device id and revision into the 8051 as the LOCAL_DEVICE_ID
+ * general config field.
+ *
+ * Returns the load_8051_config() result.
+ */
+static int write_local_device_id(struct hfi1_devdata *dd, u16 device_id,
+                                u8 device_rev)
+{
+       u32 id_bits = (u32)device_id << LOCAL_DEVICE_ID_SHIFT;
+       u32 rev_bits = (u32)device_rev << LOCAL_DEVICE_REV_SHIFT;
+
+       return load_8051_config(dd, LOCAL_DEVICE_ID, GENERAL_CONFIG,
+                               id_bits | rev_bits);
+}
+
+/*
+ * Read the REMOTE_DEVICE_ID general config word and extract the peer's
+ * device id and revision.  The read status is not checked;
+ * read_8051_config() zeroes the word on failure.
+ */
+static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id,
+                                 u8 *device_rev)
+{
+       u32 word;
+
+       read_8051_config(dd, REMOTE_DEVICE_ID, GENERAL_CONFIG, &word);
+
+       *device_id = (word >> REMOTE_DEVICE_ID_SHIFT) & REMOTE_DEVICE_ID_MASK;
+       *device_rev = (word >> REMOTE_DEVICE_REV_SHIFT) &
+                     REMOTE_DEVICE_REV_MASK;
+}
+
+/*
+ * Read the MISC_STATUS general config word and return its two firmware
+ * version fields.  The read status is not checked; read_8051_config()
+ * zeroes the word on failure.
+ */
+void read_misc_status(struct hfi1_devdata *dd, u8 *ver_a, u8 *ver_b)
+{
+       u32 status;
+
+       read_8051_config(dd, MISC_STATUS, GENERAL_CONFIG, &status);
+
+       *ver_a = (status >> STS_FM_VERSION_A_SHIFT) & STS_FM_VERSION_A_MASK;
+       *ver_b = (status >> STS_FM_VERSION_B_SHIFT) & STS_FM_VERSION_B_MASK;
+}
+
+/*
+ * Read the VERIFY_CAP_REMOTE_PHY general config word and extract the
+ * peer's power management and continuous remote update support fields.
+ * The read status is not checked; read_8051_config() zeroes the word on
+ * failure.
+ */
+static void read_vc_remote_phy(struct hfi1_devdata *dd, u8 *power_management,
+                              u8 *continuous)
+{
+       u32 word;
+
+       read_8051_config(dd, VERIFY_CAP_REMOTE_PHY, GENERAL_CONFIG, &word);
+
+       *power_management = (word >> POWER_MANAGEMENT_SHIFT) &
+                           POWER_MANAGEMENT_MASK;
+       *continuous = (word >> CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT) &
+                     CONTINIOUS_REMOTE_UPDATE_SUPPORT_MASK;
+}
+
+/*
+ * Read the VERIFY_CAP_REMOTE_FABRIC general config word and split it
+ * into the peer's vAU, Z, vCU, VL15 buffer and CRC size fields.  The
+ * read status is not checked; read_8051_config() zeroes the word on
+ * failure.
+ */
+static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z,
+                                 u8 *vcu, u16 *vl15buf, u8 *crc_sizes)
+{
+       u32 fabric;
+
+       read_8051_config(dd, VERIFY_CAP_REMOTE_FABRIC, GENERAL_CONFIG,
+                        &fabric);
+
+       *vau = (fabric >> VAU_SHIFT) & VAU_MASK;
+       *z = (fabric >> Z_SHIFT) & Z_MASK;
+       *vcu = (fabric >> VCU_SHIFT) & VCU_MASK;
+       *vl15buf = (fabric >> VL15BUF_SHIFT) & VL15BUF_MASK;
+       *crc_sizes = (fabric >> CRC_SIZES_SHIFT) & CRC_SIZES_MASK;
+}
+
+/*
+ * Read the VERIFY_CAP_REMOTE_LINK_WIDTH general config word and extract
+ * the peer's tx rate and link widths.  The read status is not checked;
+ * read_8051_config() zeroes the word on failure.
+ */
+static void read_vc_remote_link_width(struct hfi1_devdata *dd,
+                                     u8 *remote_tx_rate,
+                                     u16 *link_widths)
+{
+       u32 word;
+
+       read_8051_config(dd, VERIFY_CAP_REMOTE_LINK_WIDTH, GENERAL_CONFIG,
+                        &word);
+
+       *remote_tx_rate = (word >> REMOTE_TX_RATE_SHIFT) &
+                         REMOTE_TX_RATE_MASK;
+       *link_widths = (word >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK;
+}
+
+/*
+ * Read the LOCAL_LNI_INFO general config word and extract the enabled
+ * rx lane field.  The read status is not checked; read_8051_config()
+ * zeroes the word on failure.
+ */
+static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx)
+{
+       u32 lni;
+
+       read_8051_config(dd, LOCAL_LNI_INFO, GENERAL_CONFIG, &lni);
+
+       lni >>= ENABLE_LANE_RX_SHIFT;
+       *enable_lane_rx = lni & ENABLE_LANE_RX_MASK;
+}
+
+/*
+ * Read the REMOTE_LNI_INFO general config word and extract the
+ * management-allowed field.  The read status is not checked;
+ * read_8051_config() zeroes the word on failure.
+ */
+static void read_mgmt_allowed(struct hfi1_devdata *dd, u8 *mgmt_allowed)
+{
+       u32 lni;
+
+       read_8051_config(dd, REMOTE_LNI_INFO, GENERAL_CONFIG, &lni);
+
+       lni >>= MGMT_ALLOWED_SHIFT;
+       *mgmt_allowed = lni & MGMT_ALLOWED_MASK;
+}
+
+/*
+ * Copy the LAST_LOCAL_STATE_COMPLETE general config word into *lls.
+ * The read status is not checked; read_8051_config() stores 0 on failure.
+ */
+static void read_last_local_state(struct hfi1_devdata *dd, u32 *lls)
+{
+       read_8051_config(dd,
+                        LAST_LOCAL_STATE_COMPLETE,
+                        GENERAL_CONFIG,
+                        lls);
+}
+
+/*
+ * Copy the LAST_REMOTE_STATE_COMPLETE general config word into *lrs.
+ * The read status is not checked; read_8051_config() stores 0 on failure.
+ */
+static void read_last_remote_state(struct hfi1_devdata *dd, u32 *lrs)
+{
+       read_8051_config(dd,
+                        LAST_REMOTE_STATE_COMPLETE,
+                        GENERAL_CONFIG,
+                        lrs);
+}
+
+/*
+ * Report the link quality field of LINK_QUALITY_INFO.
+ *
+ * *link_quality is 0 unless the host link state has an HLS_UP bit set
+ * and the 8051 config read succeeds.
+ */
+void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality)
+{
+       u32 info;
+
+       *link_quality = 0;
+
+       /* only meaningful while the link is up */
+       if (!(dd->pport->host_link_state & HLS_UP))
+               return;
+
+       if (read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG,
+                            &info) != 0)
+               return;
+
+       *link_quality = (info >> LINK_QUALITY_SHIFT) & LINK_QUALITY_MASK;
+}
+
+/*
+ * Read the LINK_QUALITY_INFO general config word and extract the remote
+ * down reason field.  The read status is not checked; read_8051_config()
+ * zeroes the word on failure.
+ */
+static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc)
+{
+       u32 info;
+
+       read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG, &info);
+
+       info >>= DOWN_REMOTE_REASON_SHIFT;
+       *pdrrc = info & DOWN_REMOTE_REASON_MASK;
+}
+
+/*
+ * Read the LINK_DOWN_REASON general config word and return its low byte.
+ * The read status is not checked; read_8051_config() zeroes the word on
+ * failure.
+ */
+static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr)
+{
+       u32 reason;
+
+       read_8051_config(dd, LINK_DOWN_REASON, GENERAL_CONFIG, &reason);
+
+       reason &= 0xff;
+       *ldr = reason;
+}
+
+/*
+ * Read the TX_SETTINGS general config word and split it into the lane
+ * enable, tx/rx polarity inversion and max rate fields.
+ *
+ * The outputs are written even when the read fails (read_8051_config()
+ * zeroes the word in that case).  Returns the read_8051_config() result.
+ */
+static int read_tx_settings(struct hfi1_devdata *dd,
+                           u8 *enable_lane_tx,
+                           u8 *tx_polarity_inversion,
+                           u8 *rx_polarity_inversion,
+                           u8 *max_rate)
+{
+       u32 settings;
+       int rc;
+
+       rc = read_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, &settings);
+
+       *enable_lane_tx = (settings >> ENABLE_LANE_TX_SHIFT) &
+                         ENABLE_LANE_TX_MASK;
+       *tx_polarity_inversion = (settings >> TX_POLARITY_INVERSION_SHIFT) &
+                                TX_POLARITY_INVERSION_MASK;
+       *rx_polarity_inversion = (settings >> RX_POLARITY_INVERSION_SHIFT) &
+                                RX_POLARITY_INVERSION_MASK;
+       *max_rate = (settings >> MAX_RATE_SHIFT) & MAX_RATE_MASK;
+
+       return rc;
+}
+
+/*
+ * Pack the lane enable, tx/rx polarity inversion and max rate values
+ * into one frame and load it into the 8051 as the TX_SETTINGS general
+ * config field.
+ *
+ * Returns the load_8051_config() result.
+ */
+static int write_tx_settings(struct hfi1_devdata *dd,
+                            u8 enable_lane_tx,
+                            u8 tx_polarity_inversion,
+                            u8 rx_polarity_inversion,
+                            u8 max_rate)
+{
+       u32 frame;
+
+       /*
+        * No need to mask, all variable sizes match field widths.  Do
+        * widen to u32 before shifting though: an uncast u8 is promoted
+        * to signed int, and shifting a set bit into the sign position
+        * is undefined.
+        */
+       frame = (u32)enable_lane_tx << ENABLE_LANE_TX_SHIFT
+               | (u32)tx_polarity_inversion << TX_POLARITY_INVERSION_SHIFT
+               | (u32)rx_polarity_inversion << RX_POLARITY_INVERSION_SHIFT
+               | (u32)max_rate << MAX_RATE_SHIFT;
+       return load_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, frame);
+}
+
+/*
+ * Log the SPICO firmware version and product id reported for each of the
+ * four lanes.  A lane whose read fails is reported and skipped.
+ */
+static void check_fabric_firmware_versions(struct hfi1_devdata *dd)
+{
+       u32 word, rom_version, rom_prod_id;
+       int lane_idx;
+
+       for (lane_idx = 0; lane_idx < 4; lane_idx++) {
+               if (read_8051_config(dd, SPICO_FW_VERSION, lane_idx,
+                                    &word) == 0) {
+                       rom_version = (word >> SPICO_ROM_VERSION_SHIFT) &
+                                     SPICO_ROM_VERSION_MASK;
+                       rom_prod_id = (word >> SPICO_ROM_PROD_ID_SHIFT) &
+                                     SPICO_ROM_PROD_ID_MASK;
+                       dd_dev_info(dd,
+                                   "Lane %d firmware: version 0x%04x, prod_id 0x%04x\n",
+                                   lane_idx, rom_version, rom_prod_id);
+               } else {
+                       dd_dev_err(dd,
+                                  "Unable to read lane %d firmware details\n",
+                                  lane_idx);
+               }
+       }
+}
+
+/*
+ * Read an idle LCB message.
+ *
+ * @type is passed to the 8051 as the request data of an
+ * HCMD_READ_LCB_IDLE_MSG command.  On success *data_out holds only the
+ * payload, i.e. the response shifted right by IDLE_PAYLOAD_SHIFT.
+ *
+ * Returns 0 on success, -EINVAL on error
+ */
+static int read_idle_message(struct hfi1_devdata *dd, u64 type, u64 *data_out)
+{
+       int ret;
+
+       ret = do_8051_command(dd, HCMD_READ_LCB_IDLE_MSG, type, data_out);
+       if (ret != HCMD_SUCCESS) {
+               /* the type is passed as a u32, so print it unsigned */
+               dd_dev_err(dd, "read idle message: type %u, err %d\n",
+                          (u32)type, ret);
+               return -EINVAL;
+       }
+       dd_dev_info(dd, "%s: read idle message 0x%llx\n", __func__, *data_out);
+       /* return only the payload as we already know the type */
+       *data_out >>= IDLE_PAYLOAD_SHIFT;
+       return 0;
+}
+
+/*
+ * Fetch a pending idle SMA message; meant to be called after the 8051
+ * has signalled that one arrived.
+ *
+ * Returns 0 on success, -EINVAL on error
+ */
+static int read_idle_sma(struct hfi1_devdata *dd, u64 *data)
+{
+       u64 sma_type = (u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT;
+
+       return read_idle_message(dd, sma_type, data);
+}
+
+/*
+ * Hand an idle LCB message to the 8051 with an HCMD_SEND_LCB_IDLE_MSG
+ * command.  @data is the complete, already-encoded message word.
+ *
+ * Returns 0 on success, -EINVAL on error
+ */
+static int send_idle_message(struct hfi1_devdata *dd, u64 data)
+{
+       int rc;
+
+       dd_dev_info(dd, "%s: sending idle message 0x%llx\n", __func__, data);
+
+       rc = do_8051_command(dd, HCMD_SEND_LCB_IDLE_MSG, data, NULL);
+       if (rc == HCMD_SUCCESS)
+               return 0;
+
+       dd_dev_err(dd, "send idle message: data 0x%llx, err %d\n",
+                  data, rc);
+       return -EINVAL;
+}
+
+/*
+ * Encode @message as an idle SMA message (payload masked and shifted
+ * into place, type set to IDLE_SMA) and send it.
+ *
+ * Returns 0 on success, -EINVAL on error
+ */
+int send_idle_sma(struct hfi1_devdata *dd, u64 message)
+{
+       u64 payload = (message & IDLE_PAYLOAD_MASK) << IDLE_PAYLOAD_SHIFT;
+       u64 msg_type = (u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT;
+
+       return send_idle_message(dd, payload | msg_type);
+}
+
+/*
+ * Initialize the LCB then do a quick link up.  This may or may not be
+ * in loopback.
+ *
+ * Sequence: shut down the LCB, apply loopback CSR settings if the
+ * loopback module setting is non-zero, release the TX FIFO reset, run
+ * the simulator-only loopback steps, pause for the peer when not in
+ * loopback, then give the 8051 LCB access and request PLS_QUICK_LINKUP.
+ *
+ * return 0 on success, -errno on error
+ */
+static int do_quick_linkup(struct hfi1_devdata *dd)
+{
+       u64 reg;
+       unsigned long timeout;
+       int ret;
+
+       lcb_shutdown(dd, 0);
+
+       if (loopback) {
+               /* LCB_CFG_LOOPBACK.VAL = 2 */
+               /* LCB_CFG_LANE_WIDTH.VAL = 0 */
+               write_csr(dd, DC_LCB_CFG_LOOPBACK,
+                         IB_PACKET_TYPE << DC_LCB_CFG_LOOPBACK_VAL_SHIFT);
+               write_csr(dd, DC_LCB_CFG_LANE_WIDTH, 0);
+       }
+
+       /* start the LCBs */
+       /* LCB_CFG_TX_FIFOS_RESET.VAL = 0 */
+       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0);
+
+       /* simulator only loopback steps */
+       if (loopback && dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+               /* LCB_CFG_RUN.EN = 1 */
+               write_csr(dd, DC_LCB_CFG_RUN,
+                         1ull << DC_LCB_CFG_RUN_EN_SHIFT);
+
+               /* watch LCB_STS_LINK_TRANSFER_ACTIVE */
+               /* poll every 2us, give up once 10ms have passed */
+               timeout = jiffies + msecs_to_jiffies(10);
+               while (1) {
+                       reg = read_csr(dd, DC_LCB_STS_LINK_TRANSFER_ACTIVE);
+                       if (reg)
+                               break;
+                       if (time_after(jiffies, timeout)) {
+                               dd_dev_err(dd,
+                                          "timeout waiting for LINK_TRANSFER_ACTIVE\n");
+                               return -ETIMEDOUT;
+                       }
+                       udelay(2);
+               }
+
+               write_csr(dd, DC_LCB_CFG_ALLOW_LINK_UP,
+                         1ull << DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT);
+       }
+
+       if (!loopback) {
+               /*
+                * When doing quick linkup and not in loopback, both
+                * sides must be done with LCB set-up before either
+                * starts the quick linkup.  Put a delay here so that
+                * both sides can be started and have a chance to be
+                * done with LCB set up before resuming.
+                */
+               /* NOTE(review): these two progress messages are logged at error level */
+               dd_dev_err(dd,
+                          "Pausing for peer to be finished with LCB set up\n");
+               msleep(5000);
+               dd_dev_err(dd, "Continuing with quick linkup\n");
+       }
+
+       write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */
+       set_8051_lcb_access(dd);
+
+       /*
+        * State "quick" LinkUp request sets the physical link state to
+        * LinkUp without a verify capability sequence.
+        * This state is in simulator v37 and later.
+        */
+       ret = set_physical_link_state(dd, PLS_QUICK_LINKUP);
+       if (ret != HCMD_SUCCESS) {
+               dd_dev_err(dd,
+                          "%s: set physical link state to quick LinkUp failed with return %d\n",
+                          __func__, ret);
+
+               /* undo the two steps taken just before the request */
+               set_host_lcb_access(dd);
+               write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */
+
+               /* a non-negative failure code is mapped to -EINVAL */
+               if (ret >= 0)
+                       ret = -EINVAL;
+               return ret;
+       }
+
+       return 0; /* success */
+}
+
+/*
+ * Request the PLS_INTERNAL_SERDES_LOOPBACK physical link state, putting
+ * the SerDes into internal loopback.
+ *
+ * Returns 0 on success, -errno on error.  A non-negative failure code
+ * from the 8051 command is reported as -EINVAL.
+ */
+static int set_serdes_loopback_mode(struct hfi1_devdata *dd)
+{
+       int rc = set_physical_link_state(dd, PLS_INTERNAL_SERDES_LOOPBACK);
+
+       if (rc != HCMD_SUCCESS) {
+               dd_dev_err(dd,
+                          "Set physical link state to SerDes Loopback failed with return %d\n",
+                          rc);
+               return (rc >= 0) ? -EINVAL : rc;
+       }
+
+       return 0;
+}
+
+/*
+ * Do all special steps to set up loopback.
+ *
+ * Besides programming the hardware this may rewrite the loopback and
+ * quick_linkup settings (both are assigned below).
+ *
+ * Returns 0 on success, -EINVAL for an unsupported or unknown loopback
+ * mode, or the set_serdes_loopback_mode() result for SerDes loopback.
+ */
+static int init_loopback(struct hfi1_devdata *dd)
+{
+       dd_dev_info(dd, "Entering loopback mode\n");
+
+       /* all loopbacks should disable self GUID check */
+       write_csr(dd, DC_DC8051_CFG_MODE,
+                 (read_csr(dd, DC_DC8051_CFG_MODE) | DISABLE_SELF_GUID_CHECK));
+
+       /*
+        * The simulator has only one loopback option - LCB.  Switch
+        * to that option, which includes quick link up.
+        *
+        * Accept all valid loopback values.
+        */
+       if ((dd->icode == ICODE_FUNCTIONAL_SIMULATOR) &&
+           (loopback == LOOPBACK_SERDES || loopback == LOOPBACK_LCB ||
+            loopback == LOOPBACK_CABLE)) {
+               loopback = LOOPBACK_LCB;
+               quick_linkup = 1;
+               return 0;
+       }
+
+       /* handle serdes loopback */
+       if (loopback == LOOPBACK_SERDES) {
+               /* internal serdes loopback needs quick linkup on RTL */
+               if (dd->icode == ICODE_RTL_SILICON)
+                       quick_linkup = 1;
+               return set_serdes_loopback_mode(dd);
+       }
+
+       /* LCB loopback - handled at poll time */
+       if (loopback == LOOPBACK_LCB) {
+               quick_linkup = 1; /* LCB is always quick linkup */
+
+               /* not supported in emulation due to emulation RTL changes */
+               if (dd->icode == ICODE_FPGA_EMULATION) {
+                       dd_dev_err(dd,
+                                  "LCB loopback not supported in emulation\n");
+                       return -EINVAL;
+               }
+               return 0;
+       }
+
+       /* external cable loopback requires no extra steps */
+       if (loopback == LOOPBACK_CABLE)
+               return 0;
+
+       /* anything else is not a loopback mode handled here */
+       dd_dev_err(dd, "Invalid loopback mode %d\n", loopback);
+       return -EINVAL;
+}
+
+/*
+ * Convert an OPA_LINK_WIDTH_* bit mask, as handed to us by the FM, into
+ * the bit layout of the Verify Capability link width attribute, where
+ * bit (N - 1) stands for an N-lane width.  Input bits with no table
+ * entry are dropped.
+ */
+static u16 opa_to_vc_link_widths(u16 opa_widths)
+{
+       static const struct link_bits {
+               u16 from;
+               u16 to;
+       } width_map[] = {
+               { OPA_LINK_WIDTH_1X, 1 << (1 - 1)  },
+               { OPA_LINK_WIDTH_2X, 1 << (2 - 1)  },
+               { OPA_LINK_WIDTH_3X, 1 << (3 - 1)  },
+               { OPA_LINK_WIDTH_4X, 1 << (4 - 1)  },
+       };
+       u16 vc_widths = 0;
+       int idx;
+
+       for (idx = 0; idx < ARRAY_SIZE(width_map); idx++) {
+               if (!(opa_widths & width_map[idx].from))
+                       continue;
+               vc_widths |= width_map[idx].to;
+       }
+
+       return vc_widths;
+}
+
+/*
+ * Set link attributes before moving to polling.
+ *
+ * Resets the fabric SerDes, then programs the 8051 with the tx settings
+ * and the local verify-capability PHY, fabric, link width and device id
+ * frames, stopping at the first step that fails.
+ *
+ * Returns 0 on success.  On failure the failing step's return value is
+ * logged and returned as-is: the read_tx_settings() result, or the
+ * non-HCMD_SUCCESS result of one of the write helpers.
+ */
+static int set_local_link_attributes(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u8 enable_lane_tx;
+       u8 tx_polarity_inversion;
+       u8 rx_polarity_inversion;
+       int ret;
+
+       /* reset our fabric serdes to clear any lingering problems */
+       fabric_serdes_reset(dd);
+
+       /* set the local tx rate - need to read-modify-write */
+       ret = read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion,
+                              &rx_polarity_inversion, &ppd->local_tx_rate);
+       if (ret)
+               goto set_local_link_attributes_fail;
+
+       /* the tx rate encoding depends on the 8051 firmware version */
+       if (dd->dc8051_ver < dc8051_ver(0, 20)) {
+               /* set the tx rate to the fastest enabled */
+               if (ppd->link_speed_enabled & OPA_LINK_SPEED_25G)
+                       ppd->local_tx_rate = 1;
+               else
+                       ppd->local_tx_rate = 0;
+       } else {
+               /* set the tx rate to all enabled */
+               ppd->local_tx_rate = 0;
+               if (ppd->link_speed_enabled & OPA_LINK_SPEED_25G)
+                       ppd->local_tx_rate |= 2;
+               if (ppd->link_speed_enabled & OPA_LINK_SPEED_12_5G)
+                       ppd->local_tx_rate |= 1;
+       }
+
+       /* the polarity values read above are written back unchanged */
+       enable_lane_tx = 0xF; /* enable all four lanes */
+       ret = write_tx_settings(dd, enable_lane_tx, tx_polarity_inversion,
+                               rx_polarity_inversion, ppd->local_tx_rate);
+       if (ret != HCMD_SUCCESS)
+               goto set_local_link_attributes_fail;
+
+       /*
+        * DC supports continuous updates.
+        */
+       ret = write_vc_local_phy(dd,
+                                0 /* no power management */,
+                                1 /* continuous updates */);
+       if (ret != HCMD_SUCCESS)
+               goto set_local_link_attributes_fail;
+
+       /* z=1 in the next call: AU of 0 is not supported by the hardware */
+       ret = write_vc_local_fabric(dd, dd->vau, 1, dd->vcu, dd->vl15_init,
+                                   ppd->port_crc_mode_enabled);
+       if (ret != HCMD_SUCCESS)
+               goto set_local_link_attributes_fail;
+
+       /* misc and flag bits are written as 0 */
+       ret = write_vc_local_link_width(dd, 0, 0,
+                                       opa_to_vc_link_widths(
+                                               ppd->link_width_enabled));
+       if (ret != HCMD_SUCCESS)
+               goto set_local_link_attributes_fail;
+
+       /* let peer know who we are */
+       ret = write_local_device_id(dd, dd->pcidev->device, dd->minrev);
+       if (ret == HCMD_SUCCESS)
+               return 0;
+
+       /* every failing step above lands here with its own ret value */
+set_local_link_attributes_fail:
+       dd_dev_err(dd,
+                  "Failed to set local link attributes, return 0x%x\n",
+                  ret);
+       return ret;
+}
+
+/*
+ * Entry point for starting the link.
+ *
+ * The link is moved to HLS_DN_POLL only when it is enabled and the
+ * driver has marked it ready; otherwise the reason is logged and nothing
+ * else happens.
+ *
+ * Returns 0 if link is disabled, moved to polling, or the driver is not
+ * ready; otherwise the set_link_state() result.
+ */
+int start_link(struct hfi1_pportdata *ppd)
+{
+       if (ppd->link_enabled && ppd->driver_link_ready)
+               return set_link_state(ppd, HLS_DN_POLL);
+
+       if (!ppd->link_enabled)
+               dd_dev_info(ppd->dd,
+                           "%s: stopping link start because link is disabled\n",
+                           __func__);
+       else
+               dd_dev_info(ppd->dd,
+                           "%s: stopping link start because driver is not ready\n",
+                           __func__);
+
+       return 0;
+}
+
+/*
+ * Poll this HFI's QSFP input CSR for up to 2 seconds, waiting for the
+ * QSFP_HFI0_INT_N bit to read as 0 (the QSFP interrupt for t_init,
+ * SFF 8679).  When it does, the bit is written to the matching CLEAR
+ * CSR; on timeout an informational message is logged instead.
+ */
+static void wait_for_qsfp_init(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       unsigned long deadline = jiffies + msecs_to_jiffies(2000);
+       u64 pins;
+
+       for (;;) {
+               pins = read_csr(dd, dd->hfi1_id ?
+                               ASIC_QSFP2_IN : ASIC_QSFP1_IN);
+               if (!(pins & QSFP_HFI0_INT_N)) {
+                       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_CLEAR :
+                                 ASIC_QSFP1_CLEAR, QSFP_HFI0_INT_N);
+                       return;
+               }
+
+               if (time_after(jiffies, deadline)) {
+                       dd_dev_info(dd, "%s: No IntN detected, reset complete\n",
+                                   __func__);
+                       return;
+               }
+
+               udelay(2);
+       }
+}
+
+/*
+ * Read-modify-write this HFI's QSFP mask CSR: set the QSFP_HFI0_INT_N
+ * bit when @enable is non-zero, clear it otherwise.
+ */
+static void set_qsfp_int_n(struct hfi1_pportdata *ppd, u8 enable)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 int_mask;
+
+       int_mask = read_csr(dd,
+                           dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK);
+
+       int_mask = enable ? (int_mask | (u64)QSFP_HFI0_INT_N)
+                         : (int_mask & ~(u64)QSFP_HFI0_INT_N);
+
+       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK,
+                 int_mask);
+}
+
+/*
+ * Reset the QSFP attached to this port.
+ *
+ * INT_N is masked for the duration, the QSFP_HFI0_RESET_N bit in this
+ * HFI's QSFP output CSR is cleared for 10us and then set again, and
+ * after waiting for the module's init interrupt INT_N is unmasked.
+ */
+void reset_qsfp(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 mask, qsfp_mask;
+
+       /* Disable INT_N from triggering QSFP interrupts */
+       set_qsfp_int_n(ppd, 0);
+
+       /* Reset the QSFP */
+       mask = (u64)QSFP_HFI0_RESET_N;
+
+       /* clear the bit (presumably asserts the active-low reset) */
+       qsfp_mask = read_csr(dd,
+                            dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT);
+       qsfp_mask &= ~mask;
+       write_csr(dd,
+                 dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT, qsfp_mask);
+
+       udelay(10);
+
+       /* set the bit again, reusing the value read before the pulse */
+       qsfp_mask |= mask;
+       write_csr(dd,
+                 dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT, qsfp_mask);
+
+       wait_for_qsfp_init(ppd);
+
+       /*
+        * Allow INT_N to trigger the QSFP interrupt to watch
+        * for alarms and warnings
+        */
+       set_qsfp_int_n(ppd, 1);
+}
+
+/*
+ * Decode the QSFP interrupt status bytes and log every alarm or warning
+ * that is set.
+ *
+ * @qsfp_interrupt_status: status bytes; indexes 0, 1 and 3-8 are read.
+ *
+ * Temperature conditions (byte 0) are always reported.  The remaining
+ * conditions are only reported while the link is not down.
+ *
+ * Always returns 0.
+ */
+static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd,
+                                       u8 *qsfp_interrupt_status)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+
+       /* an alarm and its warning share one message, so test both at once */
+       if (qsfp_interrupt_status[0] &
+           (QSFP_HIGH_TEMP_ALARM | QSFP_HIGH_TEMP_WARNING))
+               dd_dev_info(dd, "%s: QSFP cable temperature too high\n",
+                           __func__);
+
+       if (qsfp_interrupt_status[0] &
+           (QSFP_LOW_TEMP_ALARM | QSFP_LOW_TEMP_WARNING))
+               dd_dev_info(dd, "%s: QSFP cable temperature too low\n",
+                           __func__);
+
+       /*
+        * The remaining alarms/warnings don't matter if the link is down.
+        */
+       if (ppd->host_link_state & HLS_DOWN)
+               return 0;
+
+       if (qsfp_interrupt_status[1] &
+           (QSFP_HIGH_VCC_ALARM | QSFP_HIGH_VCC_WARNING))
+               dd_dev_info(dd, "%s: QSFP supply voltage too high\n",
+                           __func__);
+
+       if (qsfp_interrupt_status[1] &
+           (QSFP_LOW_VCC_ALARM | QSFP_LOW_VCC_WARNING))
+               dd_dev_info(dd, "%s: QSFP supply voltage too low\n",
+                           __func__);
+
+       /* Byte 2 is vendor specific */
+
+       if (qsfp_interrupt_status[3] &
+           (QSFP_HIGH_POWER_ALARM | QSFP_HIGH_POWER_WARNING))
+               dd_dev_info(dd, "%s: Cable RX channel 1/2 power too high\n",
+                           __func__);
+
+       if (qsfp_interrupt_status[3] &
+           (QSFP_LOW_POWER_ALARM | QSFP_LOW_POWER_WARNING))
+               dd_dev_info(dd, "%s: Cable RX channel 1/2 power too low\n",
+                           __func__);
+
+       if (qsfp_interrupt_status[4] &
+           (QSFP_HIGH_POWER_ALARM | QSFP_HIGH_POWER_WARNING))
+               dd_dev_info(dd, "%s: Cable RX channel 3/4 power too high\n",
+                           __func__);
+
+       if (qsfp_interrupt_status[4] &
+           (QSFP_LOW_POWER_ALARM | QSFP_LOW_POWER_WARNING))
+               dd_dev_info(dd, "%s: Cable RX channel 3/4 power too low\n",
+                           __func__);
+
+       if (qsfp_interrupt_status[5] &
+           (QSFP_HIGH_BIAS_ALARM | QSFP_HIGH_BIAS_WARNING))
+               dd_dev_info(dd, "%s: Cable TX channel 1/2 bias too high\n",
+                           __func__);
+
+       if (qsfp_interrupt_status[5] &
+           (QSFP_LOW_BIAS_ALARM | QSFP_LOW_BIAS_WARNING))
+               dd_dev_info(dd, "%s: Cable TX channel 1/2 bias too low\n",
+                           __func__);
+
+       if (qsfp_interrupt_status[6] &
+           (QSFP_HIGH_BIAS_ALARM | QSFP_HIGH_BIAS_WARNING))
+               dd_dev_info(dd, "%s: Cable TX channel 3/4 bias too high\n",
+                           __func__);
+
+       if (qsfp_interrupt_status[6] &
+           (QSFP_LOW_BIAS_ALARM | QSFP_LOW_BIAS_WARNING))
+               dd_dev_info(dd, "%s: Cable TX channel 3/4 bias too low\n",
+                           __func__);
+
+       if (qsfp_interrupt_status[7] &
+           (QSFP_HIGH_POWER_ALARM | QSFP_HIGH_POWER_WARNING))
+               dd_dev_info(dd, "%s: Cable TX channel 1/2 power too high\n",
+                           __func__);
+
+       if (qsfp_interrupt_status[7] &
+           (QSFP_LOW_POWER_ALARM | QSFP_LOW_POWER_WARNING))
+               dd_dev_info(dd, "%s: Cable TX channel 1/2 power too low\n",
+                           __func__);
+
+       if (qsfp_interrupt_status[8] &
+           (QSFP_HIGH_POWER_ALARM | QSFP_HIGH_POWER_WARNING))
+               dd_dev_info(dd, "%s: Cable TX channel 3/4 power too high\n",
+                           __func__);
+
+       if (qsfp_interrupt_status[8] &
+           (QSFP_LOW_POWER_ALARM | QSFP_LOW_POWER_WARNING))
+               dd_dev_info(dd, "%s: Cable TX channel 3/4 power too low\n",
+                           __func__);
+
+       /* Bytes 9-10 and 11-12 are reserved */
+       /* Bytes 13-15 are vendor specific */
+
+       return 0;
+}
+
+/*
+ * This routine will only be scheduled if the QSFP module present is asserted
+ *
+ * Work handler for QSFP events; @work is the qsfp_work member of a
+ * struct qsfp_data.  It restarts the DC and then, depending on the flags
+ * in that qsfp_data, re-initializes the module and restarts the link
+ * and/or reads and reports the module's interrupt status.
+ */
+void qsfp_event(struct work_struct *work)
+{
+       struct qsfp_data *qd;
+       struct hfi1_pportdata *ppd;
+       struct hfi1_devdata *dd;
+
+       qd = container_of(work, struct qsfp_data, qsfp_work);
+       ppd = qd->ppd;
+       dd = ppd->dd;
+
+       /* Sanity check */
+       if (!qsfp_mod_present(ppd))
+               return;
+
+       /*
+        * Turn DC back on after cable has been re-inserted. Up until
+        * now, the DC has been in reset to save power.
+        */
+       dc_start(dd);
+
+       if (qd->cache_refresh_required) {
+               /* keep INT_N masked while waiting for the module to init */
+               set_qsfp_int_n(ppd, 0);
+
+               wait_for_qsfp_init(ppd);
+
+               /*
+                * Allow INT_N to trigger the QSFP interrupt to watch
+                * for alarms and warnings
+                */
+               set_qsfp_int_n(ppd, 1);
+
+               tune_serdes(ppd);
+
+               /* the start_link() result is not checked */
+               start_link(ppd);
+       }
+
+       if (qd->check_interrupt_flags) {
+               u8 qsfp_interrupt_status[16] = {0,};
+
+               /* fetch 16 status bytes starting at module address 6 */
+               if (one_qsfp_read(ppd, dd->hfi1_id, 6,
+                                 &qsfp_interrupt_status[0], 16) != 16) {
+                       dd_dev_info(dd,
+                                   "%s: Failed to read status of QSFP module\n",
+                                   __func__);
+               } else {
+                       unsigned long flags;
+
+                       handle_qsfp_error_conditions(
+                                       ppd, qsfp_interrupt_status);
+                       /* the flag is only cleared after a successful read */
+                       spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+                       ppd->qsfp_info.check_interrupt_flags = 0;
+                       spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
+                                              flags);
+               }
+       }
+}
+
+/*
+ * init_qsfp_int - set up QSFP interrupt handling for this HFI
+ * @dd: the device data
+ *
+ * Clears the CCE interrupt mask bit of the QSFP that belongs to the other
+ * HFI, then programs the CLEAR, MASK and INVERT registers of the ASIC QSFP
+ * block selected by dd->hfi1_id for the INT_N and MODPRST_N bits.
+ */
+static void init_qsfp_int(struct hfi1_devdata *dd)
+{
+       struct hfi1_pportdata *ppd = dd->pport;
+       u64 qsfp_mask, cce_int_mask;
+       /* bit positions of the two sources within their 64-bit mask CSR */
+       const int qsfp1_int_smask = QSFP1_INT % 64;
+       const int qsfp2_int_smask = QSFP2_INT % 64;
+
+       /*
+        * disable QSFP1 interrupts for HFI1, QSFP2 interrupts for HFI0
+        * Qsfp1Int and Qsfp2Int are adjacent bits in the same CSR,
+        * therefore just one of QSFP1_INT/QSFP2_INT can be used to find
+        * the index of the appropriate CSR in the CCEIntMask CSR array
+        */
+       cce_int_mask = read_csr(dd, CCE_INT_MASK +
+                               (8 * (QSFP1_INT / 64)));
+       if (dd->hfi1_id) {
+               cce_int_mask &= ~((u64)1 << qsfp1_int_smask);
+               write_csr(dd, CCE_INT_MASK + (8 * (QSFP1_INT / 64)),
+                         cce_int_mask);
+       } else {
+               cce_int_mask &= ~((u64)1 << qsfp2_int_smask);
+               write_csr(dd, CCE_INT_MASK + (8 * (QSFP2_INT / 64)),
+                         cce_int_mask);
+       }
+
+       /* the HFI0 bit definitions are used for both QSFP register sets */
+       qsfp_mask = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N);
+       /* Clear current status to avoid spurious interrupts */
+       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_CLEAR : ASIC_QSFP1_CLEAR,
+                 qsfp_mask);
+       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK,
+                 qsfp_mask);
+
+       set_qsfp_int_n(ppd, 0);
+
+       /*
+        * Handle active low nature of INT_N and MODPRST_N pins.  When a
+        * module is already present the MODPRST_N bit is left out of the
+        * value written to the INVERT register.
+        */
+       if (qsfp_mod_present(ppd))
+               qsfp_mask &= ~(u64)QSFP_HFI0_MODPRST_N;
+       write_csr(dd,
+                 dd->hfi1_id ? ASIC_QSFP2_INVERT : ASIC_QSFP1_INVERT,
+                 qsfp_mask);
+}
+
+/*
+ * Do a one-time initialization of the LCB block.
+ *
+ * @dd: the device data
+ *
+ * Skipped entirely on the functional simulator.  Otherwise the LCB
+ * configuration CSRs are written between a pair of writes to
+ * DC_LCB_CFG_TX_FIFOS_RESET (0x01 first, 0x00 last).
+ */
+static void init_lcb(struct hfi1_devdata *dd)
+{
+       /* simulator does not correctly handle LCB cclk loopback, skip */
+       if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
+               return;
+
+       /* the DC has been reset earlier in the driver load */
+
+       /* set LCB for cclk loopback on the port */
+       /* presumably holds the TX FIFOs in reset while configuring - confirm */
+       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0x01);
+       write_csr(dd, DC_LCB_CFG_LANE_WIDTH, 0x00);
+       write_csr(dd, DC_LCB_CFG_REINIT_AS_SLAVE, 0x00);
+       write_csr(dd, DC_LCB_CFG_CNT_FOR_SKIP_STALL, 0x110);
+       write_csr(dd, DC_LCB_CFG_CLK_CNTR, 0x08);
+       write_csr(dd, DC_LCB_CFG_LOOPBACK, 0x02);
+       /* configuration done, write the reset CSR back to 0 */
+       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0x00);
+}
+
+/*
+ * bringup_serdes - prepare the port and start the link
+ * @ppd: per port data
+ *
+ * Returns the negative value from init_loopback() if loopback setup fails,
+ * otherwise the result of start_link().
+ */
+int bringup_serdes(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+
+       if (HFI1_CAP_IS_KSET(EXTENDED_PSN))
+               add_rcvctrl(dd, RCV_CTRL_RCV_EXTENDED_PSN_ENABLE_SMASK);
+
+       /* no port GUID yet: derive it from the base GUID, if there is one */
+       if (!ppd->guid && dd->base_guid)
+               ppd->guid = dd->base_guid + ppd->port - 1;
+
+       /* Set linkinit_reason on power up per OPA spec */
+       ppd->linkinit_reason = OPA_LINKINIT_REASON_LINKUP;
+
+       /* one-time init of the LCB */
+       init_lcb(dd);
+
+       if (loopback) {
+               int rc = init_loopback(dd);
+
+               if (rc < 0)
+                       return rc;
+       }
+
+       get_port_type(ppd);
+       if (ppd->port_type == PORT_TYPE_QSFP) {
+               /* bracket the module init wait with INT_N off, then on */
+               set_qsfp_int_n(ppd, 0);
+               wait_for_qsfp_init(ppd);
+               set_qsfp_int_n(ppd, 1);
+       }
+
+       /*
+        * The SerDes must be tuned to a ballpark setting for optimal
+        * signal and bit error rate before the link is started.
+        */
+       tune_serdes(ppd);
+
+       return start_link(ppd);
+}
+
+/*
+ * hfi1_quiet_serdes - take the link down and keep it down
+ * @ppd: per port data
+ *
+ * Records OPA_LINKDOWN_REASON_SMA_DISABLED as the reason, moves the link
+ * to HLS_DN_OFFLINE and clears the receive port enable.  The result of
+ * set_link_state() is not checked.
+ */
+void hfi1_quiet_serdes(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+
+       /*
+        * Shut down the link and keep it down.  First clear the flag that
+        * says the driver wants to allow the link to be up
+        * (driver_link_ready).  Then make sure the link is not
+        * automatically restarted (link_enabled).  Cancel any pending
+        * restart.  And finally go offline.
+        *
+        * NOTE(review): no explicit cancel of a pending restart is visible
+        * in this routine - confirm where that happens.
+        */
+       ppd->driver_link_ready = 0;
+       ppd->link_enabled = 0;
+
+       ppd->offline_disabled_reason =
+                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED);
+       set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SMA_DISABLED, 0,
+                            OPA_LINKDOWN_REASON_SMA_DISABLED);
+       set_link_state(ppd, HLS_DN_OFFLINE);
+
+       /* disable the port */
+       clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+}
+
+/*
+ * Allocate the per-CPU RC counters (rc_acks, rc_qacks, rc_delayed_comp) of
+ * every port.  The port structures are taken to start immediately after
+ * the devdata.
+ *
+ * Returns 0 on success, or -ENOMEM as soon as one port does not get all
+ * three counters; nothing already allocated is freed here.
+ */
+static inline int init_cpu_counters(struct hfi1_devdata *dd)
+{
+       struct hfi1_pportdata *ports = (struct hfi1_pportdata *)(dd + 1);
+       int idx;
+
+       for (idx = 0; idx < dd->num_pports; idx++) {
+               struct hfi1_pportdata *ppd = &ports[idx];
+
+               ppd->ibport_data.rvp.rc_acks = NULL;
+               ppd->ibport_data.rvp.rc_qacks = NULL;
+
+               ppd->ibport_data.rvp.rc_acks = alloc_percpu(u64);
+               ppd->ibport_data.rvp.rc_qacks = alloc_percpu(u64);
+               ppd->ibport_data.rvp.rc_delayed_comp = alloc_percpu(u64);
+
+               /* all three must be present for the port to be usable */
+               if (ppd->ibport_data.rvp.rc_acks &&
+                   ppd->ibport_data.rvp.rc_delayed_comp &&
+                   ppd->ibport_data.rvp.rc_qacks)
+                       continue;
+
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+/* printable names of the receive array entry types, indexed by type */
+static const char * const pt_names[] = {
+       "expected",
+       "eager",
+       "invalid"
+};
+
+/* return the printable name of @type, or "unknown" if it has none */
+static const char *pt_name(u32 type)
+{
+       if (type < ARRAY_SIZE(pt_names))
+               return pt_names[type];
+
+       return "unknown";
+}
+
+/*
+ * hfi1_put_tid - write one receive array entry
+ * @dd: the device data
+ * @index: the index into the receive array
+ * @type: PT_* type of the entry; PT_INVALID forces a zero address
+ * @pa: address to program; its low 12 bits are not written
+ * @order: value placed in the entry's buffer size field
+ *
+ * Nothing is written if the device is not present or if @type is larger
+ * than PT_INVALID.
+ */
+void hfi1_put_tid(struct hfi1_devdata *dd, u32 index,
+                 u32 type, unsigned long pa, u16 order)
+{
+       void __iomem *base;
+       u64 reg;
+
+       /* use the rcvarray_wc mapping when it has been set up */
+       if (dd->rcvarray_wc)
+               base = dd->rcvarray_wc;
+       else
+               base = dd->kregbase + RCV_ARRAY;
+
+       if (!(dd->flags & HFI1_PRESENT))
+               return;
+
+       if (type > PT_INVALID) {
+               dd_dev_err(dd,
+                          "unexpected receive array type %u for index %u, not handled\n",
+                          type, index);
+               return;
+       }
+       if (type == PT_INVALID)
+               pa = 0;
+
+       hfi1_cdbg(TID, "type %s, index 0x%x, pa 0x%lx, bsize 0x%lx",
+                 pt_name(type), index, pa, (unsigned long)order);
+
+#define RT_ADDR_SHIFT 12       /* 4KB kernel address boundary */
+       reg = RCV_ARRAY_RT_WRITE_ENABLE_SMASK;
+       reg |= (u64)order << RCV_ARRAY_RT_BUF_SIZE_SHIFT;
+       reg |= ((pa >> RT_ADDR_SHIFT) & RCV_ARRAY_RT_ADDR_MASK)
+                       << RCV_ARRAY_RT_ADDR_SHIFT;
+       /* each entry is 8 bytes wide */
+       writeq(reg, base + (index * 8));
+
+       /*
+        * Eager entries are written one-by-one so we have to push them
+        * after we write the entry.
+        */
+       if (type == PT_EAGER)
+               flush_wc();
+}
+
+/*
+ * hfi1_clear_tids - invalidate every receive array entry of a context
+ * @rcd: the receive context
+ *
+ * Writes a PT_INVALID entry over the context's eager range and then over
+ * its expected range.
+ */
+void hfi1_clear_tids(struct hfi1_ctxtdata *rcd)
+{
+       struct hfi1_devdata *dd = rcd->dd;
+       u32 idx;
+
+       /* this could be optimized */
+       idx = rcd->eager_base;
+       while (idx < rcd->eager_base + rcd->egrbufs.alloced) {
+               hfi1_put_tid(dd, idx, PT_INVALID, 0, 0);
+               idx++;
+       }
+
+       idx = rcd->expected_base;
+       while (idx < rcd->expected_base + rcd->expected_count) {
+               hfi1_put_tid(dd, idx, PT_INVALID, 0, 0);
+               idx++;
+       }
+}
+
+/*
+ * hfi1_get_base_kinfo - fill in the runtime flags of a context info
+ * @rcd: the receive context (not used here)
+ * @kinfo: structure whose runtime_flags member is written
+ *
+ * Always returns 0.
+ */
+int hfi1_get_base_kinfo(struct hfi1_ctxtdata *rcd,
+                       struct hfi1_ctxt_info *kinfo)
+{
+       kinfo->runtime_flags = HFI1_CAP_KGET(K2U) |
+                              HFI1_CAP_UGET(MASK) |
+                              (HFI1_MISC_GET() << HFI1_CAP_USER_SHIFT);
+
+       return 0;
+}
+
+/*
+ * hfi1_get_msgheader - locate the message header that goes with an RHF
+ * @dd: the device data
+ * @rhf_addr: address of the RHF
+ *
+ * Steps back from @rhf_addr by dd->rhf_offset and then forward by the
+ * header queue offset extracted from the RHF, both counted in __le32 units.
+ */
+struct hfi1_message_header *hfi1_get_msgheader(
+                               struct hfi1_devdata *dd, __le32 *rhf_addr)
+{
+       u32 hdr_offset = rhf_hdrq_offset(rhf_to_cpu(rhf_addr));
+       __le32 *hdr = rhf_addr - dd->rhf_offset;
+
+       hdr += hdr_offset;
+       return (struct hfi1_message_header *)hdr;
+}
+
+/* printable names of the HFI1_IB_CFG_* selectors, indexed by selector */
+static const char * const ib_cfg_name_strings[] = {
+       "HFI1_IB_CFG_LIDLMC",
+       "HFI1_IB_CFG_LWID_DG_ENB",
+       "HFI1_IB_CFG_LWID_ENB",
+       "HFI1_IB_CFG_LWID",
+       "HFI1_IB_CFG_SPD_ENB",
+       "HFI1_IB_CFG_SPD",
+       "HFI1_IB_CFG_RXPOL_ENB",
+       "HFI1_IB_CFG_LREV_ENB",
+       "HFI1_IB_CFG_LINKLATENCY",
+       "HFI1_IB_CFG_HRTBT",
+       "HFI1_IB_CFG_OP_VLS",
+       "HFI1_IB_CFG_VL_HIGH_CAP",
+       "HFI1_IB_CFG_VL_LOW_CAP",
+       "HFI1_IB_CFG_OVERRUN_THRESH",
+       "HFI1_IB_CFG_PHYERR_THRESH",
+       "HFI1_IB_CFG_LINKDEFAULT",
+       "HFI1_IB_CFG_PKEYS",
+       "HFI1_IB_CFG_MTU",
+       "HFI1_IB_CFG_LSTATE",
+       "HFI1_IB_CFG_VL_HIGH_LIMIT",
+       "HFI1_IB_CFG_PMA_TICKS",
+       "HFI1_IB_CFG_PORT"
+};
+
+/* return the printable name of selector @which, "invalid" if out of range */
+static const char *ib_cfg_name(int which)
+{
+       if (which >= 0 && which < ARRAY_SIZE(ib_cfg_name_strings))
+               return ib_cfg_name_strings[which];
+
+       return "invalid";
+}
+
+/*
+ * hfi1_get_ib_cfg - read one configuration item of a port
+ * @ppd: per port data
+ * @which: HFI1_IB_CFG_* selector
+ *
+ * Returns the requested value.  Selectors that are not implemented return
+ * 0 and, when the PRINT_UNIMPL capability is set, are logged.
+ */
+int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+
+       switch (which) {
+       case HFI1_IB_CFG_LWID_ENB: /* allowed Link-width */
+               return ppd->link_width_enabled;
+       case HFI1_IB_CFG_LWID: /* currently active Link-width */
+               return ppd->link_width_active;
+       case HFI1_IB_CFG_SPD_ENB: /* allowed Link speeds */
+               return ppd->link_speed_enabled;
+       case HFI1_IB_CFG_SPD: /* current Link speed */
+               return ppd->link_speed_active;
+       case HFI1_IB_CFG_OP_VLS:
+               return ppd->vls_operational;
+       case HFI1_IB_CFG_VL_HIGH_CAP: /* VL arb high priority table size */
+               return VL_ARB_HIGH_PRIO_TABLE_SIZE;
+       case HFI1_IB_CFG_VL_LOW_CAP: /* VL arb low priority table size */
+               return VL_ARB_LOW_PRIO_TABLE_SIZE;
+       case HFI1_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
+               return ppd->overrun_threshold;
+       case HFI1_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
+               return ppd->phy_error_threshold;
+       case HFI1_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
+               return dd->link_default;
+
+       /* everything below is not implemented */
+       case HFI1_IB_CFG_RXPOL_ENB: /* Auto-RX-polarity enable */
+       case HFI1_IB_CFG_LREV_ENB: /* Auto-Lane-reversal enable */
+       case HFI1_IB_CFG_LINKLATENCY:
+       case HFI1_IB_CFG_HRTBT: /* Heartbeat off/enable/auto */
+       case HFI1_IB_CFG_PMA_TICKS:
+       default:
+               break;
+       }
+
+       if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
+               dd_dev_info(dd, "%s: which %s: not implemented\n",
+                           __func__, ib_cfg_name(which));
+
+       return 0;
+}
+
+/*
+ * The largest MAD packet size.
+ */
+#define MAX_MAD_PACKET 2048
+
+/*
+ * lrh_max_header_bytes - maximum header bytes that can go on the _wire_
+ * @dd: the device data
+ *
+ * The count includes the ICRC, which is not part of the packet held in
+ * memory but is appended by the HW.  The result depends on the device's
+ * receive header entry size.  HFI allows that size to be set per receive
+ * context, but the driver presently enforces a global value.
+ */
+u32 lrh_max_header_bytes(struct hfi1_devdata *dd)
+{
+       /*
+        * The maximum non-payload (MTU) bytes in LRH.PktLen are
+        * the Receive Header Entry Size minus the PBC (or RHF) size
+        * plus one DW for the ICRC appended by HW.
+        *
+        * dd->rcd[0]->rcvhdrqentsize is in DW.
+        * rcd[0] is used because all contexts have the same value, and
+        * the first kernel context has been allocated by now, so the
+        * value is valid.
+        */
+       u32 hdr_dwords = dd->rcd[0]->rcvhdrqentsize
+                        - 2    /* PBC/RHF */
+                        + 1;   /* ICRC */
+
+       /* DWs to bytes */
+       return hdr_dwords << 2;
+}
+
+/*
+ * Set Send Length
+ * @ppd - per port data
+ *
+ * Set the MTU by limiting how many DWs may be sent.  The SendLenCheck*
+ * registers compare against LRH.PktLen, so use the max bytes included
+ * in the LRH.
+ *
+ * This routine changes all VL values except VL15, which it maintains at
+ * the same value.
+ *
+ * It also recomputes the credit return threshold of each VL's send
+ * context and updates the MTU capability field of DCC_CFG_PORT_CONFIG
+ * from the largest VL MTU.
+ */
+static void set_send_length(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u32 max_hb = lrh_max_header_bytes(dd), dcmtu;
+       u32 maxvlmtu = dd->vld[15].mtu;
+       /* len2 starts out holding the VL15 length field, in DWs */
+       u64 len1 = 0, len2 = (((dd->vld[15].mtu + max_hb) >> 2)
+                             & SEND_LEN_CHECK1_LEN_VL15_MASK) <<
+               SEND_LEN_CHECK1_LEN_VL15_SHIFT;
+       int i;
+       u32 thres;
+
+       /*
+        * VLs 0-3 are packed into len1, the remaining VLs into len2.  Each
+        * field holds (mtu + max header bytes) in DWs, placed at (i % 4)
+        * times the VL1 (respectively VL5) shift.
+        */
+       for (i = 0; i < ppd->vls_supported; i++) {
+               if (dd->vld[i].mtu > maxvlmtu)
+                       maxvlmtu = dd->vld[i].mtu;
+               if (i <= 3)
+                       len1 |= (((dd->vld[i].mtu + max_hb) >> 2)
+                                & SEND_LEN_CHECK0_LEN_VL0_MASK) <<
+                               ((i % 4) * SEND_LEN_CHECK0_LEN_VL1_SHIFT);
+               else
+                       len2 |= (((dd->vld[i].mtu + max_hb) >> 2)
+                                & SEND_LEN_CHECK1_LEN_VL4_MASK) <<
+                               ((i % 4) * SEND_LEN_CHECK1_LEN_VL5_SHIFT);
+       }
+       write_csr(dd, SEND_LEN_CHECK0, len1);
+       write_csr(dd, SEND_LEN_CHECK1, len2);
+       /* adjust kernel credit return thresholds based on new MTUs */
+       /* all kernel receive contexts have the same hdrqentsize */
+       /* threshold is the smaller of the 50% and the MTU based value */
+       for (i = 0; i < ppd->vls_supported; i++) {
+               thres = min(sc_percent_to_threshold(dd->vld[i].sc, 50),
+                           sc_mtu_to_threshold(dd->vld[i].sc,
+                                               dd->vld[i].mtu,
+                                               dd->rcd[0]->rcvhdrqentsize));
+               sc_set_cr_threshold(dd->vld[i].sc, thres);
+       }
+       /* same computation for VL15, which the loop above does not visit */
+       thres = min(sc_percent_to_threshold(dd->vld[15].sc, 50),
+                   sc_mtu_to_threshold(dd->vld[15].sc,
+                                       dd->vld[15].mtu,
+                                       dd->rcd[0]->rcvhdrqentsize));
+       sc_set_cr_threshold(dd->vld[15].sc, thres);
+
+       /* Adjust maximum MTU for the port in DC */
+       /* 10240 has its own encoding, other sizes use ilog2(mtu >> 8) + 1 */
+       dcmtu = maxvlmtu == 10240 ? DCC_CFG_PORT_MTU_CAP_10240 :
+               (ilog2(maxvlmtu >> 8) + 1);
+       /* len1 is reused as scratch for the read-modify-write below */
+       len1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG);
+       len1 &= ~DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK;
+       len1 |= ((u64)dcmtu & DCC_CFG_PORT_CONFIG_MTU_CAP_MASK) <<
+               DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT;
+       write_csr(ppd->dd, DCC_CFG_PORT_CONFIG, len1);
+}
+
+/*
+ * set_lidlmc - program the port's LID and LMC into the hardware
+ * @ppd: per port data; ppd->lid and ppd->lmc supply the values
+ *
+ * The LMC is turned into a mask with its low ppd->lmc bits clear.  LID and
+ * mask are written to DCC_CFG_PORT_CONFIG1, to the SLID check CSR of every
+ * send context, and handed to the SDMA engines.
+ */
+static void set_lidlmc(struct hfi1_pportdata *ppd)
+{
+       int i;
+       u64 sreg = 0;
+       struct hfi1_devdata *dd = ppd->dd;
+       u32 mask = ~((1U << ppd->lmc) - 1);
+       u64 c1 = read_csr(dd, DCC_CFG_PORT_CONFIG1);
+
+       if (dd->hfi1_snoop.mode_flag)
+               dd_dev_info(dd, "Set lid/lmc while snooping\n");
+
+       /* replace the target DLID and DLID mask fields, keep the rest */
+       c1 &= ~(DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK
+               | DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK);
+       c1 |= ((ppd->lid & DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK)
+                       << DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT) |
+             ((mask & DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK)
+                       << DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT);
+       write_csr(dd, DCC_CFG_PORT_CONFIG1, c1);
+
+       /*
+        * Iterate over all the send contexts and set their SLID check
+        */
+       sreg = ((mask & SEND_CTXT_CHECK_SLID_MASK_MASK) <<
+                       SEND_CTXT_CHECK_SLID_MASK_SHIFT) |
+              (((ppd->lid & mask) & SEND_CTXT_CHECK_SLID_VALUE_MASK) <<
+                       SEND_CTXT_CHECK_SLID_VALUE_SHIFT);
+
+       for (i = 0; i < dd->chip_send_contexts; i++) {
+               hfi1_cdbg(LINKVERB, "SendContext[%d].SLID_CHECK = 0x%x",
+                         i, (u32)sreg);
+               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, sreg);
+       }
+
+       /* Now we have to do the same thing for the sdma engines */
+       sdma_update_lmc(dd, mask, ppd->lid);
+}
+
+/*
+ * wait_phy_linkstate - poll until the physical link state matches
+ * @dd: the device data
+ * @state: physical state to wait for
+ * @msecs: how long to keep polling, in milliseconds
+ *
+ * Sleeps about 2ms between reads.  Returns 0 once the state is seen,
+ * -ETIMEDOUT (after logging both states) if the deadline passes first.
+ */
+static int wait_phy_linkstate(struct hfi1_devdata *dd, u32 state, u32 msecs)
+{
+       unsigned long deadline = jiffies + msecs_to_jiffies(msecs);
+       u32 seen;
+
+       for (seen = read_physical_state(dd); seen != state;
+            seen = read_physical_state(dd)) {
+               if (time_after(jiffies, deadline)) {
+                       dd_dev_err(dd,
+                                  "timeout waiting for phy link state 0x%x, current state is 0x%x\n",
+                                  state, seen);
+                       return -ETIMEDOUT;
+               }
+               usleep_range(1950, 2050); /* sleep 2ms-ish */
+       }
+
+       return 0;
+}
+
+/*
+ * Helper for set_link_state().  Do not call except from that routine.
+ * Expects ppd->hls_mutex to be held.
+ *
+ * @ppd: per port data
+ * @rem_reason value to be sent to the neighbor
+ *
+ * LinkDownReasons only set if transition succeeds.
+ *
+ * Returns 0 on success, -EINVAL if the request to move the physical link
+ * offline is rejected, or the error from waiting for the physical state
+ * or for the 8051 to become ready.  On the first two kinds of failure
+ * host_link_state is left at HLS_GOING_OFFLINE; on an 8051 ready timeout
+ * it is set to HLS_DN_OFFLINE.
+ */
+static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u32 pstate, previous_state;
+       u32 last_local_state;
+       u32 last_remote_state;
+       int ret;
+       int do_transition;
+       int do_wait;
+
+       previous_state = ppd->host_link_state;
+       ppd->host_link_state = HLS_GOING_OFFLINE;
+       pstate = read_physical_state(dd);
+       if (pstate == PLS_OFFLINE) {
+               do_transition = 0;      /* in right state */
+               do_wait = 0;            /* ...no need to wait */
+       } else if ((pstate & 0xff) == PLS_OFFLINE) {
+               do_transition = 0;      /* in an offline transient state */
+               do_wait = 1;            /* ...wait for it to settle */
+       } else {
+               do_transition = 1;      /* need to move to offline */
+               do_wait = 1;            /* ...will need to wait */
+       }
+
+       if (do_transition) {
+               /* the reason for the neighbor rides in byte 1 */
+               ret = set_physical_link_state(dd,
+                                             (rem_reason << 8) | PLS_OFFLINE);
+
+               if (ret != HCMD_SUCCESS) {
+                       dd_dev_err(dd,
+                                  "Failed to transition to Offline link state, return %d\n",
+                                  ret);
+                       return -EINVAL;
+               }
+               /* only record "transient" if no reason has been set yet */
+               if (ppd->offline_disabled_reason ==
+                               HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE))
+                       ppd->offline_disabled_reason =
+                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_TRANSIENT);
+       }
+
+       if (do_wait) {
+               /* it can take a while for the link to go down */
+               ret = wait_phy_linkstate(dd, PLS_OFFLINE, 10000);
+               if (ret < 0)
+                       return ret;
+       }
+
+       /* make sure the logical state is also down (result not checked) */
+       wait_logical_linkstate(ppd, IB_PORT_DOWN, 1000);
+
+       /*
+        * Now in charge of LCB - must be after the physical state is
+        * offline.quiet and before host_link_state is changed.
+        */
+       set_host_lcb_access(dd);
+       write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */
+       ppd->host_link_state = HLS_LINK_COOLDOWN; /* LCB access allowed */
+
+       if (ppd->port_type == PORT_TYPE_QSFP &&
+           ppd->qsfp_info.limiting_active &&
+           qsfp_mod_present(ppd)) {
+               /*
+                * The function-level ret is reused here; it is overwritten
+                * by wait_fm_ready() below before it is read again.
+                */
+               ret = acquire_chip_resource(dd, qsfp_resource(dd), QSFP_WAIT);
+               if (ret == 0) {
+                       set_qsfp_tx(ppd, 0);
+                       release_chip_resource(dd, qsfp_resource(dd));
+               } else {
+                       /* not fatal, but should warn */
+                       dd_dev_err(dd,
+                                  "Unable to acquire lock to turn off QSFP TX\n");
+               }
+       }
+
+       /*
+        * The LNI has a mandatory wait time after the physical state
+        * moves to Offline.Quiet.  The wait time may be different
+        * depending on how the link went down.  The 8051 firmware
+        * will observe the needed wait time and only move to ready
+        * when that is completed.  The largest of the quiet timeouts
+        * is 6s, so wait that long and then at least 0.5s more for
+        * other transitions, and another 0.5s for a buffer.
+        */
+       ret = wait_fm_ready(dd, 7000);
+       if (ret) {
+               dd_dev_err(dd,
+                          "After going offline, timed out waiting for the 8051 to become ready to accept host requests\n");
+               /* state is really offline, so make it so */
+               ppd->host_link_state = HLS_DN_OFFLINE;
+               return ret;
+       }
+
+       /*
+        * The state is now offline and the 8051 is ready to accept host
+        * requests.
+        *      - change our state
+        *      - notify others if we were previously in a linkup state
+        */
+       ppd->host_link_state = HLS_DN_OFFLINE;
+       if (previous_state & HLS_UP) {
+               /* went down while link was up */
+               handle_linkup_change(dd, 0);
+       } else if (previous_state
+                       & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) {
+               /* went down while attempting link up */
+               /* byte 1 of last_*_state is the failure reason */
+               read_last_local_state(dd, &last_local_state);
+               read_last_remote_state(dd, &last_remote_state);
+               dd_dev_err(dd,
+                          "LNI failure last states: local 0x%08x, remote 0x%08x\n",
+                          last_local_state, last_remote_state);
+       }
+
+       /* the active link width (downgrade) is 0 on link down */
+       ppd->link_width_active = 0;
+       ppd->link_width_downgrade_tx_active = 0;
+       ppd->link_width_downgrade_rx_active = 0;
+       ppd->current_egress_rate = 0;
+       return 0;
+}
+
+/*
+ * Return the printable name of a host link state (an HLS_* value).  The
+ * name table is indexed by ilog2() of the state; anything without an entry
+ * is reported as "unknown".
+ */
+static const char *link_state_name(u32 state)
+{
+       static const char * const names[] = {
+               [__HLS_UP_INIT_BP]       = "INIT",
+               [__HLS_UP_ARMED_BP]      = "ARMED",
+               [__HLS_UP_ACTIVE_BP]     = "ACTIVE",
+               [__HLS_DN_DOWNDEF_BP]    = "DOWNDEF",
+               [__HLS_DN_POLL_BP]       = "POLL",
+               [__HLS_DN_DISABLE_BP]    = "DISABLE",
+               [__HLS_DN_OFFLINE_BP]    = "OFFLINE",
+               [__HLS_VERIFY_CAP_BP]    = "VERIFY_CAP",
+               [__HLS_GOING_UP_BP]      = "GOING_UP",
+               [__HLS_GOING_OFFLINE_BP] = "GOING_OFFLINE",
+               [__HLS_LINK_COOLDOWN_BP] = "LINK_COOLDOWN"
+       };
+       int bit = ilog2(state);
+
+       /* outside the table, or a hole in it */
+       if (bit < 0 || bit >= ARRAY_SIZE(names) || !names[bit])
+               return "unknown";
+
+       return names[bit];
+}
+
+/*
+ * Return the link state reason name.  Only HLS_UP_INIT carries a reason
+ * (taken from ppd->linkinit_reason); every other state, and any reason
+ * without a name, yields the empty string.
+ */
+static const char *link_state_reason_name(struct hfi1_pportdata *ppd, u32 state)
+{
+       const char *reason = "";
+
+       if (state != HLS_UP_INIT)
+               return reason;
+
+       switch (ppd->linkinit_reason) {
+       case OPA_LINKINIT_REASON_LINKUP:
+               reason = "(LINKUP)";
+               break;
+       case OPA_LINKINIT_REASON_FLAPPING:
+               reason = "(FLAPPING)";
+               break;
+       case OPA_LINKINIT_OUTSIDE_POLICY:
+               reason = "(OUTSIDE_POLICY)";
+               break;
+       case OPA_LINKINIT_QUARANTINED:
+               reason = "(QUARANTINED)";
+               break;
+       case OPA_LINKINIT_INSUFIC_CAPABILITY:
+               reason = "(INSUFIC_CAPABILITY)";
+               break;
+       default:
+               break;
+       }
+
+       return reason;
+}
+
+/*
+ * driver_physical_state - convert the driver's notion of a port's
+ * state (an HLS_*) into a physical state (a {IB,OPA}_PORTPHYSSTATE_*).
+ * Return -1 (converted to a u32) to indicate error; HLS_DN_DOWNDEF and
+ * any unrecognized value are logged and treated as errors.
+ */
+u32 driver_physical_state(struct hfi1_pportdata *ppd)
+{
+       switch (ppd->host_link_state) {
+       /* any of the up states */
+       case HLS_UP_INIT:
+       case HLS_UP_ARMED:
+       case HLS_UP_ACTIVE:
+               return IB_PORTPHYSSTATE_LINKUP;
+
+       /* polling, or on the way up */
+       case HLS_DN_POLL:
+       case HLS_VERIFY_CAP:
+       case HLS_GOING_UP:
+               return IB_PORTPHYSSTATE_POLLING;
+
+       case HLS_DN_DISABLE:
+               return IB_PORTPHYSSTATE_DISABLED;
+
+       /* offline, or on the way there */
+       case HLS_DN_OFFLINE:
+       case HLS_GOING_OFFLINE:
+       case HLS_LINK_COOLDOWN:
+               return OPA_PORTPHYSSTATE_OFFLINE;
+
+       case HLS_DN_DOWNDEF:
+       default:
+               dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n",
+                          ppd->host_link_state);
+               return -1;
+       }
+}
+
+/*
+ * driver_logical_state - convert the driver's notion of a port's
+ * state (an HLS_*) into a logical state (a IB_PORT_*). Return -1
+ * (converted to a u32) to indicate error.
+ */
+u32 driver_logical_state(struct hfi1_pportdata *ppd)
+{
+       /* any bit within HLS_DOWN maps to a down logical link */
+       if (ppd->host_link_state & HLS_DOWN)
+               return IB_PORT_DOWN;
+
+       switch (ppd->host_link_state & HLS_UP) {
+       case HLS_UP_ACTIVE:
+               return IB_PORT_ACTIVE;
+       case HLS_UP_ARMED:
+               return IB_PORT_ARMED;
+       case HLS_UP_INIT:
+               return IB_PORT_INIT;
+       default:
+               break;
+       }
+
+       dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n",
+                  ppd->host_link_state);
+       return -1;
+}
+
+/*
+ * set_link_down_reason - record why the link is going down
+ * @ppd: per port data
+ * @lcl_reason: stored as the latest local link down reason
+ * @neigh_reason: stored as the latest neighbor link down reason
+ * @rem_reason: stored as the remote link down reason
+ *
+ * Nothing is stored if a latest local or neighbor reason is already set.
+ */
+void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason,
+                         u8 neigh_reason, u8 rem_reason)
+{
+       /* an earlier reason is still recorded, keep it */
+       if (ppd->local_link_down_reason.latest != 0 ||
+           ppd->neigh_link_down_reason.latest != 0)
+               return;
+
+       ppd->local_link_down_reason.latest = lcl_reason;
+       ppd->neigh_link_down_reason.latest = neigh_reason;
+       ppd->remote_link_down_reason = rem_reason;
+}
+
+/*
+ * Change the physical and/or logical link state.
+ *
+ * Do not call this routine while inside an interrupt.  It contains
+ * calls to routines that can take multiple seconds to finish.
+ *
+ * Returns 0 on success, -errno on failure.
+ */
+int set_link_state(struct hfi1_pportdata *ppd, u32 state)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       struct ib_event event = {.device = NULL};
+       int ret1, ret = 0;
+       int orig_new_state, poll_bounce;
+
+       mutex_lock(&ppd->hls_lock);
+
+       orig_new_state = state;
+       if (state == HLS_DN_DOWNDEF)
+               state = dd->link_default;
+
+       /* interpret poll -> poll as a link bounce */
+       poll_bounce = ppd->host_link_state == HLS_DN_POLL &&
+                     state == HLS_DN_POLL;
+
+       dd_dev_info(dd, "%s: current %s, new %s %s%s\n", __func__,
+                   link_state_name(ppd->host_link_state),
+                   link_state_name(orig_new_state),
+                   poll_bounce ? "(bounce) " : "",
+                   link_state_reason_name(ppd, state));
+
+       /*
+        * If we're going to a (HLS_*) link state that implies the logical
+        * link state is neither of (IB_PORT_ARMED, IB_PORT_ACTIVE), then
+        * reset is_sm_config_started to 0.
+        */
+       if (!(state & (HLS_UP_ARMED | HLS_UP_ACTIVE)))
+               ppd->is_sm_config_started = 0;
+
+       /*
+        * Do nothing if the states match.  Let a poll to poll link bounce
+        * go through.
+        */
+       if (ppd->host_link_state == state && !poll_bounce)
+               goto done;
+
+       switch (state) {
+       case HLS_UP_INIT:
+               if (ppd->host_link_state == HLS_DN_POLL &&
+                   (quick_linkup || dd->icode == ICODE_FUNCTIONAL_SIMULATOR)) {
+                       /*
+                        * Quick link up jumps from polling to here.
+                        *
+                        * Whether in normal or loopback mode, the
+                        * simulator jumps from polling to link up.
+                        * Accept that here.
+                        */
+                       /* OK */
+               } else if (ppd->host_link_state != HLS_GOING_UP) {
+                       goto unexpected;
+               }
+
+               ppd->host_link_state = HLS_UP_INIT;
+               ret = wait_logical_linkstate(ppd, IB_PORT_INIT, 1000);
+               if (ret) {
+                       /* logical state didn't change, stay at going_up */
+                       ppd->host_link_state = HLS_GOING_UP;
+                       dd_dev_err(dd,
+                                  "%s: logical state did not change to INIT\n",
+                                  __func__);
+               } else {
+                       /* clear old transient LINKINIT_REASON code */
+                       if (ppd->linkinit_reason >= OPA_LINKINIT_REASON_CLEAR)
+                               ppd->linkinit_reason =
+                                       OPA_LINKINIT_REASON_LINKUP;
+
+                       /* enable the port */
+                       add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+
+                       handle_linkup_change(dd, 1);
+               }
+               break;
+       case HLS_UP_ARMED:
+               if (ppd->host_link_state != HLS_UP_INIT)
+                       goto unexpected;
+
+               ppd->host_link_state = HLS_UP_ARMED;
+               set_logical_state(dd, LSTATE_ARMED);
+               ret = wait_logical_linkstate(ppd, IB_PORT_ARMED, 1000);
+               if (ret) {
+                       /* logical state didn't change, stay at init */
+                       ppd->host_link_state = HLS_UP_INIT;
+                       dd_dev_err(dd,
+                                  "%s: logical state did not change to ARMED\n",
+                                  __func__);
+               }
+               /*
+                * The simulator does not currently implement SMA messages,
+                * so neighbor_normal is not set.  Set it here when we first
+                * move to Armed.
+                */
+               if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
+                       ppd->neighbor_normal = 1;
+               break;
+       case HLS_UP_ACTIVE:
+               if (ppd->host_link_state != HLS_UP_ARMED)
+                       goto unexpected;
+
+               ppd->host_link_state = HLS_UP_ACTIVE;
+               set_logical_state(dd, LSTATE_ACTIVE);
+               ret = wait_logical_linkstate(ppd, IB_PORT_ACTIVE, 1000);
+               if (ret) {
+                       /* logical state didn't change, stay at armed */
+                       ppd->host_link_state = HLS_UP_ARMED;
+                       dd_dev_err(dd,
+                                  "%s: logical state did not change to ACTIVE\n",
+                                  __func__);
+               } else {
+                       /* tell all engines to go running */
+                       sdma_all_running(dd);
+
+                       /* Signal the IB layer that the port has gone active */
+                       event.device = &dd->verbs_dev.rdi.ibdev;
+                       event.element.port_num = ppd->port;
+                       event.event = IB_EVENT_PORT_ACTIVE;
+               }
+               break;
+       case HLS_DN_POLL:
+               if ((ppd->host_link_state == HLS_DN_DISABLE ||
+                    ppd->host_link_state == HLS_DN_OFFLINE) &&
+                   dd->dc_shutdown)
+                       dc_start(dd);
+               /* Hand LED control to the DC */
+               write_csr(dd, DCC_CFG_LED_CNTRL, 0);
+
+               if (ppd->host_link_state != HLS_DN_OFFLINE) {
+                       u8 tmp = ppd->link_enabled;
+
+                       ret = goto_offline(ppd, ppd->remote_link_down_reason);
+                       if (ret) {
+                               ppd->link_enabled = tmp;
+                               break;
+                       }
+                       ppd->remote_link_down_reason = 0;
+
+                       if (ppd->driver_link_ready)
+                               ppd->link_enabled = 1;
+               }
+
+               set_all_slowpath(ppd->dd);
+               ret = set_local_link_attributes(ppd);
+               if (ret)
+                       break;
+
+               ppd->port_error_action = 0;
+               ppd->host_link_state = HLS_DN_POLL;
+
+               if (quick_linkup) {
+                       /* quick linkup does not go into polling */
+                       ret = do_quick_linkup(dd);
+               } else {
+                       ret1 = set_physical_link_state(dd, PLS_POLLING);
+                       if (ret1 != HCMD_SUCCESS) {
+                               dd_dev_err(dd,
+                                          "Failed to transition to Polling link state, return 0x%x\n",
+                                          ret1);
+                               ret = -EINVAL;
+                       }
+               }
+               ppd->offline_disabled_reason =
+                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE);
+               /*
+                * If an error occurred above, go back to offline.  The
+                * caller may reschedule another attempt.
+                */
+               if (ret)
+                       goto_offline(ppd, 0);
+               break;
+       case HLS_DN_DISABLE:
+               /* link is disabled */
+               ppd->link_enabled = 0;
+
+               /* allow any state to transition to disabled */
+
+               /* must transition to offline first */
+               if (ppd->host_link_state != HLS_DN_OFFLINE) {
+                       ret = goto_offline(ppd, ppd->remote_link_down_reason);
+                       if (ret)
+                               break;
+                       ppd->remote_link_down_reason = 0;
+               }
+
+               ret1 = set_physical_link_state(dd, PLS_DISABLED);
+               if (ret1 != HCMD_SUCCESS) {
+                       dd_dev_err(dd,
+                                  "Failed to transition to Disabled link state, return 0x%x\n",
+                                  ret1);
+                       ret = -EINVAL;
+                       break;
+               }
+               ppd->host_link_state = HLS_DN_DISABLE;
+               dc_shutdown(dd);
+               break;
+       case HLS_DN_OFFLINE:
+               if (ppd->host_link_state == HLS_DN_DISABLE)
+                       dc_start(dd);
+
+               /* allow any state to transition to offline */
+               ret = goto_offline(ppd, ppd->remote_link_down_reason);
+               if (!ret)
+                       ppd->remote_link_down_reason = 0;
+               break;
+       case HLS_VERIFY_CAP:
+               if (ppd->host_link_state != HLS_DN_POLL)
+                       goto unexpected;
+               ppd->host_link_state = HLS_VERIFY_CAP;
+               break;
+       case HLS_GOING_UP:
+               if (ppd->host_link_state != HLS_VERIFY_CAP)
+                       goto unexpected;
+
+               ret1 = set_physical_link_state(dd, PLS_LINKUP);
+               if (ret1 != HCMD_SUCCESS) {
+                       dd_dev_err(dd,
+                                  "Failed to transition to link up state, return 0x%x\n",
+                                  ret1);
+                       ret = -EINVAL;
+                       break;
+               }
+               ppd->host_link_state = HLS_GOING_UP;
+               break;
+
+       case HLS_GOING_OFFLINE:         /* transient within goto_offline() */
+       case HLS_LINK_COOLDOWN:         /* transient within goto_offline() */
+       default:
+               dd_dev_info(dd, "%s: state 0x%x: not supported\n",
+                           __func__, state);
+               ret = -EINVAL;
+               break;
+       }
+
+       goto done;
+
+unexpected:
+       dd_dev_err(dd, "%s: unexpected state transition from %s to %s\n",
+                  __func__, link_state_name(ppd->host_link_state),
+                  link_state_name(state));
+       ret = -EINVAL;
+
+done:
+       mutex_unlock(&ppd->hls_lock);
+
+       if (event.device)
+               ib_dispatch_event(&event);
+
+       return ret;
+}
+
+/*
+ * hfi1_set_ib_cfg() - apply one port configuration item.
+ *
+ * @ppd: port being configured
+ * @which: HFI1_IB_CFG_* selector
+ * @val: new value; not used by the LIDLMC, MTU and PKEYS selectors, which
+ *       only call helpers that take @ppd
+ *
+ * Return: 0 on success (selectors that are not implemented also return 0),
+ * -EINVAL when the request is rejected.
+ */
+int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val)
+{
+       u64 limit_csr;
+
+       switch (which) {
+       case HFI1_IB_CFG_LIDLMC:
+               set_lidlmc(ppd);
+               return 0;
+
+       case HFI1_IB_CFG_VL_HIGH_LIMIT:
+               /*
+                * The caller gives the VL Arbitrator high limit in 4k-byte
+                * units; HFI keeps it in 64-byte units.
+                */
+               val *= 4096 / 64;
+               limit_csr = (u64)val & SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK;
+               limit_csr <<= SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT;
+               write_csr(ppd->dd, SEND_HIGH_PRIORITY_LIMIT, limit_csr);
+               return 0;
+
+       case HFI1_IB_CFG_LINKDEFAULT:
+               /* POLL is the only link-down default that HFI accepts */
+               return val == HLS_DN_POLL ? 0 : -EINVAL;
+
+       case HFI1_IB_CFG_OP_VLS:
+               /* nothing to do when the value is already in place */
+               if (ppd->vls_operational == val)
+                       return 0;
+               ppd->vls_operational = val;
+               return ppd->port ? 0 : -EINVAL;
+
+       /*
+        * Link width, link width downgrade and speed enables are always
+        * ANDed with the supported mask.  That keeps unsupported bits out
+        * of the enabled value whatever the SM or FM asks for, and it also
+        * resolves the ALL_SUPPORTED wildcards (all field bits set) to
+        * exactly the supported set.
+        */
+       case HFI1_IB_CFG_LWID_ENB:
+               ppd->link_width_enabled = val & ppd->link_width_supported;
+               return 0;
+
+       case HFI1_IB_CFG_LWID_DG_ENB:
+               ppd->link_width_downgrade_enabled =
+                               val & ppd->link_width_downgrade_supported;
+               return 0;
+
+       case HFI1_IB_CFG_SPD_ENB:
+               ppd->link_speed_enabled = val & ppd->link_speed_supported;
+               return 0;
+
+       case HFI1_IB_CFG_OVERRUN_THRESH:
+               /* HFI does not follow IB specs; only remembered for reporting */
+               ppd->overrun_threshold = val;
+               return 0;
+
+       case HFI1_IB_CFG_PHYERR_THRESH:
+               /* HFI does not follow IB specs; only remembered for reporting */
+               ppd->phy_error_threshold = val;
+               return 0;
+
+       case HFI1_IB_CFG_MTU:
+               set_send_length(ppd);
+               return 0;
+
+       case HFI1_IB_CFG_PKEYS:
+               if (HFI1_CAP_IS_KSET(PKEY_CHECK))
+                       set_partition_keys(ppd);
+               return 0;
+
+       default:
+               if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
+                       dd_dev_info(ppd->dd,
+                                   "%s: which %s, val 0x%x: not implemented\n",
+                                   __func__, ib_cfg_name(which), val);
+               return 0;
+       }
+}
+
+/* begin functions related to vl arbitration table caching */
+/* Initialize the spinlock of every VL arbitration table cache of @ppd. */
+static void init_vl_arb_caches(struct hfi1_pportdata *ppd)
+{
+       int i = 0;
+
+       /* compile-time check: cache size equals both priority table sizes */
+       BUILD_BUG_ON(VL_ARB_TABLE_SIZE != VL_ARB_LOW_PRIO_TABLE_SIZE);
+       BUILD_BUG_ON(VL_ARB_TABLE_SIZE != VL_ARB_HIGH_PRIO_TABLE_SIZE);
+
+       /*
+        * A 'Get(VLArbTable)' is always answered straight from
+        * 'vl_arb_cache', with no CSR reads.  After a 'Set(VLArbTable)'
+        * that is clearly right because the cache is up to date.  It is
+        * also right before any 'Set(VLArbTable)', because at that point
+        * both the cache and the relevant h/w registers are zeroed.
+        */
+
+       while (i < MAX_PRIO_TABLE) {
+               spin_lock_init(&ppd->vl_arb_cache[i].lock);
+               i++;
+       }
+}
+
+/*
+ * vl_arb_lock_cache
+ *
+ * Take the lock of the low or high priority table cache and return that
+ * cache.  Any other index returns NULL without taking a lock.  Every
+ * other vl_arb_* function is meant to be called with this lock held.
+ */
+static inline struct vl_arb_cache *
+vl_arb_lock_cache(struct hfi1_pportdata *ppd, int idx)
+{
+       struct vl_arb_cache *cache;
+
+       if (idx == LO_PRIO_TABLE || idx == HI_PRIO_TABLE) {
+               cache = &ppd->vl_arb_cache[idx];
+               spin_lock(&cache->lock);
+               return cache;
+       }
+       return NULL;
+}
+
+/* Drop the lock of the table cache selected by @idx. */
+static inline void vl_arb_unlock_cache(struct hfi1_pportdata *ppd, int idx)
+{
+       struct vl_arb_cache *cache = &ppd->vl_arb_cache[idx];
+
+       spin_unlock(&cache->lock);
+}
+
+/* Copy all VL_ARB_TABLE_SIZE cached elements out to @vl. */
+static void vl_arb_get_cache(struct vl_arb_cache *cache,
+                            struct ib_vl_weight_elem *vl)
+{
+       const size_t nbytes = VL_ARB_TABLE_SIZE * sizeof(*vl);
+
+       memcpy(vl, cache->table, nbytes);
+}
+
+/* Overwrite the cached table with VL_ARB_TABLE_SIZE elements from @vl. */
+static void vl_arb_set_cache(struct vl_arb_cache *cache,
+                            struct ib_vl_weight_elem *vl)
+{
+       const size_t nbytes = VL_ARB_TABLE_SIZE * sizeof(*vl);
+
+       memcpy(cache->table, vl, nbytes);
+}
+
+/* Return 1 when @vl is byte-for-byte equal to the cached table, else 0. */
+static int vl_arb_match_cache(struct vl_arb_cache *cache,
+                             struct ib_vl_weight_elem *vl)
+{
+       const size_t nbytes = VL_ARB_TABLE_SIZE * sizeof(*vl);
+
+       return memcmp(cache->table, vl, nbytes) == 0;
+}
+
+/* end functions related to vl arbitration table caching */
+
+/*
+ * set_vl_weights() - write one VL arbitration list to the hardware.
+ *
+ * @ppd: port being configured
+ * @target: CSR offset of the first list entry; entries are 8 bytes apart
+ * @size: number of entries to write
+ * @vl: array of @size VL/weight elements
+ *
+ * The whole operation runs under hls_lock.  If the link is up and is_ax()
+ * is false, the data VLs are stopped and drained before the weights are
+ * written and reopened afterwards.
+ *
+ * Return: 0 on success, otherwise the value returned by
+ * stop_drain_data_vls(); in that case nothing has been written.
+ */
+static int set_vl_weights(struct hfi1_pportdata *ppd, u32 target,
+                         u32 size, struct ib_vl_weight_elem *vl)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       unsigned int idx;
+       int drain, ret = 0;
+       u64 entry;
+
+       mutex_lock(&ppd->hls_lock);
+
+       drain = !is_ax(dd) && (ppd->host_link_state & HLS_UP);
+
+       if (drain) {
+               /*
+                * Empty the per-VL FIFOs before touching the weights.  A
+                * packet on a VL whose weight is about to become 0 would
+                * otherwise sit in its FIFO with no chance to egress.
+                */
+               ret = stop_drain_data_vls(dd);
+               if (ret) {
+                       dd_dev_err(dd,
+                                  "%s: cannot stop/drain VLs - refusing to change VL arbitration weights\n",
+                                  __func__);
+                       goto unlock;
+               }
+       }
+
+       for (idx = 0; idx < size; idx++) {
+               /*
+                * NOTE: the low priority shift and mask are used for both
+                * lists; they are the same for the low and high registers.
+                */
+               entry = (u64)vl[idx].vl & SEND_LOW_PRIORITY_LIST_VL_MASK;
+               entry <<= SEND_LOW_PRIORITY_LIST_VL_SHIFT;
+               entry |= ((u64)vl[idx].weight
+                               & SEND_LOW_PRIORITY_LIST_WEIGHT_MASK)
+                               << SEND_LOW_PRIORITY_LIST_WEIGHT_SHIFT;
+               write_csr(dd, target + (idx * 8), entry);
+       }
+       pio_send_control(dd, PSC_GLOBAL_VLARB_ENABLE);
+
+       /* reopen all VLs if they were drained above */
+       if (drain)
+               open_fill_data_vls(dd);
+
+unlock:
+       mutex_unlock(&ppd->hls_lock);
+
+       return ret;
+}
+
+/*
+ * Read the credit merge VL register at @csr and store its dedicated and
+ * shared limit fields in @vll as big-endian 16-bit values.
+ */
+static void read_one_cm_vl(struct hfi1_devdata *dd, u32 csr,
+                          struct vl_limit *vll)
+{
+       const u64 val = read_csr(dd, csr);
+       u16 dedicated, shared;
+
+       dedicated = (val >> SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT)
+                   & SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_MASK;
+       shared = (val >> SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT)
+                & SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_MASK;
+
+       vll->dedicated = cpu_to_be16(dedicated);
+       vll->shared = cpu_to_be16(shared);
+}
+
+/*
+ * Read the current credit merge limits into @bc.
+ *
+ * @bc is zeroed first, since only the data VLs and VL15 are filled in.
+ * When @overall_limit is not NULL it receives the total credit limit field
+ * of SEND_CM_GLOBAL_CREDIT in CPU byte order.
+ *
+ * Return: the size of struct buffer_control in bytes.
+ */
+static int get_buffer_control(struct hfi1_devdata *dd,
+                             struct buffer_control *bc, u16 *overall_limit)
+{
+       u64 global;
+       int vl = 0;
+
+       memset(bc, 0, sizeof(*bc));
+
+       /* OPA VLs map 1-1 onto the HFI per-VL CSRs */
+       while (vl < TXE_NUM_DATA_VL) {
+               read_one_cm_vl(dd, SEND_CM_CREDIT_VL + (8 * vl), &bc->vl[vl]);
+               vl++;
+       }
+
+       /* NOTE: assumes that VL* and VL15 CSRs are bit-wise identical */
+       read_one_cm_vl(dd, SEND_CM_CREDIT_VL15, &bc->vl[15]);
+
+       global = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
+       bc->overall_shared_limit = cpu_to_be16(
+               (global >> SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT)
+               & SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_MASK);
+
+       if (overall_limit)
+               *overall_limit = (global
+                       >> SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT)
+                       & SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_MASK;
+
+       return sizeof(*bc);
+}
+
+/*
+ * Read the SC->VLnt mapping table from the hardware into @dp.
+ *
+ * Two CSRs hold 16 mappings each, 4 bits per mapping.  Entry n of a
+ * register is taken from bits [4n+3:4n] of the value returned by
+ * read_csr().  Extracting by shift keeps the result independent of host
+ * byte order; on a little-endian host it is the same result as walking
+ * the register value byte by byte through a u8 pointer.
+ *
+ * Return: the size of struct sc2vlnt in bytes.
+ */
+static int get_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp)
+{
+       /* number of 4-bit entries held by one 64-bit register */
+       const unsigned int per_reg = 2 * sizeof(u64);
+       unsigned int i;
+       u64 reg;
+
+       /* SC 0-15 */
+       reg = read_csr(dd, DCC_CFG_SC_VL_TABLE_15_0);
+       for (i = 0; i < per_reg; i++)
+               dp->vlnt[i] = (reg >> (4 * i)) & 0xf;
+
+       /* SC 16-31 */
+       reg = read_csr(dd, DCC_CFG_SC_VL_TABLE_31_16);
+       for (i = 0; i < per_reg; i++)
+               dp->vlnt[per_reg + i] = (reg >> (4 * i)) & 0xf;
+
+       return sizeof(struct sc2vlnt);
+}
+
+/*
+ * Fill @nelems elements of @vl with the fixed value VL 0xf, weight 0.
+ * @dd is not used.
+ */
+static void get_vlarb_preempt(struct hfi1_devdata *dd, u32 nelems,
+                             struct ib_vl_weight_elem *vl)
+{
+       unsigned int idx = 0;
+
+       while (idx < nelems) {
+               vl[idx].vl = 0xf;
+               vl[idx].weight = 0;
+               idx++;
+       }
+}
+
+static void set_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp)
+{
+       write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0,
+                 DC_SC_VL_VAL(15_0,
+                              0, dp->vlnt[0] & 0xf,
+                              1, dp->vlnt[1] & 0xf,
+                              2, dp->vlnt[2] & 0xf,
+                              3, dp->vlnt[3] & 0xf,
+                              4, dp->vlnt[4] & 0xf,
+                              5, dp->vlnt[5] & 0xf,
+                              6, dp->vlnt[6] & 0xf,
+                              7, dp->vlnt[7] & 0xf,
+                              8, dp->vlnt[8] & 0xf,
+                              9, dp->vlnt[9] & 0xf,
+                              10, dp->vlnt[10] & 0xf,
+                              11, dp->vlnt[11] & 0xf,
+                              12, dp->vlnt[12] & 0xf,
+                              13, dp->vlnt[13] & 0xf,
+                              14, dp->vlnt[14] & 0xf,
+                              15, dp->vlnt[15] & 0xf));
+       write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16,
+                 DC_SC_VL_VAL(31_16,
+                              16, dp->vlnt[16] & 0xf,
+                              17, dp->vlnt[17] & 0xf,
+                              18, dp->vlnt[18] & 0xf,
+                              19, dp->vlnt[19] & 0xf,
+                              20, dp->vlnt[20] & 0xf,
+                              21, dp->vlnt[21] & 0xf,
+                              22, dp->vlnt[22] & 0xf,
+                              23, dp->vlnt[23] & 0xf,
+                              24, dp->vlnt[24] & 0xf,
+                              25, dp->vlnt[25] & 0xf,
+                              26, dp->vlnt[26] & 0xf,
+                              27, dp->vlnt[27] & 0xf,
+                              28, dp->vlnt[28] & 0xf,
+                              29, dp->vlnt[29] & 0xf,
+                              30, dp->vlnt[30] & 0xf,
+                              31, dp->vlnt[31] & 0xf));
+}
+
+/* Log an informational message when @limit for VL @idx is not zero. */
+static void nonzero_msg(struct hfi1_devdata *dd, int idx, const char *what,
+                       u16 limit)
+{
+       if (!limit)
+               return;
+
+       dd_dev_info(dd, "Invalid %s limit %d on VL %d, ignoring\n",
+                   what, (int)limit, idx);
+}
+
+/*
+ * Read-modify-write SendCmGlobalCredit so that only its shared limit
+ * field changes to @limit.
+ */
+static void set_global_shared(struct hfi1_devdata *dd, u16 limit)
+{
+       u64 val = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
+
+       val = (val & ~SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK)
+             | ((u64)limit << SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT);
+       write_csr(dd, SEND_CM_GLOBAL_CREDIT, val);
+}
+
+/*
+ * Read-modify-write SendCmGlobalCredit so that only its total credit
+ * limit field changes to @limit.
+ */
+static void set_global_limit(struct hfi1_devdata *dd, u16 limit)
+{
+       u64 val = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
+
+       val = (val & ~SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK)
+             | ((u64)limit << SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT);
+       write_csr(dd, SEND_CM_GLOBAL_CREDIT, val);
+}
+
+/*
+ * Set the shared limit of one VL.  A @vl below TXE_NUM_DATA_VL selects that
+ * data VL's register; any other value selects the VL15 register.
+ */
+static void set_vl_shared(struct hfi1_devdata *dd, int vl, u16 limit)
+{
+       u32 csr = SEND_CM_CREDIT_VL15;
+       u64 val;
+
+       if (vl < TXE_NUM_DATA_VL)
+               csr = SEND_CM_CREDIT_VL + (8 * vl);
+
+       val = read_csr(dd, csr);
+       val = (val & ~SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SMASK)
+             | ((u64)limit << SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT);
+       write_csr(dd, csr, val);
+}
+
+/*
+ * Set the dedicated limit of one VL.  A @vl below TXE_NUM_DATA_VL selects
+ * that data VL's register; any other value selects the VL15 register.
+ */
+static void set_vl_dedicated(struct hfi1_devdata *dd, int vl, u16 limit)
+{
+       u32 csr = SEND_CM_CREDIT_VL15;
+       u64 val;
+
+       if (vl < TXE_NUM_DATA_VL)
+               csr = SEND_CM_CREDIT_VL + (8 * vl);
+
+       val = read_csr(dd, csr);
+       val = (val & ~SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SMASK)
+             | ((u64)limit << SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT);
+       write_csr(dd, csr, val);
+}
+
+/*
+ * Poll SEND_CM_CREDIT_USED_STATUS until every bit in @mask reads zero or
+ * VL_STATUS_CLEAR_TIMEOUT has elapsed.  A timeout is only logged, with
+ * @which naming the kind of credit change; the caller is not told.
+ */
+static void wait_for_vl_status_clear(struct hfi1_devdata *dd, u64 mask,
+                                    const char *which)
+{
+       unsigned long deadline;
+       u64 pending;
+
+       deadline = jiffies + msecs_to_jiffies(VL_STATUS_CLEAR_TIMEOUT);
+       for (;;) {
+               pending = read_csr(dd, SEND_CM_CREDIT_USED_STATUS) & mask;
+               if (!pending)
+                       return; /* all requested status bits are clear */
+
+               if (time_after(jiffies, deadline))
+                       break;
+
+               udelay(1);
+       }
+
+       /* timed out with some status bits still set */
+       dd_dev_err(dd,
+                  "%s credit change status not clearing after %dms, mask 0x%llx, not clear 0x%llx\n",
+                  which, VL_STATUS_CLEAR_TIMEOUT, mask, pending);
+       /*
+        * A credit loss on the link is the likely cause, and a link bounce
+        * is the only recovery from that.
+        */
+       dd_dev_err(dd,
+                  "Continuing anyway.  A credit loss may occur.  Suggest a link bounce\n");
+}
+
+/*
+ * set_buffer_control() - apply a new set of per-VL and global send credit
+ * limits to the hardware.
+ *
+ * @ppd: port whose device CSRs are updated and whose VL maps are rebuilt
+ * @new_bc: requested limits, 16-bit fields in big-endian form; entries for
+ *     VLs that are not valid (see valid_vl() below) are zeroed in place
+ *
+ * Return: 0 on success, or the non-zero value returned by sdma_map_init()
+ * or pio_map_init() when rebuilding the VL maps fails.
+ *
+ * The number of credits on the VLs may be changed while everything
+ * is "live", but the following algorithm must be followed due to
+ * how the hardware is actually implemented.  In particular,
+ * Return_Credit_Status[] is the only correct status check.
+ *
+ * if (reducing Global_Shared_Credit_Limit or any shared limit changing)
+ *     set Global_Shared_Credit_Limit = 0
+ *     use_all_vl = 1
+ * mask0 = all VLs that are changing either dedicated or shared limits
+ * set Shared_Limit[mask0] = 0
+ * spin until Return_Credit_Status[use_all_vl ? all VL : mask0] == 0
+ * if (changing any dedicated limit)
+ *     mask1 = all VLs that are lowering dedicated limits
+ *     lower Dedicated_Limit[mask1]
+ *     spin until Return_Credit_Status[mask1] == 0
+ *     raise Dedicated_Limits
+ * raise Shared_Limits
+ * raise Global_Shared_Credit_Limit
+ *
+ * lower = if the new limit is lower, set the limit to the new value
+ * raise = if the new limit is higher than the current value (may be changed
+ *     earlier in the algorithm), set the new limit to the new value
+ */
+int set_buffer_control(struct hfi1_pportdata *ppd,
+                      struct buffer_control *new_bc)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 changing_mask, ld_mask, stat_mask;
+       int change_count;
+       int i, use_all_mask;
+       int this_shared_changing;
+       int vl_count = 0, ret;
+       /*
+        * A0: add the variable any_shared_limit_changing below and in the
+        * algorithm above.  If removing A0 support, it can be removed.
+        */
+       int any_shared_limit_changing;
+       struct buffer_control cur_bc;
+       u8 changing[OPA_MAX_VLS];
+       u8 lowering_dedicated[OPA_MAX_VLS];
+       u16 cur_total;
+       u32 new_total = 0;
+       const u64 all_mask =
+       SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL1_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL2_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL3_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL4_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL5_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL6_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL7_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL15_RETURN_CREDIT_STATUS_SMASK;
+
+#define valid_vl(idx) ((idx) < TXE_NUM_DATA_VL || (idx) == 15)
+#define NUM_USABLE_VLS 16      /* look at VL15 and less */
+
+       /*
+        * find the new total credits, do sanity check on unused VLs:
+        * the total is the sum of the dedicated limits of the valid VLs
+        * plus the overall shared limit; non-zero limits on a VL that is
+        * not valid are reported and then zeroed in the caller's table
+        */
+       for (i = 0; i < OPA_MAX_VLS; i++) {
+               if (valid_vl(i)) {
+                       new_total += be16_to_cpu(new_bc->vl[i].dedicated);
+                       continue;
+               }
+               nonzero_msg(dd, i, "dedicated",
+                           be16_to_cpu(new_bc->vl[i].dedicated));
+               nonzero_msg(dd, i, "shared",
+                           be16_to_cpu(new_bc->vl[i].shared));
+               new_bc->vl[i].dedicated = 0;
+               new_bc->vl[i].shared = 0;
+       }
+       new_total += be16_to_cpu(new_bc->overall_shared_limit);
+
+       /*
+        * fetch the current values; cur_total receives the total credit
+        * limit field of SEND_CM_GLOBAL_CREDIT
+        */
+       get_buffer_control(dd, &cur_bc, &cur_total);
+
+       /*
+        * Create the masks we will use.
+        *
+        * changing[] / changing_mask: VLs whose dedicated or shared limit
+        * differs from the current value.
+        * lowering_dedicated[] / ld_mask: VLs whose dedicated limit is
+        * being reduced.
+        */
+       memset(changing, 0, sizeof(changing));
+       memset(lowering_dedicated, 0, sizeof(lowering_dedicated));
+       /*
+        * NOTE: Assumes that the individual VL bits are adjacent and in
+        * increasing order
+        *
+        * stat_mask moves up one bit for every VL index, including the
+        * indices that are skipped because they are not valid.
+        */
+       stat_mask =
+               SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK;
+       changing_mask = 0;
+       ld_mask = 0;
+       change_count = 0;
+       any_shared_limit_changing = 0;
+       for (i = 0; i < NUM_USABLE_VLS; i++, stat_mask <<= 1) {
+               if (!valid_vl(i))
+                       continue;
+               this_shared_changing = new_bc->vl[i].shared
+                                               != cur_bc.vl[i].shared;
+               if (this_shared_changing)
+                       any_shared_limit_changing = 1;
+               if (new_bc->vl[i].dedicated != cur_bc.vl[i].dedicated ||
+                   this_shared_changing) {
+                       changing[i] = 1;
+                       changing_mask |= stat_mask;
+                       change_count++;
+               }
+               if (be16_to_cpu(new_bc->vl[i].dedicated) <
+                                       be16_to_cpu(cur_bc.vl[i].dedicated)) {
+                       lowering_dedicated[i] = 1;
+                       ld_mask |= stat_mask;
+               }
+       }
+
+       /*
+        * bracket the credit change with a total adjustment: a larger
+        * total is written now, before any limit is raised; a smaller
+        * total is written only after all other limits are in place
+        */
+       if (new_total > cur_total)
+               set_global_limit(dd, new_total);
+
+       /*
+        * Start the credit change algorithm.
+        *
+        * cur_bc is updated along the way so that the later "raise" steps
+        * compare against what is actually programmed at that point.
+        */
+       use_all_mask = 0;
+       if ((be16_to_cpu(new_bc->overall_shared_limit) <
+            be16_to_cpu(cur_bc.overall_shared_limit)) ||
+           (is_ax(dd) && any_shared_limit_changing)) {
+               set_global_shared(dd, 0);
+               cur_bc.overall_shared_limit = 0;
+               use_all_mask = 1;
+       }
+
+       for (i = 0; i < NUM_USABLE_VLS; i++) {
+               if (!valid_vl(i))
+                       continue;
+
+               if (changing[i]) {
+                       set_vl_shared(dd, i, 0);
+                       cur_bc.vl[i].shared = 0;
+               }
+       }
+
+       wait_for_vl_status_clear(dd, use_all_mask ? all_mask : changing_mask,
+                                "shared");
+
+       if (change_count > 0) {
+               for (i = 0; i < NUM_USABLE_VLS; i++) {
+                       if (!valid_vl(i))
+                               continue;
+
+                       if (lowering_dedicated[i]) {
+                               set_vl_dedicated(dd, i,
+                                                be16_to_cpu(new_bc->
+                                                            vl[i].dedicated));
+                               cur_bc.vl[i].dedicated =
+                                               new_bc->vl[i].dedicated;
+                       }
+               }
+
+               wait_for_vl_status_clear(dd, ld_mask, "dedicated");
+
+               /* now raise all dedicated that are going up */
+               for (i = 0; i < NUM_USABLE_VLS; i++) {
+                       if (!valid_vl(i))
+                               continue;
+
+                       if (be16_to_cpu(new_bc->vl[i].dedicated) >
+                                       be16_to_cpu(cur_bc.vl[i].dedicated))
+                               set_vl_dedicated(dd, i,
+                                                be16_to_cpu(new_bc->
+                                                            vl[i].dedicated));
+               }
+       }
+
+       /* next raise all shared that are going up */
+       for (i = 0; i < NUM_USABLE_VLS; i++) {
+               if (!valid_vl(i))
+                       continue;
+
+               if (be16_to_cpu(new_bc->vl[i].shared) >
+                               be16_to_cpu(cur_bc.vl[i].shared))
+                       set_vl_shared(dd, i, be16_to_cpu(new_bc->vl[i].shared));
+       }
+
+       /* finally raise the global shared */
+       if (be16_to_cpu(new_bc->overall_shared_limit) >
+           be16_to_cpu(cur_bc.overall_shared_limit))
+               set_global_shared(dd,
+                                 be16_to_cpu(new_bc->overall_shared_limit));
+
+       /*
+        * bracket the credit change with a total adjustment: this is the
+        * closing half, taken only when the total shrinks
+        */
+       if (new_total < cur_total)
+               set_global_limit(dd, new_total);
+
+       /*
+        * Determine the actual number of operational VLS using the number of
+        * dedicated and shared credits for each VL.
+        *
+        * The SDMA and PIO VL maps are then rebuilt with that count.  If no
+        * data VL has any credit, ppd->vls_operational is passed instead.
+        * Nothing here runs when no VL limit changed.
+        */
+       if (change_count > 0) {
+               for (i = 0; i < TXE_NUM_DATA_VL; i++)
+                       if (be16_to_cpu(new_bc->vl[i].dedicated) > 0 ||
+                           be16_to_cpu(new_bc->vl[i].shared) > 0)
+                               vl_count++;
+               ppd->actual_vls_operational = vl_count;
+               ret = sdma_map_init(dd, ppd->port - 1, vl_count ?
+                                   ppd->actual_vls_operational :
+                                   ppd->vls_operational,
+                                   NULL);
+               if (ret == 0)
+                       ret = pio_map_init(dd, ppd->port - 1, vl_count ?
+                                          ppd->actual_vls_operational :
+                                          ppd->vls_operational, NULL);
+               if (ret)
+                       return ret;
+       }
+       return 0;
+}
+
+/*
+ * fm_get_table() - read one fabric manager table into @t.
+ *
+ * @ppd: port to read from
+ * @which: FM_TBL_* selector
+ * @t: destination buffer, interpreted according to @which
+ *
+ * Return: the size of the table in bytes on success, -EINVAL for an
+ * unknown selector.
+ */
+int fm_get_table(struct hfi1_pportdata *ppd, int which, void *t)
+{
+       struct vl_arb_cache *vlc;
+       int idx;
+
+       switch (which) {
+       case FM_TBL_VL_HIGH_ARB:
+       case FM_TBL_VL_LOW_ARB:
+               /*
+                * OPA specifies 128 elements (of 2 bytes each), though
+                * HFI supports only 16 elements in h/w.  The answer comes
+                * from the cache of the matching priority table.
+                */
+               if (which == FM_TBL_VL_HIGH_ARB)
+                       idx = HI_PRIO_TABLE;
+               else
+                       idx = LO_PRIO_TABLE;
+
+               vlc = vl_arb_lock_cache(ppd, idx);
+               vl_arb_get_cache(vlc, t);
+               vl_arb_unlock_cache(ppd, idx);
+               return 256;
+
+       case FM_TBL_BUFFER_CONTROL:
+               return get_buffer_control(ppd->dd, t, NULL);
+
+       case FM_TBL_SC2VLNT:
+               return get_sc2vlnt(ppd->dd, t);
+
+       case FM_TBL_VL_PREEMPT_ELEMS:
+               /*
+                * OPA specifies 128 elements, of 2 bytes each; only the
+                * first OPA_MAX_VLS elements are filled in here.
+                */
+               get_vlarb_preempt(ppd->dd, OPA_MAX_VLS, t);
+               return 256;
+
+       case FM_TBL_VL_PREEMPT_MATRIX:
+               /*
+                * OPA specifies that this is the same size as the VL
+                * arbitration tables (i.e., 256 bytes).  @t is not written.
+                */
+               return 256;
+
+       default:
+               return -EINVAL;
+       }
+}
+
+/*
+ * Write the given fabric manager table from the caller's buffer 't'.
+ *
+ * For the two VL arbitration tables, 't' is first compared with the
+ * cached copy while the cache is locked.  The cache is updated and the
+ * weights are programmed only when the contents differ.
+ *
+ * Returns 0 when nothing had to be done or for FM_TBL_SC2VLNT, -EINVAL
+ * for an unhandled 'which', and otherwise whatever the table-specific
+ * setter returns.
+ */
+int fm_set_table(struct hfi1_pportdata *ppd, int which, void *t)
+{
+       struct vl_arb_cache *cache;
+       int unchanged;
+
+       switch (which) {
+       case FM_TBL_VL_HIGH_ARB:
+               cache = vl_arb_lock_cache(ppd, HI_PRIO_TABLE);
+               unchanged = vl_arb_match_cache(cache, t);
+               if (!unchanged)
+                       vl_arb_set_cache(cache, t);
+               vl_arb_unlock_cache(ppd, HI_PRIO_TABLE);
+               if (unchanged)
+                       return 0;
+               /* the weights are programmed after the lock is dropped */
+               return set_vl_weights(ppd, SEND_HIGH_PRIORITY_LIST,
+                                     VL_ARB_HIGH_PRIO_TABLE_SIZE, t);
+       case FM_TBL_VL_LOW_ARB:
+               cache = vl_arb_lock_cache(ppd, LO_PRIO_TABLE);
+               unchanged = vl_arb_match_cache(cache, t);
+               if (!unchanged)
+                       vl_arb_set_cache(cache, t);
+               vl_arb_unlock_cache(ppd, LO_PRIO_TABLE);
+               if (unchanged)
+                       return 0;
+               /* the weights are programmed after the lock is dropped */
+               return set_vl_weights(ppd, SEND_LOW_PRIORITY_LIST,
+                                     VL_ARB_LOW_PRIO_TABLE_SIZE, t);
+       case FM_TBL_BUFFER_CONTROL:
+               return set_buffer_control(ppd, t);
+       case FM_TBL_SC2VLNT:
+               set_sc2vlnt(ppd->dd, t);
+               return 0;
+       default:
+               return -EINVAL;
+       }
+}
+
+/*
+ * Disable all data VLs by issuing PSC_DATA_VL_DISABLE through
+ * pio_send_control().  Nothing is done when is_ax() reports true
+ * for this device.
+ *
+ * Return 0 if disabled, non-zero if the VLs cannot be disabled.
+ */
+static int disable_data_vls(struct hfi1_devdata *dd)
+{
+       if (!is_ax(dd)) {
+               pio_send_control(dd, PSC_DATA_VL_DISABLE);
+               return 0;
+       }
+
+       return 1;
+}
+
+/*
+ * open_fill_data_vls() - the counterpart to stop_drain_data_vls().
+ * Re-enables all data VLs by issuing PSC_DATA_VL_ENABLE.  The "fill"
+ * part happens automatically; the name was only chosen for symmetry
+ * with stop_drain_data_vls().  Nothing is done when is_ax() reports
+ * true for this device.
+ *
+ * Return 0 if successful, non-zero if the VLs cannot be enabled.
+ */
+int open_fill_data_vls(struct hfi1_devdata *dd)
+{
+       if (!is_ax(dd)) {
+               pio_send_control(dd, PSC_DATA_VL_ENABLE);
+               return 0;
+       }
+
+       return 1;
+}
+
+/*
+ * drain_data_vls() - assumes that disable_data_vls() has been called,
+ * wait for occupancy (of per-VL FIFOs) for all contexts, and SDMA
+ * engines to drop to 0.
+ *
+ * This is done by calling, in order, sc_wait(), sdma_wait() and
+ * pause_for_credit_return() on the device.  Nothing is returned, so
+ * the caller gets no indication of whether the drain completed.
+ */
+static void drain_data_vls(struct hfi1_devdata *dd)
+{
+       sc_wait(dd);
+       sdma_wait(dd);
+       pause_for_credit_return(dd);
+}
+
+/*
+ * stop_drain_data_vls() - disable, then drain all per-VL fifos.
+ *
+ * The drain step is skipped when the VLs could not be disabled; in
+ * that case the non-zero result of disable_data_vls() is returned.
+ * Returns 0 once the VLs have been disabled and drained.
+ *
+ * Use open_fill_data_vls() to resume using data VLs.  This pair is
+ * meant to be used like this:
+ *
+ * stop_drain_data_vls(dd);
+ * // do things with per-VL resources
+ * open_fill_data_vls(dd);
+ */
+int stop_drain_data_vls(struct hfi1_devdata *dd)
+{
+       int err = disable_data_vls(dd);
+
+       if (err)
+               return err;
+
+       drain_data_vls(dd);
+       return 0;
+}
+
+/*
+ * Convert a nanosecond time to a cclock count.  No matter how slow
+ * the cclock, a non-zero ns will always have a non-zero result.
+ *
+ * The conversion is ns * 1000 / <cclock period in ps>, truncated.  The
+ * product ns * 1000 no longer fits in 32 bits once ns exceeds roughly
+ * 4.29 ms, so the division is split into a quotient and a remainder
+ * part.  This yields exactly the same truncated result while every
+ * intermediate value stays within 32 bits.
+ */
+u32 ns_to_cclock(struct hfi1_devdata *dd, u32 ns)
+{
+       u32 cclock_ps;
+       u32 cclocks;
+
+       if (dd->icode == ICODE_FPGA_EMULATION)
+               cclock_ps = FPGA_CCLOCK_PS;
+       else  /* simulation pretends to be ASIC */
+               cclock_ps = ASIC_CCLOCK_PS;
+
+       /*
+        * floor(ns * 1000 / ps) ==
+        *      (ns / ps) * 1000 + ((ns % ps) * 1000) / ps
+        *
+        * The remainder term is below ps * 1000, which fits in 32 bits
+        * for any clock period under ~4.29 us.
+        * NOTE(review): assumes both CCLOCK_PS constants satisfy that
+        * bound - confirm against their definitions.
+        */
+       cclocks = (ns / cclock_ps) * 1000 +
+                 ((ns % cclock_ps) * 1000) / cclock_ps;
+       if (ns && !cclocks)     /* if ns nonzero, must be at least 1 */
+               cclocks = 1;
+       return cclocks;
+}
+
+/*
+ * Convert a cclock count to nanoseconds.  No matter how slow
+ * the cclock, a non-zero cclocks will always have a non-zero result.
+ *
+ * The conversion is cclocks * <cclock period in ps> / 1000, truncated.
+ * The product cclocks * period overflows 32 bits long before the final
+ * result does, so the division is split into a quotient and a remainder
+ * part.  This yields exactly the same truncated result whenever that
+ * result itself fits in a u32; a result larger than that still wraps.
+ */
+u32 cclock_to_ns(struct hfi1_devdata *dd, u32 cclocks)
+{
+       u32 cclock_ps;
+       u32 ns;
+
+       if (dd->icode == ICODE_FPGA_EMULATION)
+               cclock_ps = FPGA_CCLOCK_PS;
+       else  /* simulation pretends to be ASIC */
+               cclock_ps = ASIC_CCLOCK_PS;
+
+       /*
+        * floor(cclocks * ps / 1000) ==
+        *      (cclocks / 1000) * ps + ((cclocks % 1000) * ps) / 1000
+        *
+        * The remainder term is below 1000 * ps, which fits in 32 bits
+        * for any clock period under ~4.29 us.
+        * NOTE(review): assumes both CCLOCK_PS constants satisfy that
+        * bound - confirm against their definitions.
+        */
+       ns = (cclocks / 1000) * cclock_ps +
+            ((cclocks % 1000) * cclock_ps) / 1000;
+       if (cclocks && !ns)     /* if cclocks nonzero, must be at least 1 */
+               ns = 1;
+       return ns;
+}
+
+/*
+ * Dynamically adjust the receive interrupt timeout for a context based on
+ * incoming packet rate.
+ *
+ * The timeout is doubled when the number of packets received in this
+ * interrupt is greater than or equal to rcv_intr_count, and halved when
+ * it is less.  A steady state is therefore only possible at the two
+ * endpoints: a timeout below 2, or one that has reached
+ * dd->rcv_intr_timeout_csr.  At an endpoint nothing is written.
+ *
+ * NOTE: Dynamic adjustment does not allow rcv_intr_count to be zero.
+ */
+static void adjust_rcv_timeout(struct hfi1_ctxtdata *rcd, u32 npkts)
+{
+       struct hfi1_devdata *dd = rcd->dd;
+       u32 cur = rcd->rcvavail_timeout;
+       u32 next;
+
+       if (npkts >= rcv_intr_count) {
+               /*
+                * More than enough packets arrived before the timeout,
+                * adjust timeout upward - unless already at the maximum.
+                */
+               if (cur >= dd->rcv_intr_timeout_csr)
+                       return;
+               next = min(cur << 1, dd->rcv_intr_timeout_csr);
+       } else {
+               /*
+                * Not enough packets arrived before the timeout, adjust
+                * timeout downward - unless already at the minimum.
+                */
+               if (cur < 2)
+                       return;
+               next = cur >> 1;
+       }
+
+       /* remember the new value, then push it to the hardware */
+       rcd->rcvavail_timeout = next;
+       /*
+        * The new timeout cannot be larger than rcv_intr_timeout_csr, which
+        * has already been verified to be in range.
+        */
+       write_kctxt_csr(dd, rcd->ctxt, RCV_AVAIL_TIME_OUT,
+                       (u64)next << RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT);
+}
+
+/*
+ * Publish new receive queue head positions for context 'rcd' to the
+ * hardware.
+ *
+ * @rcd: receive context to update
+ * @hd: new header queue head, masked into RCV_HDR_HEAD
+ * @updegr: when non-zero, RCV_EGR_INDEX_HEAD is also written
+ * @egrhd: new eager index head (only used when @updegr is non-zero)
+ * @intr_adjust: when non-zero, the interrupt timeout is re-tuned first
+ * @npkts: packet count handed to adjust_rcv_timeout()
+ *
+ * RCV_HDR_HEAD is always written, with its counter field loaded from
+ * rcv_intr_count.
+ */
+void update_usrhead(struct hfi1_ctxtdata *rcd, u32 hd, u32 updegr, u32 egrhd,
+                   u32 intr_adjust, u32 npkts)
+{
+       struct hfi1_devdata *dd = rcd->dd;
+       u64 reg;
+       u32 ctxt = rcd->ctxt;
+
+       /*
+        * Need to write timeout register before updating RcvHdrHead to ensure
+        * that a new value is used when the HW decides to restart counting.
+        */
+       if (intr_adjust)
+               adjust_rcv_timeout(rcd, npkts);
+       if (updegr) {
+               reg = (egrhd & RCV_EGR_INDEX_HEAD_HEAD_MASK)
+                       << RCV_EGR_INDEX_HEAD_HEAD_SHIFT;
+               write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, reg);
+       }
+       /* barrier between the optional eager head write and the head write */
+       mmiowb();
+       reg = ((u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT) |
+               (((u64)hd & RCV_HDR_HEAD_HEAD_MASK)
+                       << RCV_HDR_HEAD_HEAD_SHIFT);
+       write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg);
+       mmiowb();
+}
+
+/*
+ * Report whether the receive header queue of 'rcd' is empty, i.e. whether
+ * the head field of the RCV_HDR_HEAD CSR equals the tail.  The tail comes
+ * from get_rcvhdrtail() when rcd->rcvhdrtail_kvaddr is set, and from the
+ * RCV_HDR_TAIL CSR otherwise.
+ *
+ * Returns non-zero when head and tail are equal.
+ */
+u32 hdrqempty(struct hfi1_ctxtdata *rcd)
+{
+       u32 tail;
+       u32 head = (read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_HEAD)
+                   & RCV_HDR_HEAD_HEAD_SMASK) >> RCV_HDR_HEAD_HEAD_SHIFT;
+
+       tail = rcd->rcvhdrtail_kvaddr ?
+               get_rcvhdrtail(rcd) :
+               read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL);
+
+       return head == tail;
+}
+
+/*
+ * Context Control and Receive Array encoding for buffer size:
+ *     0x0 invalid
+ *     0x1   4 KB
+ *     0x2   8 KB
+ *     0x3  16 KB
+ *     0x4  32 KB
+ *     0x5  64 KB
+ *     0x6 128 KB
+ *     0x7 256 KB
+ *     0x8 512 KB (Receive Array only)
+ *     0x9   1 MB (Receive Array only)
+ *     0xa   2 MB (Receive Array only)
+ *
+ *     0xB-0xF - reserved (Receive Array only)
+ *
+ * Each encoding step doubles the size, starting from 4 KB at 0x1, so the
+ * table is walked by shifting rather than spelled out case by case.
+ *
+ * This routine assumes that the value has already been sanity checked;
+ * a size that matches no entry is encoded as the minimum (0x1).
+ */
+static u32 encoded_size(u32 size)
+{
+       u32 bytes = 4 * 1024;
+       u32 code;
+
+       for (code = 0x1; code <= 0xa; code++, bytes <<= 1) {
+               if (size == bytes)
+                       return code;
+       }
+       return 0x1;     /* if invalid, go with the minimum size */
+}
+
+/*
+ * Apply the receive-control operations in 'op' (a mask of
+ * HFI1_RCVCTRL_* bits) to receive context 'ctxt'.
+ *
+ * Nothing is done when dd->rcd[ctxt] is NULL.  Otherwise the current
+ * RCV_CTXT_CTRL value is read, modified according to 'op', cached in
+ * rcd->rcvctrl and written back.  When the context goes from disabled
+ * to enabled, the header queue, eager and TID CSRs are (re)programmed
+ * before the enable is written, and the interrupt timeout and count are
+ * programmed after it.
+ */
+void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt)
+{
+       struct hfi1_ctxtdata *rcd;
+       u64 rcvctrl, reg;
+       int did_enable = 0;
+
+       rcd = dd->rcd[ctxt];
+       if (!rcd)
+               return;
+
+       hfi1_cdbg(RCVCTRL, "ctxt %d op 0x%x", ctxt, op);
+
+       rcvctrl = read_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL);
+       /* if the context already enabled, don't do the extra steps */
+       if ((op & HFI1_RCVCTRL_CTXT_ENB) &&
+           !(rcvctrl & RCV_CTXT_CTRL_ENABLE_SMASK)) {
+               /* reset the tail and hdr addresses, and sequence count */
+               write_kctxt_csr(dd, ctxt, RCV_HDR_ADDR,
+                               rcd->rcvhdrq_phys);
+               if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL))
+                       write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR,
+                                       rcd->rcvhdrqtailaddr_phys);
+               rcd->seq_cnt = 1;
+
+               /* reset the cached receive header queue head value */
+               rcd->head = 0;
+
+               /*
+                * Zero the receive header queue so we don't get false
+                * positives when checking the sequence number.  The
+                * sequence numbers could land exactly on the same spot.
+                * E.g. a rcd restart before the receive header wrapped.
+                */
+               memset(rcd->rcvhdrq, 0, rcd->rcvhdrq_size);
+
+               /* starting timeout */
+               rcd->rcvavail_timeout = dd->rcv_intr_timeout_csr;
+
+               /* enable the context */
+               rcvctrl |= RCV_CTXT_CTRL_ENABLE_SMASK;
+
+               /* clean the egr buffer size first */
+               rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK;
+               rcvctrl |= ((u64)encoded_size(rcd->egrbufs.rcvtid_size)
+                               & RCV_CTXT_CTRL_EGR_BUF_SIZE_MASK)
+                                       << RCV_CTXT_CTRL_EGR_BUF_SIZE_SHIFT;
+
+               /* zero RcvHdrHead - set RcvHdrHead.Counter after enable */
+               write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0);
+               did_enable = 1;
+
+               /* zero RcvEgrIndexHead */
+               write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, 0);
+
+               /* set eager count and base index */
+               reg = (((u64)(rcd->egrbufs.alloced >> RCV_SHIFT)
+                       & RCV_EGR_CTRL_EGR_CNT_MASK)
+                      << RCV_EGR_CTRL_EGR_CNT_SHIFT) |
+                       (((rcd->eager_base >> RCV_SHIFT)
+                         & RCV_EGR_CTRL_EGR_BASE_INDEX_MASK)
+                        << RCV_EGR_CTRL_EGR_BASE_INDEX_SHIFT);
+               write_kctxt_csr(dd, ctxt, RCV_EGR_CTRL, reg);
+
+               /*
+                * Set TID (expected) count and base index.
+                * rcd->expected_count is set to individual RcvArray entries,
+                * not pairs, and the CSR takes a pair-count in groups of
+                * four, so divide by 8.
+                */
+               reg = (((rcd->expected_count >> RCV_SHIFT)
+                                       & RCV_TID_CTRL_TID_PAIR_CNT_MASK)
+                               << RCV_TID_CTRL_TID_PAIR_CNT_SHIFT) |
+                     (((rcd->expected_base >> RCV_SHIFT)
+                                       & RCV_TID_CTRL_TID_BASE_INDEX_MASK)
+                               << RCV_TID_CTRL_TID_BASE_INDEX_SHIFT);
+               write_kctxt_csr(dd, ctxt, RCV_TID_CTRL, reg);
+               /* enabling the control context also points RCV_VL15 at it */
+               if (ctxt == HFI1_CTRL_CTXT)
+                       write_csr(dd, RCV_VL15, HFI1_CTRL_CTXT);
+       }
+       if (op & HFI1_RCVCTRL_CTXT_DIS) {
+               /*
+                * NOTE(review): RCV_VL15 is cleared for a disable of any
+                * context, not only HFI1_CTRL_CTXT - confirm this is intended.
+                */
+               write_csr(dd, RCV_VL15, 0);
+               /*
+                * When receive context is being disabled turn on tail
+                * update with a dummy tail address and then disable
+                * receive context.
+                */
+               if (dd->rcvhdrtail_dummy_physaddr) {
+                       write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR,
+                                       dd->rcvhdrtail_dummy_physaddr);
+                       /* Enabling RcvCtxtCtrl.TailUpd is intentional. */
+                       rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK;
+               }
+
+               rcvctrl &= ~RCV_CTXT_CTRL_ENABLE_SMASK;
+       }
+       /* the remaining op bits each set or clear one RCV_CTXT_CTRL field */
+       if (op & HFI1_RCVCTRL_INTRAVAIL_ENB)
+               rcvctrl |= RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
+       if (op & HFI1_RCVCTRL_INTRAVAIL_DIS)
+               rcvctrl &= ~RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
+       /* tail update is only enabled when a tail address has been set up */
+       if (op & HFI1_RCVCTRL_TAILUPD_ENB && rcd->rcvhdrqtailaddr_phys)
+               rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK;
+       if (op & HFI1_RCVCTRL_TAILUPD_DIS) {
+               /* See comment on RcvCtxtCtrl.TailUpd above */
+               if (!(op & HFI1_RCVCTRL_CTXT_DIS))
+                       rcvctrl &= ~RCV_CTXT_CTRL_TAIL_UPD_SMASK;
+       }
+       if (op & HFI1_RCVCTRL_TIDFLOW_ENB)
+               rcvctrl |= RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK;
+       if (op & HFI1_RCVCTRL_TIDFLOW_DIS)
+               rcvctrl &= ~RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK;
+       if (op & HFI1_RCVCTRL_ONE_PKT_EGR_ENB) {
+               /*
+                * In one-packet-per-eager mode, the size comes from
+                * the RcvArray entry.
+                */
+               rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK;
+               rcvctrl |= RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK;
+       }
+       if (op & HFI1_RCVCTRL_ONE_PKT_EGR_DIS)
+               rcvctrl &= ~RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK;
+       if (op & HFI1_RCVCTRL_NO_RHQ_DROP_ENB)
+               rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK;
+       if (op & HFI1_RCVCTRL_NO_RHQ_DROP_DIS)
+               rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK;
+       if (op & HFI1_RCVCTRL_NO_EGR_DROP_ENB)
+               rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK;
+       if (op & HFI1_RCVCTRL_NO_EGR_DROP_DIS)
+               rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK;
+       /* cache the final value in the context, then commit it to hardware */
+       rcd->rcvctrl = rcvctrl;
+       hfi1_cdbg(RCVCTRL, "ctxt %d rcvctrl 0x%llx\n", ctxt, rcvctrl);
+       write_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL, rcd->rcvctrl);
+
+       /* work around sticky RcvCtxtStatus.BlockedRHQFull */
+       if (did_enable &&
+           (rcvctrl & RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK)) {
+               reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS);
+               if (reg != 0) {
+                       dd_dev_info(dd, "ctxt %d status %lld (blocked)\n",
+                                   ctxt, reg);
+                       /* move RcvHdrHead away from 0 and back, then re-check */
+                       read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD);
+                       write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x10);
+                       write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x00);
+                       read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD);
+                       reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS);
+                       dd_dev_info(dd, "ctxt %d status %lld (%s blocked)\n",
+                                   ctxt, reg, reg == 0 ? "not" : "still");
+               }
+       }
+
+       if (did_enable) {
+               /*
+                * The interrupt timeout and count must be set after
+                * the context is enabled to take effect.
+                */
+               /* set interrupt timeout */
+               write_kctxt_csr(dd, ctxt, RCV_AVAIL_TIME_OUT,
+                               (u64)rcd->rcvavail_timeout <<
+                               RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT);
+
+               /* set RcvHdrHead.Counter, zero RcvHdrHead.Head (again) */
+               reg = (u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT;
+               write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg);
+       }
+
+       if (op & (HFI1_RCVCTRL_TAILUPD_DIS | HFI1_RCVCTRL_CTXT_DIS))
+               /*
+                * If the context has been disabled and the Tail Update has
+                * been cleared, set the RCV_HDR_TAIL_ADDR CSR to dummy address
+                * so it doesn't contain an address that is invalid.
+                *
+                * NOTE(review): the test fires when either op bit is set, and
+                * the dummy address is written without the non-zero check used
+                * in the disable path above - confirm both are intended.
+                */
+               write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR,
+                               dd->rcvhdrtail_dummy_physaddr);
+}
+
+/*
+ * Fetch either the device counter names or the device counter values.
+ *
+ * When 'namep' is non-NULL, *namep is pointed at dd->cntrnames and
+ * dd->cntrnameslen is returned; 'cntrp' is not used.  Otherwise *cntrp
+ * is pointed at dd->cntrs, every entry of dev_cntrs[] that is not
+ * flagged CNTR_DISABLED is read into that array (one slot per VL or per
+ * SDMA engine for entries flagged CNTR_VL or CNTR_SDMA), and
+ * dd->ndevcntrs * sizeof(u64) is returned.
+ */
+u32 hfi1_read_cntrs(struct hfi1_devdata *dd, char **namep, u64 **cntrp)
+{
+       const struct cntr_entry *entry;
+       u64 val = 0;
+       int ret;
+       int i, j;
+
+       if (namep) {
+               /* caller only wants the name block */
+               ret = dd->cntrnameslen;
+               *namep = dd->cntrnames;
+               return ret;
+       }
+
+       ret = (dd->ndevcntrs) * sizeof(u64);
+
+       /* Get the start of the block of counters */
+       *cntrp = dd->cntrs;
+
+       /*
+        * Now go and fill in each counter in the block.
+        */
+       for (i = 0; i < DEV_CNTR_LAST; i++) {
+               entry = &dev_cntrs[i];
+               hfi1_cdbg(CNTR, "reading %s", entry->name);
+               if (entry->flags & CNTR_DISABLED) {
+                       /* Nothing */
+                       hfi1_cdbg(CNTR, "\tDisabled\n");
+                       continue;
+               }
+
+               if (entry->flags & CNTR_VL) {
+                       hfi1_cdbg(CNTR, "\tPer VL\n");
+                       for (j = 0; j < C_VL_COUNT; j++) {
+                               val = entry->rw_cntr(entry, dd, j,
+                                                    CNTR_MODE_R, 0);
+                               hfi1_cdbg(CNTR, "\t\tRead 0x%llx for %d\n",
+                                         val, j);
+                               dd->cntrs[entry->offset + j] = val;
+                       }
+               } else if (entry->flags & CNTR_SDMA) {
+                       hfi1_cdbg(CNTR, "\t Per SDMA Engine\n");
+                       for (j = 0; j < dd->chip_sdma_engines; j++) {
+                               val = entry->rw_cntr(entry, dd, j,
+                                                    CNTR_MODE_R, 0);
+                               hfi1_cdbg(CNTR, "\t\tRead 0x%llx for %d\n",
+                                         val, j);
+                               dd->cntrs[entry->offset + j] = val;
+                       }
+               } else {
+                       val = entry->rw_cntr(entry, dd, CNTR_INVALID_VL,
+                                            CNTR_MODE_R, 0);
+                       dd->cntrs[entry->offset] = val;
+                       hfi1_cdbg(CNTR, "\tRead 0x%llx", val);
+               }
+       }
+       return ret;
+}
+
+/*
+ * Used by sysfs to create files for hfi stats to read
+ *
+ * When 'namep' is non-NULL, *namep is pointed at the port counter names
+ * and their length is returned; 'cntrp' is not used.  Otherwise *cntrp
+ * is pointed at ppd->cntrs, every entry of port_cntrs[] that is not
+ * flagged CNTR_DISABLED is read into that array (one slot per VL for
+ * entries flagged CNTR_VL), and nportcntrs * sizeof(u64) is returned.
+ */
+u32 hfi1_read_portcntrs(struct hfi1_pportdata *ppd, char **namep, u64 **cntrp)
+{
+       const struct cntr_entry *entry;
+       u64 val = 0;
+       int ret;
+       int i, j;
+
+       if (namep) {
+               /* caller only wants the name block */
+               ret = ppd->dd->portcntrnameslen;
+               *namep = ppd->dd->portcntrnames;
+               return ret;
+       }
+
+       ret = ppd->dd->nportcntrs * sizeof(u64);
+       *cntrp = ppd->cntrs;
+
+       for (i = 0; i < PORT_CNTR_LAST; i++) {
+               entry = &port_cntrs[i];
+               hfi1_cdbg(CNTR, "reading %s", entry->name);
+               if (entry->flags & CNTR_DISABLED) {
+                       /* Nothing */
+                       hfi1_cdbg(CNTR, "\tDisabled\n");
+                       continue;
+               }
+
+               if (!(entry->flags & CNTR_VL)) {
+                       /* a single value for the whole port */
+                       val = entry->rw_cntr(entry, ppd, CNTR_INVALID_VL,
+                                            CNTR_MODE_R, 0);
+                       ppd->cntrs[entry->offset] = val;
+                       hfi1_cdbg(CNTR, "\tRead 0x%llx", val);
+                       continue;
+               }
+
+               hfi1_cdbg(CNTR, "\tPer VL");
+               for (j = 0; j < C_VL_COUNT; j++) {
+                       val = entry->rw_cntr(entry, ppd, j, CNTR_MODE_R, 0);
+                       hfi1_cdbg(CNTR, "\t\tRead 0x%llx for %d", val, j);
+                       ppd->cntrs[entry->offset + j] = val;
+               }
+       }
+       return ret;
+}
+
+/*
+ * Stop the synthetic-stats timer (if its data field was set) and release
+ * all counter storage: the per-port value arrays and per-cpu rc_*
+ * counters, then the device-level name and value arrays.  Every freed
+ * pointer is reset to NULL.
+ */
+static void free_cntrs(struct hfi1_devdata *dd)
+{
+       /* the per-port structures are walked starting right after *dd */
+       struct hfi1_pportdata *port = (struct hfi1_pportdata *)(dd + 1);
+       int n;
+
+       if (dd->synth_stats_timer.data)
+               del_timer_sync(&dd->synth_stats_timer);
+       dd->synth_stats_timer.data = 0;
+
+       for (n = 0; n < dd->num_pports; n++, port++) {
+               kfree(port->cntrs);
+               port->cntrs = NULL;
+               kfree(port->scntrs);
+               port->scntrs = NULL;
+
+               free_percpu(port->ibport_data.rvp.rc_acks);
+               port->ibport_data.rvp.rc_acks = NULL;
+               free_percpu(port->ibport_data.rvp.rc_qacks);
+               port->ibport_data.rvp.rc_qacks = NULL;
+               free_percpu(port->ibport_data.rvp.rc_delayed_comp);
+               port->ibport_data.rvp.rc_delayed_comp = NULL;
+       }
+
+       kfree(dd->portcntrnames);
+       dd->portcntrnames = NULL;
+       kfree(dd->cntrs);
+       dd->cntrs = NULL;
+       kfree(dd->scntrs);
+       dd->scntrs = NULL;
+       kfree(dd->cntrnames);
+       dd->cntrnames = NULL;
+}
+
+#define CNTR_MAX 0xFFFFFFFFFFFFFFFFULL
+#define CNTR_32BIT_MAX 0x00000000FFFFFFFF
+
+/*
+ * Read one device or port counter and fold the raw value into its saved
+ * software copy.
+ *
+ * @dd: device, used for error reporting
+ * @entry: counter descriptor (flags, name and rw_cntr accessor)
+ * @psval: saved copy of this counter; overwritten with the value returned
+ * @context: handed through to entry->rw_cntr()
+ * @vl: VL index handed through to entry->rw_cntr(), or CNTR_INVALID_VL
+ *
+ * Returns 0 for a counter flagged CNTR_DISABLED (*psval is left alone).
+ * For CNTR_SYNTH counters the result saturates at CNTR_MAX: a saved value
+ * of CNTR_MAX is returned as is, a 32-bit counter that went backwards is
+ * treated as one hardware wrap and carried into the upper 32 bits, and a
+ * 64-bit counter that went backwards becomes CNTR_MAX.  All other
+ * counters return the raw value.
+ */
+static u64 read_dev_port_cntr(struct hfi1_devdata *dd, struct cntr_entry *entry,
+                             u64 *psval, void *context, int vl)
+{
+       u64 val;
+       u64 sval = *psval;
+
+       if (entry->flags & CNTR_DISABLED) {
+               dd_dev_err(dd, "Counter %s not enabled\n", entry->name);
+               return 0;
+       }
+
+       hfi1_cdbg(CNTR, "cntr: %s vl %d psval 0x%llx", entry->name, vl, *psval);
+
+       val = entry->rw_cntr(entry, context, vl, CNTR_MODE_R, 0);
+
+       /* If its a synthetic counter there is more work we need to do */
+       if (entry->flags & CNTR_SYNTH) {
+               if (sval == CNTR_MAX) {
+                       /* Already saturated: the value just read is ignored */
+                       return CNTR_MAX;
+               }
+
+               if (entry->flags & CNTR_32BIT) {
+                       /* 32bit counters can wrap multiple times */
+                       u64 upper = sval >> 32;
+                       u64 lower = (sval << 32) >> 32;
+
+                       if (lower > val) { /* hw wrapped */
+                               if (upper == CNTR_32BIT_MAX)
+                                       val = CNTR_MAX;
+                               else
+                                       upper++;
+                       }
+
+                       if (val != CNTR_MAX)
+                               val = (upper << 32) | val;
+
+               } else {
+                       /*
+                        * If we rolled we are saturated.  A u64 can never
+                        * exceed CNTR_MAX, so going backwards is the only
+                        * condition that needs checking.
+                        */
+                       if (val < sval)
+                               val = CNTR_MAX;
+               }
+       }
+
+       *psval = val;
+
+       hfi1_cdbg(CNTR, "\tNew val=0x%llx", val);
+
+       return val;
+}
+
+/*
+ * Write one device or port counter and update its saved software copy.
+ *
+ * @dd: device, used for error reporting
+ * @entry: counter descriptor (flags, name and rw_cntr accessor)
+ * @psval: saved copy of this counter; overwritten with the value returned
+ * @context: handed through to entry->rw_cntr()
+ * @vl: VL index handed through to entry->rw_cntr(), or CNTR_INVALID_VL
+ * @data: value to write
+ *
+ * Returns 0 for a counter flagged CNTR_DISABLED (*psval is left alone).
+ * For a counter flagged both CNTR_SYNTH and CNTR_32BIT only the low
+ * 32 bits of @data are passed to the accessor, while the full 64-bit
+ * @data is saved and returned.  Otherwise the accessor's return value is
+ * saved and returned.
+ */
+static u64 write_dev_port_cntr(struct hfi1_devdata *dd,
+                              struct cntr_entry *entry,
+                              u64 *psval, void *context, int vl, u64 data)
+{
+       u64 val;
+
+       if (entry->flags & CNTR_DISABLED) {
+               dd_dev_err(dd, "Counter %s not enabled\n", entry->name);
+               return 0;
+       }
+
+       hfi1_cdbg(CNTR, "cntr: %s vl %d psval 0x%llx", entry->name, vl, *psval);
+
+       if (entry->flags & CNTR_SYNTH) {
+               /* saved copy is set before the accessor is called */
+               *psval = data;
+               if (entry->flags & CNTR_32BIT) {
+                       /* the accessor's result is deliberately unused here */
+                       entry->rw_cntr(entry, context, vl, CNTR_MODE_W,
+                                      (data << 32) >> 32);
+                       val = data; /* return the full 64bit value */
+               } else {
+                       val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W,
+                                            data);
+               }
+       } else {
+               val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W, data);
+       }
+
+       *psval = val;
+
+       hfi1_cdbg(CNTR, "\tNew val=0x%llx", val);
+
+       return val;
+}
+
+/*
+ * Read device counter dev_cntrs[index].  Its saved copy lives in
+ * dd->scntrs at the entry's offset, plus 'vl' unless 'vl' is
+ * CNTR_INVALID_VL.  Returns the result of read_dev_port_cntr().
+ */
+u64 read_dev_cntr(struct hfi1_devdata *dd, int index, int vl)
+{
+       struct cntr_entry *entry = &dev_cntrs[index];
+       u64 *saved = dd->scntrs + entry->offset;
+
+       if (vl != CNTR_INVALID_VL)
+               saved += vl;
+
+       return read_dev_port_cntr(dd, entry, saved, dd, vl);
+}
+
+/*
+ * Write 'data' to device counter dev_cntrs[index].  Its saved copy lives
+ * in dd->scntrs at the entry's offset, plus 'vl' unless 'vl' is
+ * CNTR_INVALID_VL.  Returns the result of write_dev_port_cntr().
+ */
+u64 write_dev_cntr(struct hfi1_devdata *dd, int index, int vl, u64 data)
+{
+       struct cntr_entry *entry = &dev_cntrs[index];
+       u64 *saved = dd->scntrs + entry->offset;
+
+       if (vl != CNTR_INVALID_VL)
+               saved += vl;
+
+       return write_dev_port_cntr(dd, entry, saved, dd, vl, data);
+}
+
+/*
+ * Read port counter port_cntrs[index].  Indices in the C_RCV_HDR_OVF
+ * range at or beyond C_RCV_HDR_OVF_FIRST + num_rcv_contexts are not read
+ * and yield 0.  Otherwise the saved copy lives in ppd->scntrs at the
+ * entry's offset, plus 'vl' unless 'vl' is CNTR_INVALID_VL, and the
+ * result of read_dev_port_cntr() is returned.
+ */
+u64 read_port_cntr(struct hfi1_pportdata *ppd, int index, int vl)
+{
+       struct cntr_entry *entry = &port_cntrs[index];
+       u64 *saved;
+
+       /* We do not want to bother for disabled contexts */
+       if (index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts &&
+           index <= C_RCV_HDR_OVF_LAST)
+               return 0;
+
+       saved = ppd->scntrs + entry->offset;
+       if (vl != CNTR_INVALID_VL)
+               saved += vl;
+
+       return read_dev_port_cntr(ppd->dd, entry, saved, ppd, vl);
+}
+
+/*
+ * Write 'data' to port counter port_cntrs[index].  Indices in the
+ * C_RCV_HDR_OVF range at or beyond C_RCV_HDR_OVF_FIRST + num_rcv_contexts
+ * are not written and yield 0.  Otherwise the saved copy lives in
+ * ppd->scntrs at the entry's offset, plus 'vl' unless 'vl' is
+ * CNTR_INVALID_VL, and the result of write_dev_port_cntr() is returned.
+ */
+u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data)
+{
+       struct cntr_entry *entry = &port_cntrs[index];
+       u64 *saved;
+
+       /* We do not want to bother for disabled contexts */
+       if (index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts &&
+           index <= C_RCV_HDR_OVF_LAST)
+               return 0;
+
+       saved = ppd->scntrs + entry->offset;
+       if (vl != CNTR_INVALID_VL)
+               saved += vl;
+
+       return write_dev_port_cntr(ppd->dd, entry, saved, ppd, vl, data);
+}
+
+/*
+ * Timer callback that keeps the saved (synthetic) counters current.
+ * 'opaque' is the struct hfi1_devdata pointer given at timer setup.
+ *
+ * Two "tripwire" counters (received and transmitted flits) are sampled.
+ * When either went backwards, or their combined advance since the last
+ * refresh reaches CNTR_32BIT_MAX, every device and port counter is read
+ * through read_dev_cntr()/read_port_cntr() and the tripwire baselines
+ * are re-sampled.  The timer is re-armed in all cases.
+ */
+static void update_synth_timer(unsigned long opaque)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)opaque;
+       struct hfi1_pportdata *ppd;
+       struct cntr_entry *entry;
+       u64 rx_now;
+       u64 tx_now;
+       u64 flits;
+       int refresh;
+       int i, j, vl;
+
+       /*
+        * Rather than keep beating on the CSRs pick a minimal set that we can
+        * check to watch for potential roll over. We can do this by looking at
+        * the number of flits sent/recv. If the total flits exceeds 32bits then
+        * we have to iterate all the counters and update.
+        */
+       entry = &dev_cntrs[C_DC_RCV_FLITS];
+       rx_now = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0);
+
+       entry = &dev_cntrs[C_DC_XMIT_FLITS];
+       tx_now = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0);
+
+       hfi1_cdbg(CNTR,
+                 "[%d] curr tx=0x%llx rx=0x%llx :: last tx=0x%llx rx=0x%llx\n",
+                 dd->unit, tx_now, rx_now, dd->last_tx, dd->last_rx);
+
+       refresh = (tx_now < dd->last_tx) || (rx_now < dd->last_rx);
+       if (refresh) {
+               /*
+                * A refresh may not be strictly necessary here, but it does
+                * no harm and keeps the logic simple.
+                */
+               hfi1_cdbg(CNTR, "[%d] Tripwire counter rolled, updating",
+                         dd->unit);
+       } else {
+               flits = (tx_now - dd->last_tx) + (rx_now - dd->last_rx);
+               hfi1_cdbg(CNTR,
+                         "[%d] total flits 0x%llx limit 0x%llx\n", dd->unit,
+                         flits, (u64)CNTR_32BIT_MAX);
+               if (flits >= CNTR_32BIT_MAX) {
+                       hfi1_cdbg(CNTR, "[%d] 32bit limit hit, updating",
+                                 dd->unit);
+                       refresh = 1;
+               }
+       }
+
+       if (!refresh) {
+               hfi1_cdbg(CNTR, "[%d] No update necessary", dd->unit);
+       } else {
+               hfi1_cdbg(CNTR, "[%d] Updating dd and ppd counters", dd->unit);
+
+               /* device counters: one read per VL for per-VL entries */
+               for (i = 0; i < DEV_CNTR_LAST; i++) {
+                       entry = &dev_cntrs[i];
+                       if (!(entry->flags & CNTR_VL)) {
+                               read_dev_cntr(dd, i, CNTR_INVALID_VL);
+                               continue;
+                       }
+                       for (vl = 0; vl < C_VL_COUNT; vl++)
+                               read_dev_cntr(dd, i, vl);
+               }
+
+               /* port counters, for each port located right after *dd */
+               ppd = (struct hfi1_pportdata *)(dd + 1);
+               for (i = 0; i < dd->num_pports; i++, ppd++) {
+                       for (j = 0; j < PORT_CNTR_LAST; j++) {
+                               entry = &port_cntrs[j];
+                               if (!(entry->flags & CNTR_VL)) {
+                                       read_port_cntr(ppd, j, CNTR_INVALID_VL);
+                                       continue;
+                               }
+                               for (vl = 0; vl < C_VL_COUNT; vl++)
+                                       read_port_cntr(ppd, j, vl);
+                       }
+               }
+
+               /*
+                * We want the value in the register. The goal is to keep track
+                * of the number of "ticks" not the counter value. In other
+                * words if the register rolls we want to notice it and go ahead
+                * and force an update.
+                */
+               entry = &dev_cntrs[C_DC_XMIT_FLITS];
+               dd->last_tx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL,
+                                            CNTR_MODE_R, 0);
+
+               entry = &dev_cntrs[C_DC_RCV_FLITS];
+               dd->last_rx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL,
+                                            CNTR_MODE_R, 0);
+
+               hfi1_cdbg(CNTR, "[%d] setting last tx/rx to 0x%llx 0x%llx",
+                         dd->unit, dd->last_tx, dd->last_rx);
+       }
+
+       /* re-arm for the next sample */
+       mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME);
+}
+
+#define C_MAX_NAME 13 /* 12 chars + one for the terminating '\0' */
+static int init_cntrs(struct hfi1_devdata *dd)
+{
+       int i, rcv_ctxts, j;
+       size_t sz;
+       char *p;
+       char name[C_MAX_NAME];
+       struct hfi1_pportdata *ppd;
+       const char *bit_type_32 = ",32";
+       const int bit_type_32_sz = strlen(bit_type_32);
+
+       /*
+        * init_cntrs() sizes and allocates the per-device and per-port
+        * counter value arrays and counter name strings that hang off dd.
+        * Returns 0 on success; every failure goes to bail, which calls
+        * free_cntrs() and returns -ENOMEM.
+        *
+        * Set up the stats timer first; it is only armed by the mod_timer()
+        * call at the end, after everything else has succeeded.
+        */
+       setup_timer(&dd->synth_stats_timer, update_synth_timer,
+                   (unsigned long)dd);
+
+       /***********************/
+       /* per device counters */
+       /***********************/
+
+       /*
+        * Sizing pass: count the enabled counters in dd->ndevcntrs, record
+        * each table entry's first index in .offset, and total in sz the
+        * bytes needed for the names.  Every name is budgeted one extra
+        * byte for the '\n' written after it; no terminating NUL is counted.
+        */
+       dd->ndevcntrs = 0;
+       sz = 0;
+
+       for (i = 0; i < DEV_CNTR_LAST; i++) {
+               if (dev_cntrs[i].flags & CNTR_DISABLED) {
+                       hfi1_dbg_early("\tSkipping %s\n", dev_cntrs[i].name);
+                       continue;
+               }
+
+               if (dev_cntrs[i].flags & CNTR_VL) {
+                       dev_cntrs[i].offset = dd->ndevcntrs;
+                       for (j = 0; j < C_VL_COUNT; j++) {
+                               snprintf(name, C_MAX_NAME, "%s%d",
+                                        dev_cntrs[i].name, vl_from_idx(j));
+                               sz += strlen(name);
+                               /* Add ",32" for 32-bit counters */
+                               if (dev_cntrs[i].flags & CNTR_32BIT)
+                                       sz += bit_type_32_sz;
+                               sz++;
+                               dd->ndevcntrs++;
+                       }
+               } else if (dev_cntrs[i].flags & CNTR_SDMA) {
+                       dev_cntrs[i].offset = dd->ndevcntrs;
+                       for (j = 0; j < dd->chip_sdma_engines; j++) {
+                               snprintf(name, C_MAX_NAME, "%s%d",
+                                        dev_cntrs[i].name, j);
+                               sz += strlen(name);
+                               /* Add ",32" for 32-bit counters */
+                               if (dev_cntrs[i].flags & CNTR_32BIT)
+                                       sz += bit_type_32_sz;
+                               sz++;
+                               dd->ndevcntrs++;
+                       }
+               } else {
+                       /* +1 for newline. */
+                       sz += strlen(dev_cntrs[i].name) + 1;
+                       /* Add ",32" for 32-bit counters */
+                       if (dev_cntrs[i].flags & CNTR_32BIT)
+                               sz += bit_type_32_sz;
+                       dev_cntrs[i].offset = dd->ndevcntrs;
+                       dd->ndevcntrs++;
+               }
+       }
+
+       /* allocate zeroed space for the counter values, one u64 per counter */
+       dd->cntrs = kcalloc(dd->ndevcntrs, sizeof(u64), GFP_KERNEL);
+       if (!dd->cntrs)
+               goto bail;
+
+       dd->scntrs = kcalloc(dd->ndevcntrs, sizeof(u64), GFP_KERNEL);
+       if (!dd->scntrs)
+               goto bail;
+
+       /*
+        * allocate space for the counter names; the buffer is exactly sz
+        * bytes and is never NUL-terminated, so its length is kept in
+        * dd->cntrnameslen
+        */
+       dd->cntrnameslen = sz;
+       dd->cntrnames = kmalloc(sz, GFP_KERNEL);
+       if (!dd->cntrnames)
+               goto bail;
+
+       /*
+        * Fill pass: this has to emit exactly what the sizing pass counted
+        * (same snprintf() format and C_MAX_NAME truncation, same ",32"
+        * suffix, one '\n' per name) because p is advanced through the
+        * buffer without any bounds check.
+        */
+       for (p = dd->cntrnames, i = 0; i < DEV_CNTR_LAST; i++) {
+               if (dev_cntrs[i].flags & CNTR_DISABLED) {
+                       /* Nothing: disabled counters were not sized either */
+               } else if (dev_cntrs[i].flags & CNTR_VL) {
+                       for (j = 0; j < C_VL_COUNT; j++) {
+                               snprintf(name, C_MAX_NAME, "%s%d",
+                                        dev_cntrs[i].name,
+                                        vl_from_idx(j));
+                               memcpy(p, name, strlen(name));
+                               p += strlen(name);
+
+                               /* Counter is 32 bits */
+                               if (dev_cntrs[i].flags & CNTR_32BIT) {
+                                       memcpy(p, bit_type_32, bit_type_32_sz);
+                                       p += bit_type_32_sz;
+                               }
+
+                               *p++ = '\n';
+                       }
+               } else if (dev_cntrs[i].flags & CNTR_SDMA) {
+                       for (j = 0; j < dd->chip_sdma_engines; j++) {
+                               snprintf(name, C_MAX_NAME, "%s%d",
+                                        dev_cntrs[i].name, j);
+                               memcpy(p, name, strlen(name));
+                               p += strlen(name);
+
+                               /* Counter is 32 bits */
+                               if (dev_cntrs[i].flags & CNTR_32BIT) {
+                                       memcpy(p, bit_type_32, bit_type_32_sz);
+                                       p += bit_type_32_sz;
+                               }
+
+                               *p++ = '\n';
+                       }
+               } else {
+                       memcpy(p, dev_cntrs[i].name, strlen(dev_cntrs[i].name));
+                       p += strlen(dev_cntrs[i].name);
+
+                       /* Counter is 32 bits */
+                       if (dev_cntrs[i].flags & CNTR_32BIT) {
+                               memcpy(p, bit_type_32, bit_type_32_sz);
+                               p += bit_type_32_sz;
+                       }
+
+                       *p++ = '\n';
+               }
+       }
+
+       /*********************/
+       /* per port counters */
+       /*********************/
+
+       /*
+        * Go through the counters for the overflows and disable the ones we
+        * don't need. This varies based on platform so we need to do it
+        * dynamically here.  Entries from C_RCV_HDR_OVF_FIRST +
+        * dd->num_rcv_contexts through C_RCV_HDR_OVF_LAST get CNTR_DISABLED
+        * set in the port_cntrs table itself.
+        */
+       rcv_ctxts = dd->num_rcv_contexts;
+       for (i = C_RCV_HDR_OVF_FIRST + rcv_ctxts;
+            i <= C_RCV_HDR_OVF_LAST; i++) {
+               port_cntrs[i].flags |= CNTR_DISABLED;
+       }
+
+       /*
+        * Sizing pass for the port counters: same accounting as for the
+        * device counters (name bytes, optional ",32", one '\n'), with the
+        * count kept in dd->nportcntrs.  There is no CNTR_SDMA case here.
+        */
+       sz = 0;
+       dd->nportcntrs = 0;
+       for (i = 0; i < PORT_CNTR_LAST; i++) {
+               if (port_cntrs[i].flags & CNTR_DISABLED) {
+                       hfi1_dbg_early("\tSkipping %s\n", port_cntrs[i].name);
+                       continue;
+               }
+
+               if (port_cntrs[i].flags & CNTR_VL) {
+                       port_cntrs[i].offset = dd->nportcntrs;
+                       for (j = 0; j < C_VL_COUNT; j++) {
+                               snprintf(name, C_MAX_NAME, "%s%d",
+                                        port_cntrs[i].name, vl_from_idx(j));
+                               sz += strlen(name);
+                               /* Add ",32" for 32-bit counters */
+                               if (port_cntrs[i].flags & CNTR_32BIT)
+                                       sz += bit_type_32_sz;
+                               sz++;
+                               dd->nportcntrs++;
+                       }
+               } else {
+                       /* +1 for newline */
+                       sz += strlen(port_cntrs[i].name) + 1;
+                       /* Add ",32" for 32-bit counters */
+                       if (port_cntrs[i].flags & CNTR_32BIT)
+                               sz += bit_type_32_sz;
+                       port_cntrs[i].offset = dd->nportcntrs;
+                       dd->nportcntrs++;
+               }
+       }
+
+       /*
+        * allocate space for the port counter names; exactly sz bytes, not
+        * NUL-terminated, length kept in dd->portcntrnameslen
+        */
+       dd->portcntrnameslen = sz;
+       dd->portcntrnames = kmalloc(sz, GFP_KERNEL);
+       if (!dd->portcntrnames)
+               goto bail;
+
+       /*
+        * fill in port cntr names; as with the device names, this must
+        * write exactly the bytes the sizing pass above accounted for
+        */
+       for (p = dd->portcntrnames, i = 0; i < PORT_CNTR_LAST; i++) {
+               if (port_cntrs[i].flags & CNTR_DISABLED)
+                       continue;
+
+               if (port_cntrs[i].flags & CNTR_VL) {
+                       for (j = 0; j < C_VL_COUNT; j++) {
+                               snprintf(name, C_MAX_NAME, "%s%d",
+                                        port_cntrs[i].name, vl_from_idx(j));
+                               memcpy(p, name, strlen(name));
+                               p += strlen(name);
+
+                               /* Counter is 32 bits */
+                               if (port_cntrs[i].flags & CNTR_32BIT) {
+                                       memcpy(p, bit_type_32, bit_type_32_sz);
+                                       p += bit_type_32_sz;
+                               }
+
+                               *p++ = '\n';
+                       }
+               } else {
+                       memcpy(p, port_cntrs[i].name,
+                              strlen(port_cntrs[i].name));
+                       p += strlen(port_cntrs[i].name);
+
+                       /* Counter is 32 bits */
+                       if (port_cntrs[i].flags & CNTR_32BIT) {
+                               memcpy(p, bit_type_32, bit_type_32_sz);
+                               p += bit_type_32_sz;
+                       }
+
+                       *p++ = '\n';
+               }
+       }
+
+       /*
+        * allocate per port storage for counter values; the port structures
+        * are reached by treating the memory right after *dd (dd + 1) as an
+        * array of dd->num_pports hfi1_pportdata entries
+        */
+       ppd = (struct hfi1_pportdata *)(dd + 1);
+       for (i = 0; i < dd->num_pports; i++, ppd++) {
+               ppd->cntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL);
+               if (!ppd->cntrs)
+                       goto bail;
+
+               ppd->scntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL);
+               if (!ppd->scntrs)
+                       goto bail;
+       }
+
+       /*
+        * CPU counters need to be allocated and zeroed; a nonzero return
+        * from init_cpu_counters() is reported as -ENOMEM like the rest
+        */
+       if (init_cpu_counters(dd))
+               goto bail;
+
+       mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME);
+       return 0;
+bail:
+       free_cntrs(dd);
+       return -ENOMEM;
+}
+
+/*
+ * Translate a chip logical link state into the matching IB_PORT_* value.
+ * LSTATE_DOWN, and any value that is not one of the LSTATE_* cases below,
+ * yields IB_PORT_DOWN; the unrecognized case is logged first.
+ */
+static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate)
+{
+       switch (chip_lstate) {
+       case LSTATE_INIT:
+               return IB_PORT_INIT;
+       case LSTATE_ARMED:
+               return IB_PORT_ARMED;
+       case LSTATE_ACTIVE:
+               return IB_PORT_ACTIVE;
+       case LSTATE_DOWN:
+               break;
+       default:
+               dd_dev_err(dd,
+                          "Unknown logical state 0x%x, reporting IB_PORT_DOWN\n",
+                          chip_lstate);
+               break;
+       }
+
+       /* both the DOWN and the unknown case report the port as down */
+       return IB_PORT_DOWN;
+}
+
+/*
+ * Translate a chip physical link state into an IB/OPA physical port state.
+ * Only bits 7:4 of @chip_pstate (the HFI meta-state) take part in the
+ * decision.  PLS_DISABLED and any unrecognized meta-state yield
+ * IB_PORTPHYSSTATE_DISABLED; the unrecognized case is logged with the full
+ * @chip_pstate value first.
+ */
+u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate)
+{
+       const u32 meta_state = chip_pstate & 0xf0;
+
+       switch (meta_state) {
+       case PLS_OFFLINE:
+               return OPA_PORTPHYSSTATE_OFFLINE;
+       case PLS_POLLING:
+               return IB_PORTPHYSSTATE_POLLING;
+       case PLS_CONFIGPHY:
+               return IB_PORTPHYSSTATE_TRAINING;
+       case PLS_LINKUP:
+               return IB_PORTPHYSSTATE_LINKUP;
+       case PLS_PHYTEST:
+               return IB_PORTPHYSSTATE_PHY_TEST;
+       case PLS_DISABLED:
+               break;
+       default:
+               dd_dev_err(dd, "Unexpected chip physical state of 0x%x\n",
+                          chip_pstate);
+               break;
+       }
+
+       return IB_PORTPHYSSTATE_DISABLED;
+}
+
+/*
+ * Return a printable name for an OPA port logical state, or "unknown" when
+ * @lstate is beyond the end of the name table.
+ */
+const char *opa_lstate_name(u32 lstate)
+{
+       static const char * const lstate_names[] = {
+               [0] = "PORT_NOP",
+               [1] = "PORT_DOWN",
+               [2] = "PORT_INIT",
+               [3] = "PORT_ARMED",
+               [4] = "PORT_ACTIVE",
+               [5] = "PORT_ACTIVE_DEFER",
+       };
+
+       if (lstate >= ARRAY_SIZE(lstate_names))
+               return "unknown";
+
+       return lstate_names[lstate];
+}
+
+/*
+ * Return a printable name for an OPA port physical state, or "unknown" when
+ * @pstate is beyond the end of the name table.
+ */
+const char *opa_pstate_name(u32 pstate)
+{
+       static const char * const pstate_names[] = {
+               [0]  = "PHYS_NOP",
+               [1]  = "reserved1",
+               [2]  = "PHYS_POLL",
+               [3]  = "PHYS_DISABLED",
+               [4]  = "PHYS_TRAINING",
+               [5]  = "PHYS_LINKUP",
+               [6]  = "PHYS_LINK_ERR_RECOVER",
+               [7]  = "PHYS_PHY_TEST",
+               [8]  = "reserved8",
+               [9]  = "PHYS_OFFLINE",
+               [10] = "PHYS_GANGED",
+               [11] = "PHYS_TEST",
+       };
+
+       if (pstate >= ARRAY_SIZE(pstate_names))
+               return "unknown";
+
+       return pstate_names[pstate];
+}
+
+/*
+ * Sample the hardware logical link state, refresh the cached copy in
+ * ppd->lstate (logging when it differs), mirror the state into the status
+ * word at ppd->statusp when that pointer is set, and return the cached
+ * value.
+ */
+u32 get_logical_state(struct hfi1_pportdata *ppd)
+{
+       u32 hw_lstate;
+
+       hw_lstate = chip_to_opa_lstate(ppd->dd, read_logical_state(ppd->dd));
+       if (ppd->lstate != hw_lstate) {
+               dd_dev_info(ppd->dd, "logical state changed to %s (0x%x)\n",
+                           opa_lstate_name(hw_lstate), hw_lstate);
+               ppd->lstate = hw_lstate;
+       }
+
+       /* no status word to maintain */
+       if (!ppd->statusp)
+               return ppd->lstate;
+
+       /*
+        * The flags are rewritten on every call, not only when a change was
+        * detected above: the cached lstate may have been modified
+        * explicitly elsewhere, and this is the one function all state
+        * handling code goes through.
+        */
+       switch (ppd->lstate) {
+       case IB_PORT_ACTIVE:
+               *ppd->statusp |= HFI1_STATUS_IB_READY;
+               break;
+       case IB_PORT_ARMED:
+               *ppd->statusp |= HFI1_STATUS_IB_CONF;
+               break;
+       case IB_PORT_INIT:
+       case IB_PORT_DOWN:
+               *ppd->statusp &= ~(HFI1_STATUS_IB_CONF |
+                                  HFI1_STATUS_IB_READY);
+               break;
+       }
+
+       return ppd->lstate;
+}
+
+/**
+ * wait_logical_linkstate - poll until the logical link state matches
+ * @ppd: port device
+ * @state: the logical state to wait for
+ * @msecs: how long to keep polling, in milliseconds
+ *
+ * Polls get_logical_state() with a 20 ms sleep between attempts.  The
+ * state is always sampled before the deadline is tested, so at least one
+ * sample is taken regardless of @msecs.
+ *
+ * Returns 0 if @state was observed, otherwise logs an error and returns
+ * -ETIMEDOUT.
+ */
+static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
+                                 int msecs)
+{
+       const unsigned long deadline = jiffies + msecs_to_jiffies(msecs);
+
+       while (get_logical_state(ppd) != state) {
+               if (time_after(jiffies, deadline)) {
+                       dd_dev_err(ppd->dd,
+                                  "timeout waiting for link state 0x%x\n",
+                                  state);
+                       return -ETIMEDOUT;
+               }
+               msleep(20);
+       }
+
+       return 0;
+}
+
+/*
+ * Read the chip physical link state, convert it with chip_to_opa_pstate()
+ * and return the converted value.  When the converted value differs from
+ * ppd->last_pstate the change is logged and the cached value is updated.
+ */
+u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd)
+{
+       u32 chip_pstate = read_physical_state(ppd->dd);
+       u32 opa_pstate = chip_to_opa_pstate(ppd->dd, chip_pstate);
+
+       if (opa_pstate == ppd->last_pstate)
+               return opa_pstate;
+
+       dd_dev_info(ppd->dd,
+                   "%s: physical state changed to %s (0x%x), phy 0x%x\n",
+                   __func__, opa_pstate_name(opa_pstate), opa_pstate,
+                   chip_pstate);
+       ppd->last_pstate = opa_pstate;
+
+       return opa_pstate;
+}
+
+/*
+ * Update the output-enable bits of one QSFP GPIO block and sample its
+ * input register.
+ *
+ * target: zero selects the ASIC_QSFP1_* CSRs, nonzero the ASIC_QSFP2_* CSRs
+ * data: masked with @mask, but not otherwise used by this function
+ * dir: per-bit direction to program, 0 for read, 1 for write
+ * mask: selects the bits to modify, e.g.
+ *      I2CCLK  (bit 0)
+ *      I2CDATA (bit 1)
+ *     A zero mask skips the read/modify/write and only samples the inputs.
+ *
+ * Returns the current contents of the selected IN register.
+ */
+u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir,
+                 u32 mask)
+{
+       const u64 oe_csr = target ? ASIC_QSFP2_OE : ASIC_QSFP1_OE;
+
+       if (mask) {
+               u64 oe_bits;
+
+               /* confine the caller's values to the selected bits */
+               dir &= mask;
+               data &= mask;
+
+               /* read/modify/write of the output-enable register */
+               oe_bits = read_csr(dd, oe_csr);
+               oe_bits &= ~(u64)mask;
+               oe_bits |= (u64)dir;
+               write_csr(dd, oe_csr, oe_bits);
+       }
+
+       /*
+        * The value sampled here is unlikely to be valid for a pin whose
+        * direction was changed in this same call; a caller that wants to
+        * read such a pin should call this function again.
+        */
+       return read_csr(dd, target ? ASIC_QSFP2_IN : ASIC_QSFP1_IN);
+}
+
+/* Clear the DISALLOW_PBC_STATIC_RATE_CONTROL bit in check-enable value r */
+#define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
+((r) &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
+
+/* Set the DISALLOW_PBC_STATIC_RATE_CONTROL bit in check-enable value r */
+#define SET_STATIC_RATE_CONTROL_SMASK(r) \
+((r) |= SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
+
+/*
+ * Program the static-rate-control check of a send context.
+ *
+ * The STATIC_RATE_CTRL capability is looked up in the user capability set
+ * for SC_USER contexts and in the kernel set for every other type.  When
+ * the capability is set the "disallow" bit in SEND_CTXT_CHECK_ENABLE is
+ * cleared, otherwise it is set.  A NULL @sc is accepted and ignored.
+ *
+ * Always returns 0.
+ */
+int hfi1_init_ctxt(struct send_context *sc)
+{
+       struct hfi1_devdata *dd;
+       u64 check_enable;
+       u8 cap_set;
+
+       if (!sc)
+               return 0;
+
+       dd = sc->dd;
+       if (sc->type == SC_USER)
+               cap_set = HFI1_CAP_IS_USET(STATIC_RATE_CTRL);
+       else
+               cap_set = HFI1_CAP_IS_KSET(STATIC_RATE_CTRL);
+
+       check_enable = read_kctxt_csr(dd, sc->hw_context,
+                                     SEND_CTXT_CHECK_ENABLE);
+       if (cap_set)
+               CLEAR_STATIC_RATE_CONTROL_SMASK(check_enable);
+       else
+               SET_STATIC_RATE_CONTROL_SMASK(check_enable);
+       write_kctxt_csr(dd, sc->hw_context,
+                       SEND_CTXT_CHECK_ENABLE, check_enable);
+
+       return 0;
+}
+
+/*
+ * Decode the ASIC_STS_THERM register into @temp.
+ *
+ * Returns -EINVAL without touching @temp when dd->icode is not
+ * ICODE_RTL_SILICON (optionally logging, under the PRINT_UNIMPL kernel
+ * capability); returns 0 after filling every field of @temp otherwise.
+ */
+int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp)
+{
+       u64 therm;
+
+       if (dd->icode != ICODE_RTL_SILICON) {
+               if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
+                       dd_dev_info(dd, "%s: tempsense not supported by HW\n",
+                                   __func__);
+               return -EINVAL;
+       }
+
+       therm = read_csr(dd, ASIC_STS_THERM);
+
+       temp->curr = (therm >> ASIC_STS_THERM_CURR_TEMP_SHIFT) &
+                    ASIC_STS_THERM_CURR_TEMP_MASK;
+       temp->lo_lim = (therm >> ASIC_STS_THERM_LO_TEMP_SHIFT) &
+                      ASIC_STS_THERM_LO_TEMP_MASK;
+       temp->hi_lim = (therm >> ASIC_STS_THERM_HI_TEMP_SHIFT) &
+                      ASIC_STS_THERM_HI_TEMP_MASK;
+       temp->crit_lim = (therm >> ASIC_STS_THERM_CRIT_TEMP_SHIFT) &
+                        ASIC_STS_THERM_CRIT_TEMP_MASK;
+       /* three trigger bits, one per trigger, starting at the LOW shift */
+       temp->triggers = (u8)((therm >> ASIC_STS_THERM_LOW_SHIFT) & 0x7);
+
+       return 0;
+}
+
+/* ========================================================================= */
+
+/*
+ * Allow (@enable nonzero) or block (@enable zero) interrupt delivery from
+ * the chip by writing every CCE_INT_MASK CSR.  A mask bit of 1 lets the
+ * corresponding interrupt through.  When enabling, init_qsfp_int() is
+ * called after the masks have been written.
+ */
+void set_intr_state(struct hfi1_devdata *dd, u32 enable)
+{
+       const u64 mask = enable ? ~(u64)0 : 0ull;
+       int csr;
+
+       for (csr = 0; csr < CCE_NUM_INT_CSRS; csr++)
+               write_csr(dd, CCE_INT_MASK + (8 * csr), mask);
+
+       if (enable)
+               init_qsfp_int(dd);
+}
+
+/*
+ * Acknowledge every interrupt and error source on the chip by writing
+ * all-ones to each clear register: the CCE interrupt clear CSRs, the
+ * block-level error clear CSRs, the per-send-context and per-SDMA-engine
+ * error clear CSRs, and finally the DCC/LCB/8051 error clear CSRs.
+ */
+static void clear_all_interrupts(struct hfi1_devdata *dd)
+{
+       const u64 all_bits = ~(u64)0;
+       int i;
+
+       for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+               write_csr(dd, CCE_INT_CLEAR + (8 * i), all_bits);
+
+       /* block-level error status */
+       write_csr(dd, CCE_ERR_CLEAR, all_bits);
+       write_csr(dd, MISC_ERR_CLEAR, all_bits);
+       write_csr(dd, RCV_ERR_CLEAR, all_bits);
+       write_csr(dd, SEND_ERR_CLEAR, all_bits);
+       write_csr(dd, SEND_PIO_ERR_CLEAR, all_bits);
+       write_csr(dd, SEND_DMA_ERR_CLEAR, all_bits);
+       write_csr(dd, SEND_EGRESS_ERR_CLEAR, all_bits);
+
+       /* per send context, then per SDMA engine */
+       for (i = 0; i < dd->chip_send_contexts; i++)
+               write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, all_bits);
+       for (i = 0; i < dd->chip_sdma_engines; i++)
+               write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, all_bits);
+
+       write_csr(dd, DCC_ERR_FLG_CLR, all_bits);
+       write_csr(dd, DC_LCB_ERR_CLR, all_bits);
+       write_csr(dd, DC_DC8051_ERR_CLR, all_bits);
+}
+
+/*
+ * Thin wrapper that calls pci_intx(pdev, 0), i.e. asks the PCI core to
+ * turn INTx off for @pdev.  Open question from the original author: move
+ * to pcie.c?
+ */
+static void disable_intx(struct pci_dev *pdev)
+{
+       pci_intx(pdev, 0);
+}
+
+/*
+ * Undo the interrupt setup for @dd.
+ *
+ * In MSI-X mode (dd->num_msix_entries nonzero) every entry that has a
+ * handler argument recorded gets its affinity released and its IRQ freed,
+ * then MSI-X is disabled.  Otherwise the INTx IRQ is freed if it was
+ * requested and INTx is turned off.  In both modes the IRQs are released
+ * before the interrupt mechanism itself is disabled.  Finally the MSI-X
+ * entry array is freed and the bookkeeping in @dd is reset.
+ */
+static void clean_up_interrupts(struct hfi1_devdata *dd)
+{
+       if (dd->num_msix_entries) {
+               /* MSI-X */
+               struct hfi1_msix_entry *me;
+               int idx;
+
+               for (idx = 0; idx < dd->num_msix_entries; idx++) {
+                       me = &dd->msix_entries[idx];
+                       /* no arg means no irq was requested, no affinity */
+                       if (!me->arg)
+                               continue;
+                       hfi1_put_irq_affinity(dd, me);
+                       free_irq(me->msix.vector, me->arg);
+               }
+
+               pci_disable_msix(dd->pcidev);
+       } else {
+               /* INTx */
+               if (dd->requested_intx_irq) {
+                       free_irq(dd->pcidev->irq, dd);
+                       dd->requested_intx_irq = 0;
+               }
+
+               disable_intx(dd->pcidev);
+       }
+
+       /* release the entry table and forget about it */
+       kfree(dd->msix_entries);
+       dd->msix_entries = NULL;
+       dd->num_msix_entries = 0;
+}
+
+/*
+ * Route chip interrupt source @isrc to MSI-X interrupt @msix_intr.
+ *
+ * Two things are updated: the source's bit is removed from dd->gi_mask
+ * (64 sources per word) so the general handler no longer claims it, and
+ * the source's byte in the CCE_INT_MAP CSRs (8 sources per 64-bit CSR,
+ * CSRs 8 bytes apart) is rewritten with the low 8 bits of @msix_intr.
+ */
+static void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr)
+{
+       const int gi_word = isrc / 64;
+       const int gi_bit = isrc % 64;
+       const int map_csr = isrc / 8;
+       const int map_shift = 8 * (isrc % 8);
+       u64 map;
+
+       /* the general interrupt handler stops handling this source */
+       dd->gi_mask[gi_word] &= ~((u64)1 << gi_bit);
+
+       /* replace this source's byte in its map CSR */
+       map = read_csr(dd, CCE_INT_MAP + (8 * map_csr));
+       map &= ~((u64)0xff << map_shift);
+       map |= ((u64)msix_intr & 0xff) << map_shift;
+       write_csr(dd, CCE_INT_MAP + (8 * map_csr), map);
+}
+
+/*
+ * Route all three interrupt sources of SDMA engine @engine to MSI-X
+ * interrupt @msix_intr.
+ *
+ * The chip groups the SDMA sources by type rather than by engine, each
+ * type occupying TXE_NUM_SDMA_ENGINES consecutive sources starting at
+ * IS_SDMA_START, in this order:
+ *      SDMA
+ *      SDMAProgress
+ *      SDMAIdle
+ */
+static void remap_sdma_interrupts(struct hfi1_devdata *dd,
+                                 int engine, int msix_intr)
+{
+       int type;
+
+       for (type = 0; type < 3; type++)
+               remap_intr(dd,
+                          IS_SDMA_START + type * TXE_NUM_SDMA_ENGINES + engine,
+                          msix_intr);
+}
+
+/*
+ * Request the shared INTx interrupt of the PCI device with
+ * general_interrupt() as handler and @dd as cookie.  On success
+ * dd->requested_intx_irq is set so that cleanup knows to free it.
+ *
+ * Returns 0 on success or the request_irq() error, which is also logged.
+ */
+static int request_intx_irq(struct hfi1_devdata *dd)
+{
+       int err;
+
+       snprintf(dd->intx_name, sizeof(dd->intx_name), DRIVER_NAME "_%d",
+                dd->unit);
+
+       err = request_irq(dd->pcidev->irq, general_interrupt,
+                         IRQF_SHARED, dd->intx_name, dd);
+       if (err) {
+               dd_dev_err(dd, "unable to request INTx interrupt, err %d\n",
+                          err);
+               return err;
+       }
+
+       dd->requested_intx_irq = 1;
+       return 0;
+}
+
+/*
+ * Request one IRQ per MSI-X entry in dd->msix_entries and steer the chip
+ * interrupt sources to them.
+ *
+ * The entries are used in three consecutive ranges:
+ *      [0, 1)                          the general ("slow path") interrupt
+ *      [1, 1 + num_sdma)               one per SDMA engine
+ *      [.., .. + n_krcv_queues)        one per kernel receive context
+ * Entries beyond those ranges are reported and ignored, as are receive
+ * entries whose context pointer is NULL.
+ *
+ * Returns 0 once every applicable IRQ has been requested, or the error
+ * from request_threaded_irq() on the first failure.  IRQs requested before
+ * a failure are left in place, with me->arg recorded for each so that the
+ * caller's cleanup can find and free them.
+ *
+ * A failure to set IRQ affinity is logged but never fails the function.
+ * Previously the affinity result was left in the variable that is returned
+ * at the end, so a pinning error on the last processed vector - and only
+ * that one - was reported to the caller as a fatal error.
+ */
+static int request_msix_irqs(struct hfi1_devdata *dd)
+{
+       int first_general, last_general;
+       int first_sdma, last_sdma;
+       int first_rx, last_rx;
+       int i, ret;
+
+       /* calculate the ranges we are going to use */
+       first_general = 0;
+       last_general = first_general + 1;
+       first_sdma = last_general;
+       last_sdma = first_sdma + dd->num_sdma;
+       first_rx = last_sdma;
+       last_rx = first_rx + dd->n_krcv_queues;
+
+       /*
+        * Sanity check - the code expects all SDMA chip source
+        * interrupts to be in the same CSR, starting at bit 0.  Verify
+        * that this is true by checking the bit location of the start.
+        */
+       BUILD_BUG_ON(IS_SDMA_START % 64);
+
+       for (i = 0; i < dd->num_msix_entries; i++) {
+               struct hfi1_msix_entry *me = &dd->msix_entries[i];
+               const char *err_info;
+               irq_handler_t handler;
+               irq_handler_t thread = NULL;
+               void *arg;
+               int idx;
+               struct hfi1_ctxtdata *rcd = NULL;
+               struct sdma_engine *sde = NULL;
+
+               /* obtain the arguments to request_irq */
+               if (first_general <= i && i < last_general) {
+                       idx = i - first_general;
+                       handler = general_interrupt;
+                       arg = dd;
+                       snprintf(me->name, sizeof(me->name),
+                                DRIVER_NAME "_%d", dd->unit);
+                       err_info = "general";
+                       me->type = IRQ_GENERAL;
+               } else if (first_sdma <= i && i < last_sdma) {
+                       idx = i - first_sdma;
+                       sde = &dd->per_sdma[idx];
+                       handler = sdma_interrupt;
+                       arg = sde;
+                       snprintf(me->name, sizeof(me->name),
+                                DRIVER_NAME "_%d sdma%d", dd->unit, idx);
+                       err_info = "sdma";
+                       remap_sdma_interrupts(dd, idx, i);
+                       me->type = IRQ_SDMA;
+               } else if (first_rx <= i && i < last_rx) {
+                       idx = i - first_rx;
+                       rcd = dd->rcd[idx];
+                       /* no interrupt if no rcd */
+                       if (!rcd)
+                               continue;
+                       /*
+                        * Set the interrupt register and mask for this
+                        * context's interrupt.
+                        */
+                       rcd->ireg = (IS_RCVAVAIL_START + idx) / 64;
+                       rcd->imask = ((u64)1) <<
+                                       ((IS_RCVAVAIL_START + idx) % 64);
+                       handler = receive_context_interrupt;
+                       thread = receive_context_thread;
+                       arg = rcd;
+                       snprintf(me->name, sizeof(me->name),
+                                DRIVER_NAME "_%d kctxt%d", dd->unit, idx);
+                       err_info = "receive context";
+                       remap_intr(dd, IS_RCVAVAIL_START + idx, i);
+                       me->type = IRQ_RCVCTXT;
+               } else {
+                       /* not in our expected range - complain, then
+                        * ignore it
+                        */
+                       dd_dev_err(dd,
+                                  "Unexpected extra MSI-X interrupt %d\n", i);
+                       continue;
+               }
+               /* no argument, no interrupt */
+               if (!arg)
+                       continue;
+               /* make sure the name is terminated */
+               me->name[sizeof(me->name) - 1] = 0;
+
+               ret = request_threaded_irq(me->msix.vector, handler, thread, 0,
+                                          me->name, arg);
+               if (ret) {
+                       dd_dev_err(dd,
+                                  "unable to allocate %s interrupt, vector %d, index %d, err %d\n",
+                                  err_info, me->msix.vector, idx, ret);
+                       return ret;
+               }
+               /*
+                * assign arg after request_irq call, so it will be
+                * cleaned up
+                */
+               me->arg = arg;
+
+               /*
+                * Pinning is best effort: report a failure and carry on
+                * with the remaining vectors.  Note the value printed is
+                * the error code, not an IRQ number.
+                */
+               ret = hfi1_get_irq_affinity(dd, me);
+               if (ret)
+                       dd_dev_err(dd,
+                                  "unable to pin IRQ %d\n", ret);
+       }
+
+       /* every request succeeded; affinity errors are not fatal */
+       return 0;
+}
+
+/*
+ * Return interrupt routing to its default: the general handler claims
+ * every source (all bits set in each dd->gi_mask word) and every
+ * CCE_INT_MAP CSR is zeroed so that all chip sources map to MSI-X 0.
+ */
+static void reset_interrupts(struct hfi1_devdata *dd)
+{
+       int word, csr;
+
+       /* general handler takes everything */
+       for (word = 0; word < CCE_NUM_INT_CSRS; word++)
+               dd->gi_mask[word] = ~(u64)0;
+
+       /* a map value of 0 selects MSI-X interrupt 0 */
+       for (csr = 0; csr < CCE_NUM_INT_MAP_CSRS; csr++)
+               write_csr(dd, CCE_INT_MAP + (8 * csr), 0);
+}
+
+/*
+ * Allocate and request the interrupts for @dd.
+ *
+ * The number of MSI-X vectors wanted is
+ *      1                  general, "slow path" interrupt (includes the
+ *                         SDMA engines slow source, SDMACleanupDone)
+ *      dd->num_sdma       one per used SDMA engine
+ *      dd->n_krcv_queues  one per kernel receive context
+ *
+ * request_msix() reports through its count argument how many vectors were
+ * granted: zero selects the single INTx interrupt, the full count selects
+ * MSI-X, and anything in between is rejected with -EINVAL.  Before any
+ * handler is requested all chip interrupts are masked, pending ones are
+ * cleared and the routing is reset.
+ *
+ * Returns 0 on success.  On any failure clean_up_interrupts() is called
+ * and a negative errno is returned.
+ */
+static int set_up_interrupts(struct hfi1_devdata *dd)
+{
+       struct hfi1_msix_entry *entries;
+       u32 total, request;
+       int i, ret;
+       int use_intx = 0; /* the expectation is to get every MSI-X vector */
+
+       total = 1 + dd->num_sdma + dd->n_krcv_queues;
+
+       entries = kcalloc(total, sizeof(*entries), GFP_KERNEL);
+       if (!entries) {
+               ret = -ENOMEM;
+               goto fail;
+       }
+
+       /* entry i asks for MSI-X table slot i */
+       for (i = 0; i < total; i++)
+               entries[i].msix.entry = i;
+
+       /* request holds the wanted count going in, the granted count after */
+       request = total;
+       request_msix(dd, &request, entries);
+
+       if (!request) {
+               /* INTx: dd->num_msix_entries is already zero */
+               kfree(entries);
+               use_intx = 1;
+               dd_dev_err(dd, "MSI-X failed, using INTx interrupts\n");
+       } else {
+               /* MSI-X: hand the table to dd so cleanup can release it */
+               dd->num_msix_entries = request;
+               dd->msix_entries = entries;
+
+               if (request != total) {
+                       /* fewer vectors than wanted is not supported */
+                       dd_dev_err(
+                               dd,
+                               "cannot handle reduced interrupt case, want %u, got %u\n",
+                               total, request);
+                       ret = -EINVAL;
+                       goto fail;
+               }
+               dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total);
+       }
+
+       /* quiesce the chip: mask, acknowledge, restore default routing */
+       set_intr_state(dd, 0);
+       clear_all_interrupts(dd);
+       reset_interrupts(dd);
+
+       ret = use_intx ? request_intx_irq(dd) : request_msix_irqs(dd);
+       if (ret)
+               goto fail;
+
+       return 0;
+
+fail:
+       clean_up_interrupts(dd);
+       return ret;
+}
+
+/*
+ * Set up context values in dd.  Sets:
+ *
+ *     num_rcv_contexts - number of contexts being used
+ *     n_krcv_queues - number of kernel contexts
+ *     first_user_ctxt - first non-kernel context in array of contexts
+ *     num_user_contexts - number of user contexts
+ *     freectxts  - number of free user contexts
+ *     rcv_entries - RcvArray group size, groups per context, extra count
+ *     num_send_contexts - number of PIO send contexts being used
+ *
+ * The num_user_contexts variable that is defined outside this function is
+ * also overwritten: when it is negative on entry, and whenever it has to
+ * be reduced to fit the chip receive context count or the RMT size.
+ *
+ * Returns 0 on success.  Otherwise returns the negative value handed back
+ * by init_sc_pools_and_sizes().
+ */
+static int set_up_context_variables(struct hfi1_devdata *dd)
+{
+       int num_kernel_contexts;
+       int total_contexts;
+       int ret;
+       unsigned ngroups;
+       int qos_rmt_count;
+       int user_rmt_reduced;
+
+       /*
+        * Kernel receive contexts:
+        * - min of 2 or 1 context/numa (excluding control context)
+        * - Context 0 - control context (VL15/multicast/error)
+        * - Context 1 - first kernel context
+        * - Context 2 - second kernel context
+        * ...
+        */
+       if (n_krcvqs)
+               /*
+                * n_krcvqs is the sum of module parameter kernel receive
+                * contexts, krcvqs[].  It does not include the control
+                * context, so add that.
+                */
+               num_kernel_contexts = n_krcvqs + 1;
+       else
+               num_kernel_contexts = num_online_nodes() + 1;
+       num_kernel_contexts =
+               max_t(int, MIN_KERNEL_KCTXTS, num_kernel_contexts);
+       /*
+        * Every kernel receive context needs an ACK send context.
+        * one send context is allocated for each VL{0-7} and VL15
+        */
+       if (num_kernel_contexts > (dd->chip_send_contexts - num_vls - 1)) {
+               dd_dev_err(dd,
+                          "Reducing # kernel rcv contexts to: %d, from %d\n",
+                          (int)(dd->chip_send_contexts - num_vls - 1),
+                          (int)num_kernel_contexts);
+               num_kernel_contexts = dd->chip_send_contexts - num_vls - 1;
+       }
+       /*
+        * User contexts:
+        *      - default to 1 user context per real (non-HT) CPU core if
+        *        num_user_contexts is negative
+        */
+       if (num_user_contexts < 0)
+               num_user_contexts =
+                       cpumask_weight(&dd->affinity->real_cpu_mask);
+
+       total_contexts = num_kernel_contexts + num_user_contexts;
+
+       /*
+        * Adjust the counts given a global max.  Only the user context
+        * count is reduced here; the kernel context count is kept.
+        */
+       if (total_contexts > dd->chip_rcv_contexts) {
+               dd_dev_err(dd,
+                          "Reducing # user receive contexts to: %d, from %d\n",
+                          (int)(dd->chip_rcv_contexts - num_kernel_contexts),
+                          (int)num_user_contexts);
+               num_user_contexts = dd->chip_rcv_contexts - num_kernel_contexts;
+               /* recalculate */
+               total_contexts = num_kernel_contexts + num_user_contexts;
+       }
+
+       /* each user context requires an entry in the RMT */
+       qos_rmt_count = qos_rmt_entries(dd, NULL, NULL);
+       if (qos_rmt_count + num_user_contexts > NUM_MAP_ENTRIES) {
+               user_rmt_reduced = NUM_MAP_ENTRIES - qos_rmt_count;
+               dd_dev_err(dd,
+                          "RMT size is reducing the number of user receive contexts from %d to %d\n",
+                          (int)num_user_contexts,
+                          user_rmt_reduced);
+               /* recalculate */
+               num_user_contexts = user_rmt_reduced;
+               total_contexts = num_kernel_contexts + num_user_contexts;
+       }
+
+       /* the first N are kernel contexts, the rest are user contexts */
+       dd->num_rcv_contexts = total_contexts;
+       dd->n_krcv_queues = num_kernel_contexts;
+       dd->first_user_ctxt = num_kernel_contexts;
+       dd->num_user_contexts = num_user_contexts;
+       dd->freectxts = num_user_contexts;
+       dd_dev_info(dd,
+                   "rcv contexts: chip %d, used %d (kernel %d, user %d)\n",
+                   (int)dd->chip_rcv_contexts,
+                   (int)dd->num_rcv_contexts,
+                   (int)dd->n_krcv_queues,
+                   (int)dd->num_rcv_contexts - dd->n_krcv_queues);
+
+       /*
+        * Receive array allocation:
+        *   All RcvArray entries are divided into groups of 8. This
+        *   is required by the hardware and will speed up writes to
+        *   consecutive entries by using write-combining of the entire
+        *   cacheline.
+        *
+        *   The number of groups are evenly divided among all contexts.
+        *   any left over groups will be given to the first N user
+        *   contexts.
+        *
+        *   If the per-context share would exceed MAX_EAGER_ENTRIES * 2
+        *   entries, the group count is capped at that limit and no
+        *   extra groups are handed out.
+        */
+       dd->rcv_entries.group_size = RCV_INCREMENT;
+       ngroups = dd->chip_rcv_array_count / dd->rcv_entries.group_size;
+       dd->rcv_entries.ngroups = ngroups / dd->num_rcv_contexts;
+       dd->rcv_entries.nctxt_extra = ngroups -
+               (dd->num_rcv_contexts * dd->rcv_entries.ngroups);
+       dd_dev_info(dd, "RcvArray groups %u, ctxts extra %u\n",
+                   dd->rcv_entries.ngroups,
+                   dd->rcv_entries.nctxt_extra);
+       if (dd->rcv_entries.ngroups * dd->rcv_entries.group_size >
+           MAX_EAGER_ENTRIES * 2) {
+               dd->rcv_entries.ngroups = (MAX_EAGER_ENTRIES * 2) /
+                       dd->rcv_entries.group_size;
+               dd_dev_info(dd,
+                           "RcvArray group count too high, change to %u\n",
+                           dd->rcv_entries.ngroups);
+               dd->rcv_entries.nctxt_extra = 0;
+       }
+       /*
+        * PIO send contexts: a non-negative return value is the number
+        * of send contexts in use, a negative one is passed to the caller.
+        */
+       ret = init_sc_pools_and_sizes(dd);
+       if (ret >= 0) { /* success */
+               dd->num_send_contexts = ret;
+               dd_dev_info(
+                       dd,
+                       "send contexts: chip %d, used %d (kernel %d, ack %d, user %d, vl15 %d)\n",
+                       dd->chip_send_contexts,
+                       dd->num_send_contexts,
+                       dd->sc_sizes[SC_KERNEL].count,
+                       dd->sc_sizes[SC_ACK].count,
+                       dd->sc_sizes[SC_USER].count,
+                       dd->sc_sizes[SC_VL15].count);
+               ret = 0;        /* success */
+       }
+
+       return ret;
+}
+
+/*
+ * Program the port's partition key table into the receive partition key
+ * CSRs and turn on the hardware pkey check.  The MAD code makes sure
+ * that, at least, the partial management partition key is in the table.
+ *
+ * Four pkeys are packed into each 64-bit register; a register is written
+ * once its fourth pkey has been merged in.
+ */
+static void set_partition_keys(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 packed = 0;
+       int idx;
+
+       dd_dev_info(dd, "Setting partition keys\n");
+       for (idx = 0; idx < hfi1_get_npkeys(dd); idx++) {
+               int slot = idx % 4;     /* position inside the register */
+
+               packed |= (ppd->pkeys[idx] &
+                          RCV_PARTITION_KEY_PARTITION_KEY_A_MASK) <<
+                         (slot * RCV_PARTITION_KEY_PARTITION_KEY_B_SHIFT);
+
+               /* keep accumulating until the register is full */
+               if (slot != 3)
+                       continue;
+
+               write_csr(dd, RCV_PARTITION_KEY + ((idx - 3) * 2), packed);
+               packed = 0;
+       }
+
+       /* the HW pkey check is always enabled once the table is set */
+       add_rcvctrl(dd, RCV_CTRL_RCV_PARTITION_KEY_ENABLE_SMASK);
+}
+
+/*
+ * Pre-write the CSRs and memories that come out of reset uninitialized,
+ * so that their ECC/parity bits are set before anything reads them.
+ *
+ * NOTE: user context CSRs that are not mmaped write-only (the TID flows,
+ * for example) have to be written here even when the driver itself never
+ * reads them.
+ */
+static void write_uninitialized_csrs_and_memories(struct hfi1_devdata *dd)
+{
+       int idx, flow;
+
+       /* CceIntMap: zero every interrupt map CSR */
+       for (idx = 0; idx < CCE_NUM_INT_MAP_CSRS; idx++)
+               write_csr(dd, CCE_INT_MAP + (8 * idx), 0);
+
+       /* SendCtxtCreditReturnAddr: one per chip send context */
+       for (idx = 0; idx < dd->chip_send_contexts; idx++)
+               write_kctxt_csr(dd, idx, SEND_CTXT_CREDIT_RETURN_ADDR, 0);
+
+       /*
+        * The PIO send buffers and SDMA send buffers are skipped: they
+        * are not normally read, and (presently) have no method to be
+        * read, so they are not pre-initialized.
+        */
+
+       /* per receive context: RcvHdrAddr, RcvHdrTailAddr, RcvTidFlowTable */
+       for (idx = 0; idx < dd->chip_rcv_contexts; idx++) {
+               write_kctxt_csr(dd, idx, RCV_HDR_ADDR, 0);
+               write_kctxt_csr(dd, idx, RCV_HDR_TAIL_ADDR, 0);
+               for (flow = 0; flow < RXE_NUM_TID_FLOWS; flow++)
+                       write_uctxt_csr(dd, idx,
+                                       RCV_TID_FLOW_TABLE + (8 * flow), 0);
+       }
+
+       /* RcvArray: every entry gets the RT write-enable value */
+       for (idx = 0; idx < dd->chip_rcv_array_count; idx++)
+               write_csr(dd, RCV_ARRAY + (8 * idx),
+                         RCV_ARRAY_RT_WRITE_ENABLE_SMASK);
+
+       /* RcvQPMapTable: 32 registers */
+       for (idx = 0; idx < 32; idx++)
+               write_csr(dd, RCV_QP_MAP_TABLE + (8 * idx), 0);
+}
+
+/*
+ * Clear status_bits in CceStatus by writing ctrl_bits to CceCtrl.
+ *
+ * Nothing is written when none of status_bits is set on entry.  After
+ * the write, CceStatus is polled until the bits are gone or
+ * CCE_STATUS_TIMEOUT milliseconds have passed; a timeout is only logged.
+ */
+static void clear_cce_status(struct hfi1_devdata *dd, u64 status_bits,
+                            u64 ctrl_bits)
+{
+       unsigned long deadline;
+       u64 status;
+
+       /* bail out early when the condition is not present */
+       status = read_csr(dd, CCE_STATUS);
+       if (!(status & status_bits))
+               return;
+
+       /* ask the chip to clear the condition */
+       write_csr(dd, CCE_CTRL, ctrl_bits);
+
+       /* poll until the condition is gone or the deadline has passed */
+       deadline = jiffies + msecs_to_jiffies(CCE_STATUS_TIMEOUT);
+       for (;;) {
+               status = read_csr(dd, CCE_STATUS);
+               if (!(status & status_bits))
+                       return;
+               if (time_after(jiffies, deadline))
+                       break;
+               udelay(1);
+       }
+
+       dd_dev_err(dd,
+                  "Timeout waiting for CceStatus to clear bits 0x%llx, remaining 0x%llx\n",
+                  status_bits, status & status_bits);
+}
+
+/*
+ * Set CCE CSRs to chip reset defaults.
+ *
+ * Read-only registers and the "force" registers are skipped, as noted
+ * inline.  Pending freeze/pause conditions in CceStatus are cleared
+ * through CceCtrl, error and interrupt masks are zeroed, and the
+ * matching clear registers are written with all ones.
+ */
+static void reset_cce_csrs(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /* CCE_REVISION read-only */
+       /* CCE_REVISION2 read-only */
+       /* CCE_CTRL - bits clear automatically */
+       /* CCE_STATUS read-only, use CceCtrl to clear */
+       clear_cce_status(dd, ALL_FROZE, CCE_CTRL_SPC_UNFREEZE_SMASK);
+       clear_cce_status(dd, ALL_TXE_PAUSE, CCE_CTRL_TXE_RESUME_SMASK);
+       clear_cce_status(dd, ALL_RXE_PAUSE, CCE_CTRL_RXE_RESUME_SMASK);
+       for (i = 0; i < CCE_NUM_SCRATCH; i++)
+               write_csr(dd, CCE_SCRATCH + (8 * i), 0);
+       /* CCE_ERR_STATUS read-only */
+       write_csr(dd, CCE_ERR_MASK, 0);
+       write_csr(dd, CCE_ERR_CLEAR, ~0ull);
+       /* CCE_ERR_FORCE leave alone */
+       for (i = 0; i < CCE_NUM_32_BIT_COUNTERS; i++)
+               write_csr(dd, CCE_COUNTER_ARRAY32 + (8 * i), 0);
+       write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_RESETCSR);
+       /* CCE_PCIE_CTRL leave alone */
+       for (i = 0; i < CCE_NUM_MSIX_VECTORS; i++) {
+               write_csr(dd, CCE_MSIX_TABLE_LOWER + (8 * i), 0);
+               write_csr(dd, CCE_MSIX_TABLE_UPPER + (8 * i),
+                         CCE_MSIX_TABLE_UPPER_RESETCSR);
+       }
+       /*
+        * NOTE(review): both writes below use the same address on every
+        * pass.  Left unchanged because these may be single registers
+        * rather than arrays - confirm against the register map.
+        */
+       for (i = 0; i < CCE_NUM_MSIX_PBAS; i++) {
+               /* CCE_MSIX_PBA read-only */
+               write_csr(dd, CCE_MSIX_INT_GRANTED, ~0ull);
+               write_csr(dd, CCE_MSIX_VEC_CLR_WITHOUT_INT, ~0ull);
+       }
+       /*
+        * CceIntMap is an array of CSRs at an 8-byte stride; step through
+        * it so that every map register is zeroed, not just the first.
+        */
+       for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
+               write_csr(dd, CCE_INT_MAP + (8 * i), 0);
+       for (i = 0; i < CCE_NUM_INT_CSRS; i++) {
+               /* CCE_INT_STATUS read-only */
+               write_csr(dd, CCE_INT_MASK + (8 * i), 0);
+               write_csr(dd, CCE_INT_CLEAR + (8 * i), ~0ull);
+               /* CCE_INT_FORCE leave alone */
+               /* CCE_INT_BLOCKED read-only */
+       }
+       for (i = 0; i < CCE_NUM_32_BIT_INT_COUNTERS; i++)
+               write_csr(dd, CCE_INT_COUNTER_ARRAY32 + (8 * i), 0);
+}
+
+/*
+ * Set MISC CSRs to chip reset defaults.
+ *
+ * Zeroes 32 registers each of the RSA R2, signature, and modulus
+ * arrays, writes 1 to the RSA command register, zeroes the RSA MU and
+ * firmware control registers, then masks and clears all MISC errors.
+ * Read-only and "force" registers are skipped, as noted inline.
+ */
+static void reset_misc_csrs(struct hfi1_devdata *dd)
+{
+       int i;
+
+       for (i = 0; i < 32; i++) {
+               write_csr(dd, MISC_CFG_RSA_R2 + (8 * i), 0);
+               write_csr(dd, MISC_CFG_RSA_SIGNATURE + (8 * i), 0);
+               write_csr(dd, MISC_CFG_RSA_MODULUS + (8 * i), 0);
+       }
+       /*
+        * MISC_CFG_SHA_PRELOAD leave alone - always reads 0 and can
+        * only be written in 128-byte chunks
+        */
+       /* init RSA engine to clear lingering errors */
+       write_csr(dd, MISC_CFG_RSA_CMD, 1);
+       write_csr(dd, MISC_CFG_RSA_MU, 0);
+       write_csr(dd, MISC_CFG_FW_CTRL, 0);
+       /* MISC_STS_8051_DIGEST read-only */
+       /* MISC_STS_SBM_DIGEST read-only */
+       /* MISC_STS_PCIE_DIGEST read-only */
+       /* MISC_STS_FAB_DIGEST read-only */
+       /* MISC_ERR_STATUS read-only */
+       write_csr(dd, MISC_ERR_MASK, 0);
+       write_csr(dd, MISC_ERR_CLEAR, ~0ull);
+       /* MISC_ERR_FORCE leave alone */
+}
+
+/*
+ * Set TXE CSRs to chip reset defaults.
+ *
+ * Works in three passes: the TXE kernel CSRs, then the per-context
+ * CSRs for every chip send context, then the per-engine CSRs for every
+ * chip SDMA engine.  Read-only registers are skipped, as noted inline.
+ */
+static void reset_txe_csrs(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /*
+        * TXE Kernel CSRs
+        */
+       write_csr(dd, SEND_CTRL, 0);
+       __cm_reset(dd, 0);      /* reset CM internal state */
+       /* SEND_CONTEXTS read-only */
+       /* SEND_DMA_ENGINES read-only */
+       /* SEND_PIO_MEM_SIZE read-only */
+       /* SEND_DMA_MEM_SIZE read-only */
+       write_csr(dd, SEND_HIGH_PRIORITY_LIMIT, 0);
+       pio_reset_all(dd);      /* SEND_PIO_INIT_CTXT */
+       /* SEND_PIO_ERR_STATUS read-only */
+       write_csr(dd, SEND_PIO_ERR_MASK, 0);
+       write_csr(dd, SEND_PIO_ERR_CLEAR, ~0ull);
+       /* SEND_PIO_ERR_FORCE leave alone */
+       /* SEND_DMA_ERR_STATUS read-only */
+       write_csr(dd, SEND_DMA_ERR_MASK, 0);
+       write_csr(dd, SEND_DMA_ERR_CLEAR, ~0ull);
+       /* SEND_DMA_ERR_FORCE leave alone */
+       /* SEND_EGRESS_ERR_STATUS read-only */
+       write_csr(dd, SEND_EGRESS_ERR_MASK, 0);
+       write_csr(dd, SEND_EGRESS_ERR_CLEAR, ~0ull);
+       /* SEND_EGRESS_ERR_FORCE leave alone */
+       write_csr(dd, SEND_BTH_QP, 0);
+       write_csr(dd, SEND_STATIC_RATE_CONTROL, 0);
+       write_csr(dd, SEND_SC2VLT0, 0);
+       write_csr(dd, SEND_SC2VLT1, 0);
+       write_csr(dd, SEND_SC2VLT2, 0);
+       write_csr(dd, SEND_SC2VLT3, 0);
+       write_csr(dd, SEND_LEN_CHECK0, 0);
+       write_csr(dd, SEND_LEN_CHECK1, 0);
+       /* SEND_ERR_STATUS read-only */
+       write_csr(dd, SEND_ERR_MASK, 0);
+       write_csr(dd, SEND_ERR_CLEAR, ~0ull);
+       /* SEND_ERR_FORCE read-only */
+       for (i = 0; i < VL_ARB_LOW_PRIO_TABLE_SIZE; i++)
+               write_csr(dd, SEND_LOW_PRIORITY_LIST + (8 * i), 0);
+       for (i = 0; i < VL_ARB_HIGH_PRIO_TABLE_SIZE; i++)
+               write_csr(dd, SEND_HIGH_PRIORITY_LIST + (8 * i), 0);
+       for (i = 0; i < dd->chip_send_contexts / NUM_CONTEXTS_PER_SET; i++)
+               write_csr(dd, SEND_CONTEXT_SET_CTRL + (8 * i), 0);
+       for (i = 0; i < TXE_NUM_32_BIT_COUNTER; i++)
+               write_csr(dd, SEND_COUNTER_ARRAY32 + (8 * i), 0);
+       for (i = 0; i < TXE_NUM_64_BIT_COUNTER; i++)
+               write_csr(dd, SEND_COUNTER_ARRAY64 + (8 * i), 0);
+       write_csr(dd, SEND_CM_CTRL, SEND_CM_CTRL_RESETCSR);
+       write_csr(dd, SEND_CM_GLOBAL_CREDIT, SEND_CM_GLOBAL_CREDIT_RESETCSR);
+       /* SEND_CM_CREDIT_USED_STATUS read-only */
+       write_csr(dd, SEND_CM_TIMER_CTRL, 0);
+       write_csr(dd, SEND_CM_LOCAL_AU_TABLE0_TO3, 0);
+       write_csr(dd, SEND_CM_LOCAL_AU_TABLE4_TO7, 0);
+       write_csr(dd, SEND_CM_REMOTE_AU_TABLE0_TO3, 0);
+       write_csr(dd, SEND_CM_REMOTE_AU_TABLE4_TO7, 0);
+       for (i = 0; i < TXE_NUM_DATA_VL; i++)
+               write_csr(dd, SEND_CM_CREDIT_VL + (8 * i), 0);
+       write_csr(dd, SEND_CM_CREDIT_VL15, 0);
+       /* SEND_CM_CREDIT_USED_VL read-only */
+       /* SEND_CM_CREDIT_USED_VL15 read-only */
+       /* SEND_EGRESS_CTXT_STATUS read-only */
+       /* SEND_EGRESS_SEND_DMA_STATUS read-only */
+       write_csr(dd, SEND_EGRESS_ERR_INFO, ~0ull);
+       /*
+        * NOTE(review): this spot used to say SEND_EGRESS_ERR_INFO is
+        * read-only, yet it is written just above - presumably the write
+        * clears it; confirm against the register spec.
+        */
+       /* SEND_EGRESS_ERR_SOURCE read-only */
+
+       /*
+        * TXE Per-Context CSRs
+        */
+       for (i = 0; i < dd->chip_send_contexts; i++) {
+               write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_CTRL, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_FORCE, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_ERR_MASK, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, ~0ull);
+               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_ENABLE, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_VL, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_JOB_KEY, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_PARTITION_KEY, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_OPCODE, 0);
+       }
+
+       /*
+        * TXE Per-SDMA CSRs
+        */
+       for (i = 0; i < dd->chip_sdma_engines; i++) {
+               write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0);
+               /* SEND_DMA_STATUS read-only */
+               write_kctxt_csr(dd, i, SEND_DMA_BASE_ADDR, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_LEN_GEN, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_TAIL, 0);
+               /* SEND_DMA_HEAD read-only */
+               write_kctxt_csr(dd, i, SEND_DMA_HEAD_ADDR, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_PRIORITY_THLD, 0);
+               /* SEND_DMA_IDLE_CNT read-only */
+               write_kctxt_csr(dd, i, SEND_DMA_RELOAD_CNT, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_DESC_CNT, 0);
+               /* SEND_DMA_DESC_FETCHED_CNT read-only */
+               /* SEND_DMA_ENG_ERR_STATUS read-only */
+               write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_MASK, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, ~0ull);
+               /* SEND_DMA_ENG_ERR_FORCE leave alone */
+               write_kctxt_csr(dd, i, SEND_DMA_CHECK_ENABLE, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_CHECK_VL, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_CHECK_JOB_KEY, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_CHECK_PARTITION_KEY, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_CHECK_SLID, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_CHECK_OPCODE, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_MEMORY, 0);
+       }
+}
+
+/*
+ * Start a receive buffer init through RcvCtrl.RxRbufInit and poll
+ * RcvStatus until the init-done bit is set.
+ *
+ * Expect on entry:
+ * o Packet ingress is disabled, i.e. RcvCtrl.RcvPortEnable == 0
+ *
+ * Neither wait below is fatal: when a poll limit is reached an error is
+ * logged and the routine carries on.
+ */
+static void init_rbufs(struct hfi1_devdata *dd)
+{
+       u64 reg;
+       int count;
+
+       /*
+        * Wait for DMA to stop: RxRbufPktPending and RxPktInProgress are
+        * clear.
+        */
+       count = 0;
+       while (1) {
+               reg = read_csr(dd, RCV_STATUS);
+               if ((reg & (RCV_STATUS_RX_RBUF_PKT_PENDING_SMASK
+                           | RCV_STATUS_RX_PKT_IN_PROGRESS_SMASK)) == 0)
+                       break;
+               /*
+                * Give up after 1ms - maximum wait time.
+                *
+                * RBuf size is 148KiB.  Slowest possible is PCIe Gen1 x1 at
+                * 250MB/s bandwidth.  Lower rate to 66% for overhead to get:
+                *      148 KB / (66% * 250MB/s) = 920us
+                *
+                * The limit is counted in polls, each followed by a 2us
+                * delay.
+                */
+               if (count++ > 500) {
+                       dd_dev_err(dd,
+                                  "%s: in-progress DMA not clearing: RcvStatus 0x%llx, continuing\n",
+                                  __func__, reg);
+                       break;
+               }
+               udelay(2); /* do not busy-wait the CSR */
+       }
+
+       /* start the init - expect RcvCtrl to be 0 */
+       write_csr(dd, RCV_CTRL, RCV_CTRL_RX_RBUF_INIT_SMASK);
+
+       /*
+        * Read to force the write of RcvCtrl.RxRbufInit.  There is a brief
+        * period after the write before RcvStatus.RxRbufInitDone is valid.
+        * The delay in the first run through the loop below is sufficient and
+        * required before the first read of RcvStatus.RxRbufInitDone.
+        */
+       read_csr(dd, RCV_CTRL);
+
+       /* wait for the init to finish */
+       count = 0;
+       while (1) {
+               /* delay is required first time through - see above */
+               udelay(2); /* do not busy-wait the CSR */
+               reg = read_csr(dd, RCV_STATUS);
+               if (reg & (RCV_STATUS_RX_RBUF_INIT_DONE_SMASK))
+                       break;
+
+               /* give up after 100us - slowest possible at 33MHz is 73us */
+               if (count++ > 50) {
+                       dd_dev_err(dd,
+                                  "%s: RcvStatus.RxRbufInit not set, continuing\n",
+                                  __func__);
+                       break;
+               }
+       }
+}
+
+/*
+ * Set RXE CSRs to chip reset defaults.
+ *
+ * RcvCtrl is zeroed first and init_rbufs() is run right after it, then
+ * the remaining RXE kernel CSRs are reset, followed by the kernel and
+ * user per-context CSRs of every chip receive context.  Read-only
+ * registers are skipped, as noted inline.
+ */
+static void reset_rxe_csrs(struct hfi1_devdata *dd)
+{
+       int i, j;
+
+       /*
+        * RXE Kernel CSRs
+        */
+       write_csr(dd, RCV_CTRL, 0);
+       init_rbufs(dd);
+       /* RCV_STATUS read-only */
+       /* RCV_CONTEXTS read-only */
+       /* RCV_ARRAY_CNT read-only */
+       /* RCV_BUF_SIZE read-only */
+       write_csr(dd, RCV_BTH_QP, 0);
+       write_csr(dd, RCV_MULTICAST, 0);
+       write_csr(dd, RCV_BYPASS, 0);
+       write_csr(dd, RCV_VL15, 0);
+       /* this is a clear-down */
+       write_csr(dd, RCV_ERR_INFO,
+                 RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK);
+       /* RCV_ERR_STATUS read-only */
+       write_csr(dd, RCV_ERR_MASK, 0);
+       write_csr(dd, RCV_ERR_CLEAR, ~0ull);
+       /* RCV_ERR_FORCE leave alone */
+       for (i = 0; i < 32; i++)
+               write_csr(dd, RCV_QP_MAP_TABLE + (8 * i), 0);
+       for (i = 0; i < 4; i++)
+               write_csr(dd, RCV_PARTITION_KEY + (8 * i), 0);
+       for (i = 0; i < RXE_NUM_32_BIT_COUNTERS; i++)
+               write_csr(dd, RCV_COUNTER_ARRAY32 + (8 * i), 0);
+       for (i = 0; i < RXE_NUM_64_BIT_COUNTERS; i++)
+               write_csr(dd, RCV_COUNTER_ARRAY64 + (8 * i), 0);
+       for (i = 0; i < RXE_NUM_RSM_INSTANCES; i++) {
+               write_csr(dd, RCV_RSM_CFG + (8 * i), 0);
+               write_csr(dd, RCV_RSM_SELECT + (8 * i), 0);
+               write_csr(dd, RCV_RSM_MATCH + (8 * i), 0);
+       }
+       for (i = 0; i < 32; i++)
+               write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), 0);
+
+       /*
+        * RXE Kernel and User Per-Context CSRs
+        */
+       for (i = 0; i < dd->chip_rcv_contexts; i++) {
+               /* kernel */
+               write_kctxt_csr(dd, i, RCV_CTXT_CTRL, 0);
+               /* RCV_CTXT_STATUS read-only */
+               write_kctxt_csr(dd, i, RCV_EGR_CTRL, 0);
+               write_kctxt_csr(dd, i, RCV_TID_CTRL, 0);
+               write_kctxt_csr(dd, i, RCV_KEY_CTRL, 0);
+               write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0);
+               write_kctxt_csr(dd, i, RCV_HDR_CNT, 0);
+               write_kctxt_csr(dd, i, RCV_HDR_ENT_SIZE, 0);
+               write_kctxt_csr(dd, i, RCV_HDR_SIZE, 0);
+               write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0);
+               write_kctxt_csr(dd, i, RCV_AVAIL_TIME_OUT, 0);
+               write_kctxt_csr(dd, i, RCV_HDR_OVFL_CNT, 0);
+
+               /* user */
+               /* RCV_HDR_TAIL read-only */
+               write_uctxt_csr(dd, i, RCV_HDR_HEAD, 0);
+               /* RCV_EGR_INDEX_TAIL read-only */
+               write_uctxt_csr(dd, i, RCV_EGR_INDEX_HEAD, 0);
+               /* RCV_EGR_OFFSET_TAIL read-only */
+               for (j = 0; j < RXE_NUM_TID_FLOWS; j++) {
+                       write_uctxt_csr(dd, i,
+                                       RCV_TID_FLOW_TABLE + (8 * j), 0);
+               }
+       }
+}
+
+/*
+ * Program the SC to VL mapping tables and the driver's cached copy.
+ *
+ * The tables power on to zeros, so they have to be set to avoid send
+ * context errors.  The mapping used is:
+ *
+ *     SC 0-7  -> VL 0-7 (respectively)
+ *     SC 15   -> VL 15
+ *     all other SCs -> VL 0
+ *
+ * The same mapping is written to the send side tables, to the DC
+ * tables used for received packets, and to dd->sc2vl.
+ */
+static void init_sc2vl_tables(struct hfi1_devdata *dd)
+{
+       u8 *cached = (u8 *)(dd->sc2vl);
+       int sc;
+
+       /* values follow the architecture spec, limited by the hardware */
+
+       /* send side: the HFI maps sent packets */
+       write_csr(dd, SEND_SC2VLT0, SC2VL_VAL(
+               0,
+               0, 0, 1, 1,
+               2, 2, 3, 3,
+               4, 4, 5, 5,
+               6, 6, 7, 7));
+       write_csr(dd, SEND_SC2VLT1, SC2VL_VAL(
+               1,
+               8, 0, 9, 0,
+               10, 0, 11, 0,
+               12, 0, 13, 0,
+               14, 0, 15, 15));
+       write_csr(dd, SEND_SC2VLT2, SC2VL_VAL(
+               2,
+               16, 0, 17, 0,
+               18, 0, 19, 0,
+               20, 0, 21, 0,
+               22, 0, 23, 0));
+       write_csr(dd, SEND_SC2VLT3, SC2VL_VAL(
+               3,
+               24, 0, 25, 0,
+               26, 0, 27, 0,
+               28, 0, 29, 0,
+               30, 0, 31, 0));
+
+       /* receive side: the DC maps received packets */
+       write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0, DC_SC_VL_VAL(
+               15_0,
+               0, 0, 1, 1,  2, 2,  3, 3,  4, 4,  5, 5,  6, 6,  7,  7,
+               8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 15));
+       write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16, DC_SC_VL_VAL(
+               31_16,
+               16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, 0,
+               24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31, 0));
+
+       /* keep the cached sc2vl bytes in step with what the h/w was told */
+       for (sc = 0; sc < 32; sc++)
+               cached[sc] = (sc < 8 || sc == 15) ? (u8)sc : 0;
+}
+
+/*
+ * Read chip sizes and then reset parts to sane, disabled, values.  We cannot
+ * depend on the chip going through a power-on reset - a driver may be loaded
+ * and unloaded many times.
+ *
+ * Do not write any CSR values to the chip in this routine - there may be
+ * a reset following the (possible) FLR in this routine.
+ * NOTE(review): the routine itself writes CSRs to quiesce the chip, so the
+ * sentence above presumably refers to configuration that has to survive
+ * the reset - confirm.
+ *
+ * Sequence: disable send contexts, SDMA engines, the receive port and
+ * every receive context, mask all interrupt sources, put the DC in
+ * reset, reset the CSRs (by FLR when use_flr is set, otherwise with
+ * explicit writes), release the DC reset, turn the LED off, drive the
+ * QSFP output pins high, and finally call init_chip_resources().
+ */
+static void init_chip(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /*
+        * Put the HFI CSRs in a known state.
+        * Combine this with a DC reset.
+        *
+        * Stop the device from doing anything while we do a
+        * reset.  We know there are no other active users of
+        * the device since we are now in charge.  Turn off
+        * all outbound and inbound traffic and make sure
+        * the device does not generate any interrupts.
+        */
+
+       /* disable send contexts and SDMA engines */
+       write_csr(dd, SEND_CTRL, 0);
+       for (i = 0; i < dd->chip_send_contexts; i++)
+               write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0);
+       for (i = 0; i < dd->chip_sdma_engines; i++)
+               write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0);
+       /* disable port (turn off RXE inbound traffic) and contexts */
+       write_csr(dd, RCV_CTRL, 0);
+       /*
+        * RcvCtxtCtrl is a per-context CSR: use the per-context accessor
+        * so that every receive context is disabled, not just one.
+        */
+       for (i = 0; i < dd->chip_rcv_contexts; i++)
+               write_kctxt_csr(dd, i, RCV_CTXT_CTRL, 0);
+       /* mask all interrupt sources */
+       for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+               write_csr(dd, CCE_INT_MASK + (8 * i), 0ull);
+
+       /*
+        * DC Reset: do a full DC reset before the register clear.
+        * A recommended length of time to hold is one CSR read,
+        * so reread the CceDcCtrl.  Then, hold the DC in reset
+        * across the clear.
+        */
+       write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK);
+       (void)read_csr(dd, CCE_DC_CTRL);
+
+       if (use_flr) {
+               /*
+                * A FLR will reset the SPC core and part of the PCIe.
+                * The parts that need to be restored have already been
+                * saved.
+                */
+               dd_dev_info(dd, "Resetting CSRs with FLR\n");
+
+               /* do the FLR, the DC reset will remain */
+               hfi1_pcie_flr(dd);
+
+               /* restore command and BARs */
+               restore_pci_variables(dd);
+
+               /* on an is_ax() chip the FLR and restore are done twice */
+               if (is_ax(dd)) {
+                       dd_dev_info(dd, "Resetting CSRs with FLR\n");
+                       hfi1_pcie_flr(dd);
+                       restore_pci_variables(dd);
+               }
+       } else {
+               dd_dev_info(dd, "Resetting CSRs with writes\n");
+               reset_cce_csrs(dd);
+               reset_txe_csrs(dd);
+               reset_rxe_csrs(dd);
+               reset_misc_csrs(dd);
+       }
+       /* clear the DC reset */
+       write_csr(dd, CCE_DC_CTRL, 0);
+
+       /* Set the LED off */
+       setextled(dd, 0);
+
+       /*
+        * Clear the QSFP reset.
+        * An FLR enforces a 0 on all out pins. The driver does not touch
+        * ASIC_QSFPn_OUT otherwise.  This leaves RESET_N low and
+        * anything plugged constantly in reset, if it pays attention
+        * to RESET_N.
+        * Prime examples of this are optical cables. Set all pins high.
+        * I2CCLK and I2CDAT will change per direction, and INT_N and
+        * MODPRS_N are input only and their value is ignored.
+        */
+       write_csr(dd, ASIC_QSFP1_OUT, 0x1f);
+       write_csr(dd, ASIC_QSFP2_OUT, 0x1f);
+       init_chip_resources(dd);
+}
+
+/*
+ * Early, one-time variable and CSR set up: link credit values, the
+ * uninitialized CSRs and memories, the partition keys (only when the
+ * PKEY_CHECK capability is set), and the sc2vl tables.
+ */
+static void init_early_variables(struct hfi1_devdata *dd)
+{
+       int port;
+
+       /* link credit variables; is_ax() chips get one credit less */
+       dd->vau = CM_VAU;
+       dd->link_credits = CM_GLOBAL_CREDITS;
+       if (is_ax(dd))
+               dd->link_credits--;
+       dd->vcu = cu_to_vcu(hfi1_cu);
+
+       /*
+        * VL15 gets room for 8 MAD packets plus header (8 * (2048 + 128)
+        * bytes, 17K), expressed in AUs and capped at the link credits.
+        */
+       dd->vl15_init = (8 * (2048 + 128)) / vau_to_au(dd->vau);
+       if (dd->vl15_init > dd->link_credits)
+               dd->vl15_init = dd->link_credits;
+
+       write_uninitialized_csrs_and_memories(dd);
+
+       if (HFI1_CAP_IS_KSET(PKEY_CHECK)) {
+               for (port = 0; port < dd->num_pports; port++)
+                       set_partition_keys(&dd->pport[port]);
+       }
+       init_sc2vl_tables(dd);
+}
+
+/*
+ * Validate the kdeth_qp value and program it into both the send and
+ * the receive BTH QP CSRs.
+ *
+ * A value of 0xff or above is rejected with an error message.  A value
+ * of 0 (not set, or just rejected) is replaced with DEFAULT_KDETH_QP.
+ * Note that kdeth_qp itself, which is defined outside this function,
+ * is updated with the value that ends up being used.
+ */
+static void init_kdeth_qp(struct hfi1_devdata *dd)
+{
+       /* user changed the KDETH_QP; 0 can never reach the limit */
+       if (kdeth_qp >= 0xff) {
+               /* out of range or illegal value */
+               dd_dev_err(dd, "Invalid KDETH queue pair prefix, ignoring\n");
+               kdeth_qp = 0;
+       }
+       if (kdeth_qp == 0)      /* not set, or failed range check */
+               kdeth_qp = DEFAULT_KDETH_QP;
+
+       write_csr(dd, SEND_BTH_QP,
+                 (kdeth_qp & SEND_BTH_QP_KDETH_QP_MASK) <<
+                 SEND_BTH_QP_KDETH_QP_SHIFT);
+
+       write_csr(dd, RCV_BTH_QP,
+                 (kdeth_qp & RCV_BTH_QP_KDETH_QP_MASK) <<
+                 RCV_BTH_QP_KDETH_QP_SHIFT);
+}
+
+/**
+ * init_qpmap_table
+ * @dd - device data
+ * @first_ctxt - first context
+ * @last_ctxt - last context
+ *
+ * This routine sets the qpn mapping table that
+ * is indexed by qpn[8:1].
+ *
+ * The 256 table entries are handed out round robin over
+ * the contexts first_ctxt through last_ctxt.  Entries are
+ * one byte each, so eight of them are packed into every
+ * 64-bit CSR that is written.
+ *
+ * The first/last pair looks ahead to having specialized
+ * receive contexts for mgmt and bypass.  Normal verbs
+ * traffic is assumed to be on a range of receive contexts.
+ *
+ * The QP map and bypass enables are added to RcvCtrl once
+ * the table has been written.
+ */
+static void init_qpmap_table(struct hfi1_devdata *dd,
+                            u32 first_ctxt,
+                            u32 last_ctxt)
+{
+       u64 csr_addr = RCV_QP_MAP_TABLE;
+       u64 packed = 0;
+       u64 next_ctxt = first_ctxt;
+       int entry;
+
+       for (entry = 0; entry < 256; entry++) {
+               int lane = entry % 8;   /* byte position in the CSR */
+
+               packed |= next_ctxt << (8 * lane);
+
+               /* advance, wrapping back to the first context */
+               if (next_ctxt >= last_ctxt)
+                       next_ctxt = first_ctxt;
+               else
+                       next_ctxt++;
+
+               /* all eight lanes filled: flush and move to the next CSR */
+               if (lane == 7) {
+                       write_csr(dd, csr_addr, packed);
+                       packed = 0;
+                       csr_addr += 8;
+               }
+       }
+
+       add_rcvctrl(dd, RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK
+                       | RCV_CTRL_RCV_BYPASS_ENABLE_SMASK);
+}
+
+struct rsm_map_table {
+       u64 map[NUM_MAP_REGS];  /* one u64 per RCV_RSM_MAP_TABLE CSR */
+       unsigned int used;      /* 0 at allocation; presumably entries consumed - confirm */
+};
+
+struct rsm_rule_data {
+       u8 offset;              /* RCV_RSM_CFG: offset field */
+       u8 pkt_type;            /* RCV_RSM_CFG: packet type field */
+       u32 field1_off;         /* RCV_RSM_SELECT: field1 offset */
+       u32 field2_off;         /* RCV_RSM_SELECT: field2 offset */
+       u32 index1_off;         /* RCV_RSM_SELECT: index1 offset */
+       u32 index1_width;       /* RCV_RSM_SELECT: index1 width */
+       u32 index2_off;         /* RCV_RSM_SELECT: index2 offset */
+       u32 index2_width;       /* RCV_RSM_SELECT: index2 width */
+       u32 mask1;              /* RCV_RSM_MATCH: mask1 */
+       u32 value1;             /* RCV_RSM_MATCH: value1 */
+       u32 mask2;              /* RCV_RSM_MATCH: mask2 */
+       u32 value2;             /* RCV_RSM_MATCH: value2 */
+};
+
+/*
+ * Allocate an RMT map table and hand it back ready for users to fill
+ * in: every map byte is preset (0 when is_ax(dd) is true, 0xff
+ * otherwise) and the used count is zero.
+ *
+ * Returns NULL when the allocation fails; that is OK and simply means
+ * there is no table.
+ */
+static struct rsm_map_table *alloc_rsm_map_table(struct hfi1_devdata *dd)
+{
+       struct rsm_map_table *table;
+       u8 fill;
+
+       /* 0 is the default receive context on an a0 version chip */
+       if (is_ax(dd))
+               fill = 0;
+       else
+               fill = 0xff;
+
+       table = kmalloc(sizeof(*table), GFP_KERNEL);
+       if (!table)
+               return NULL;
+
+       memset(table->map, fill, sizeof(table->map));
+       table->used = 0;
+
+       return table;
+}
+
+/*
+ * Write the final RSM map table image to the chip and enable RSM.  Does
+ * nothing if the table is NULL.  The table is not freed here; that is
+ * left to the caller.
+ */
+static void complete_rsm_map_table(struct hfi1_devdata *dd,
+                                  struct rsm_map_table *rmt)
+{
+       int reg;
+
+       if (!rmt)
+               return;
+
+       /* copy every register image to its map table CSR */
+       for (reg = 0; reg < NUM_MAP_REGS; reg++)
+               write_csr(dd, RCV_RSM_MAP_TABLE + (8 * reg), rmt->map[reg]);
+
+       /* turn RSM on only after the table contents are in place */
+       add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK);
+}
+
+/*
+ * Program one receive side mapping rule.  The rule's CFG, SELECT and
+ * MATCH CSRs (each at an 8 byte stride per rule_index) are built from
+ * the fields of @rrd and written in that order.
+ */
+static void add_rsm_rule(struct hfi1_devdata *dd, u8 rule_index,
+                        struct rsm_rule_data *rrd)
+{
+       const u32 stride = 8 * rule_index;
+       u64 cfg, select, match;
+
+       cfg = ((u64)rrd->offset << RCV_RSM_CFG_OFFSET_SHIFT) |
+             (1ull << rule_index) | /* enable bit */
+             ((u64)rrd->pkt_type << RCV_RSM_CFG_PACKET_TYPE_SHIFT);
+
+       select = ((u64)rrd->field1_off << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT) |
+                ((u64)rrd->field2_off << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT) |
+                ((u64)rrd->index1_off << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT) |
+                ((u64)rrd->index1_width << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT) |
+                ((u64)rrd->index2_off << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT) |
+                ((u64)rrd->index2_width << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT);
+
+       match = ((u64)rrd->mask1 << RCV_RSM_MATCH_MASK1_SHIFT) |
+               ((u64)rrd->value1 << RCV_RSM_MATCH_VALUE1_SHIFT) |
+               ((u64)rrd->mask2 << RCV_RSM_MATCH_MASK2_SHIFT) |
+               ((u64)rrd->value2 << RCV_RSM_MATCH_VALUE2_SHIFT);
+
+       write_csr(dd, RCV_RSM_CFG + stride, cfg);
+       write_csr(dd, RCV_RSM_SELECT + stride, select);
+       write_csr(dd, RCV_RSM_MATCH + stride, match);
+}
+
+/*
+ * Return the number of RSM map table entries that will be used for QOS,
+ * or 0 when QOS is not in effect.
+ *
+ * If non-NULL, *mp receives the number of qpn bits and *np the number of
+ * vl bits; both are set to 0 when 0 is returned.
+ */
+static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp,
+                          unsigned int *np)
+{
+       unsigned int qpn_bits = 0, vl_bits = 0;
+       unsigned int vl, nvls;
+       int entries = 0;
+       u8 widest = 0;
+
+       /* QOS needs extra kernel queues, several VLs and several krcvqs */
+       if (dd->n_krcv_queues <= MIN_KERNEL_KCTXTS ||
+           num_vls == 1 ||
+           krcvqsset <= 1)
+               goto out;
+
+       /* qpn bits: enough to cover the largest per-VL queue count */
+       nvls = min_t(unsigned int, num_vls, krcvqsset);
+       for (vl = 0; vl < nvls; vl++)
+               if (krcvqs[vl] > widest)
+                       widest = krcvqs[vl];
+       if (widest > 32)
+               goto out;
+       qpn_bits = ilog2(__roundup_pow_of_two(widest));
+
+       /* vl bits: enough to cover every VL */
+       vl_bits = ilog2(__roundup_pow_of_two(num_vls));
+
+       /* reject if too much is used */
+       if ((qpn_bits + vl_bits) > 7) {
+               qpn_bits = 0;
+               vl_bits = 0;
+               goto out;
+       }
+
+       entries = 1 << (qpn_bits + vl_bits);
+out:
+       if (mp)
+               *mp = qpn_bits;
+       if (np)
+               *np = vl_bits;
+       return entries;
+}
+
+/**
+ * init_qos - init RX qos
+ * @dd - device data
+ * @rmt - RSM map table
+ *
+ * This routine initializes Rule 0 and the RSM map table to implement
+ * quality of service (qos).
+ *
+ * If all of the limit tests succeed, qos is applied based on the array
+ * interpretation of krcvqs where entry 0 is VL0.
+ *
+ * The number of vl bits (n) and the number of qpn bits (m) are computed to
+ * feed both the RSM map table and the single rule.
+ *
+ * If @rmt is NULL, qos is not active, or the map table lacks room, no
+ * rule is added: dd->qos_shift is set to 1 and the QP map table is spread
+ * round robin over FIRST_KERNEL_KCTXT .. dd->n_krcv_queues - 1 instead.
+ */
+static void init_qos(struct hfi1_devdata *dd, struct rsm_map_table *rmt)
+{
+       struct rsm_rule_data rrd;
+       /* n's initializer is never read: qos_rmt_entries() always sets it */
+       unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m;
+       unsigned int rmt_entries;
+       u64 reg;
+
+       if (!rmt)
+               goto bail;
+       rmt_entries = qos_rmt_entries(dd, &m, &n);
+       if (rmt_entries == 0)
+               goto bail;
+       qpns_per_vl = 1 << m;
+
+       /* enough room in the map table? */
+       rmt_entries = 1 << (m + n);
+       if (rmt->used + rmt_entries >= NUM_MAP_ENTRIES)
+               goto bail;
+
+       /* add qos entries to the RSM map table */
+       for (i = 0, ctxt = FIRST_KERNEL_KCTXT; i < num_vls; i++) {
+               unsigned tctxt;
+
+               /* VLs with krcvqs[i] == 0 get no entries (loop is skipped) */
+               for (qpn = 0, tctxt = ctxt;
+                    krcvqs[i] && qpn < qpns_per_vl; qpn++) {
+                       unsigned idx, regoff, regidx;
+
+                       /* generate the index the hardware will produce */
+                       idx = rmt->used + ((qpn << n) ^ i);
+                       /* 8 one-byte entries per u64: bit offset and word */
+                       regoff = (idx % 8) * 8;
+                       regidx = idx / 8;
+                       /* replace default with context number */
+                       reg = rmt->map[regidx];
+                       reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK
+                               << regoff);
+                       reg |= (u64)(tctxt++) << regoff;
+                       rmt->map[regidx] = reg;
+                       /* cycle through this VL's krcvqs[i] contexts */
+                       if (tctxt == ctxt + krcvqs[i])
+                               tctxt = ctxt;
+               }
+               /* the next VL's contexts follow this VL's */
+               ctxt += krcvqs[i];
+       }
+
+       rrd.offset = rmt->used;
+       rrd.pkt_type = 2;
+       rrd.field1_off = LRH_BTH_MATCH_OFFSET;
+       rrd.field2_off = LRH_SC_MATCH_OFFSET;
+       rrd.index1_off = LRH_SC_SELECT_OFFSET;
+       rrd.index1_width = n;
+       rrd.index2_off = QPN_SELECT_OFFSET;
+       rrd.index2_width = m + n;
+       rrd.mask1 = LRH_BTH_MASK;
+       rrd.value1 = LRH_BTH_VALUE;
+       rrd.mask2 = LRH_SC_MASK;
+       rrd.value2 = LRH_SC_VALUE;
+
+       /* add rule 0 */
+       add_rsm_rule(dd, 0, &rrd);
+
+       /* mark RSM map entries as used */
+       rmt->used += rmt_entries;
+       /* map everything else to the mcast/err/vl15 context */
+       init_qpmap_table(dd, HFI1_CTRL_CTXT, HFI1_CTRL_CTXT);
+       dd->qos_shift = n + 1;
+       return;
+bail:
+       /* no qos: spread QPs over all kernel receive contexts */
+       dd->qos_shift = 1;
+       init_qpmap_table(dd, FIRST_KERNEL_KCTXT, dd->n_krcv_queues - 1);
+}
+
+/*
+ * Set up RSM rule 1 and the RSM map table entries used to intercept
+ * Expected FECN packets destined for user contexts.
+ *
+ * @dd - device data
+ * @rmt - RSM map table; may be NULL (allocation failed), in which case
+ *        nothing is set up
+ *
+ * The rule is also skipped, with an error message, when the map table
+ * does not have room for dd->num_user_contexts more entries.
+ */
+static void init_user_fecn_handling(struct hfi1_devdata *dd,
+                                   struct rsm_map_table *rmt)
+{
+       struct rsm_rule_data rrd;
+       u64 reg;
+       int i, idx, regoff, regidx;
+       u8 offset;
+
+       /*
+        * alloc_rsm_map_table() may return NULL.  Without a table there is
+        * nothing to fill in, so leave user FECN handling off rather than
+        * dereference a NULL pointer.
+        */
+       if (!rmt)
+               return;
+
+       /* there needs to be enough room in the map table */
+       if (rmt->used + dd->num_user_contexts >= NUM_MAP_ENTRIES) {
+               dd_dev_err(dd, "User FECN handling disabled - too many user contexts allocated\n");
+               return;
+       }
+
+       /*
+        * RSM will extract the destination context as an index into the
+        * map table.  The destination contexts are a sequential block
+        * in the range first_user_ctxt...num_rcv_contexts-1 (inclusive).
+        * Map entries are accessed as offset + extracted value.  Adjust
+        * the added offset so this sequence can be placed anywhere in
+        * the table - as long as the entries themselves do not wrap.
+        * There are only enough bits in offset for the table size, so
+        * start with that to allow for a "negative" offset.
+        */
+       offset = (u8)(NUM_MAP_ENTRIES + (int)rmt->used -
+                                               (int)dd->first_user_ctxt);
+
+       for (i = dd->first_user_ctxt, idx = rmt->used;
+                               i < dd->num_rcv_contexts; i++, idx++) {
+               /* replace with identity mapping */
+               regoff = (idx % 8) * 8; /* bit offset of the entry's byte */
+               regidx = idx / 8;       /* u64 word holding the entry */
+               reg = rmt->map[regidx];
+               reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK << regoff);
+               reg |= (u64)i << regoff;
+               rmt->map[regidx] = reg;
+       }
+
+       /*
+        * For RSM intercept of Expected FECN packets:
+        * o packet type 0 - expected
+        * o match on F (bit 95), using select/match 1, and
+        * o match on SH (bit 133), using select/match 2.
+        *
+        * Use index 1 to extract the 8-bit receive context from DestQP
+        * (start at bit 64).  Use that as the RSM map table index.
+        */
+       rrd.offset = offset;
+       rrd.pkt_type = 0;
+       rrd.field1_off = 95;
+       rrd.field2_off = 133;
+       rrd.index1_off = 64;
+       rrd.index1_width = 8;
+       rrd.index2_off = 0;
+       rrd.index2_width = 0;
+       rrd.mask1 = 1;
+       rrd.value1 = 1;
+       rrd.mask2 = 1;
+       rrd.value2 = 1;
+
+       /* add rule 1 */
+       add_rsm_rule(dd, 1, &rrd);
+
+       /* mark the identity-mapped entries as used */
+       rmt->used += dd->num_user_contexts;
+}
+
+/*
+ * Set the initial RXE CSRs: unmask all receive errors, then build the
+ * RSM map table (QOS first, then user FECN handling), write it to the
+ * chip and release the temporary table.
+ */
+static void init_rxe(struct hfi1_devdata *dd)
+{
+       struct rsm_map_table *map_table;
+
+       /* enable all receive errors */
+       write_csr(dd, RCV_ERR_MASK, ~0ull);
+
+       map_table = alloc_rsm_map_table(dd);
+
+       /* set up QOS, including the QPN map table */
+       init_qos(dd, map_table);
+       init_user_fecn_handling(dd, map_table);
+
+       /* push the finished table to the chip, then drop our copy */
+       complete_rsm_map_table(dd, map_table);
+       kfree(map_table);
+
+       /*
+        * RcvCtrl.RcvWcb must be <= the PCIe Device Control Register
+        * Max_Payload_Size (PCI_EXP_DEVCTL in Linux PCIe config space,
+        * PciCfgCap2.MaxPayloadSize in HFI).  The only invalid combination
+        * is RcvCtrl.RcvWcb at its max of 256 with Max_Payload_Size at
+        * its minimum of 128.
+        *
+        * Presently, RcvCtrl.RcvWcb is left at its default of 0 (64 bytes).
+        * Max_Payload_Size is possibly modified upward in tune_pcie_caps(),
+        * which is called after this routine.
+        */
+}
+
+/*
+ * Set the initial error-enable CSRs of the CCE, Misc and DC blocks.
+ */
+static void init_other(struct hfi1_devdata *dd)
+{
+       const u64 all_errors = ~0ull;
+
+       /* CCE: enable every error */
+       write_csr(dd, CCE_ERR_MASK, all_errors);
+       /* Misc: enable only the DRIVER_MISC_MASK subset */
+       write_csr(dd, MISC_ERR_MASK, DRIVER_MISC_MASK);
+       /* DC: enable all errors, except LCB */
+       write_csr(dd, DCC_ERR_FLG_EN, all_errors);
+       write_csr(dd, DC_DC8051_ERR_EN, all_errors);
+}
+
+/*
+ * Fill out the given AU table using the given CU.  A CU is defined in
+ * terms of AUs.  The table is an encoding: given the index, how many AUs
+ * does that represent?  Entries 0..7 are 0, 1, 2*cu, 4*cu, ... 64*cu.
+ *
+ * NOTE: Assumes that the register layout is the same for the
+ * local and remote tables; the LOCAL shift definitions are used for both.
+ */
+static void assign_cm_au_table(struct hfi1_devdata *dd, u32 cu,
+                              u32 csr0to3, u32 csr4to7)
+{
+       u64 entries0to3, entries4to7;
+
+       entries0to3 =
+               (0ull << SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT) |
+               (1ull << SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT) |
+               ((2ull * cu) <<
+                SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT) |
+               ((4ull * cu) <<
+                SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT);
+
+       entries4to7 =
+               ((8ull * cu) <<
+                SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT) |
+               ((16ull * cu) <<
+                SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT) |
+               ((32ull * cu) <<
+                SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT) |
+               ((64ull * cu) <<
+                SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT);
+
+       write_csr(dd, csr0to3, entries0to3);
+       write_csr(dd, csr4to7, entries4to7);
+}
+
+/* Program the local AU table CSRs from the given vCU. */
+static void assign_local_cm_au_table(struct hfi1_devdata *dd, u8 vcu)
+{
+       u32 cu = vcu_to_cu(vcu);
+
+       assign_cm_au_table(dd, cu, SEND_CM_LOCAL_AU_TABLE0_TO3,
+                          SEND_CM_LOCAL_AU_TABLE4_TO7);
+}
+
+/* Program the remote AU table CSRs from the given vCU. */
+void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu)
+{
+       u32 cu = vcu_to_cu(vcu);
+
+       assign_cm_au_table(dd, cu, SEND_CM_REMOTE_AU_TABLE0_TO3,
+                          SEND_CM_REMOTE_AU_TABLE4_TO7);
+}
+
+/*
+ * Set the initial TXE CSRs: unmask all send-side errors (global,
+ * per-context and per-SDMA engine), program the local CU to AU table and,
+ * except on the functional simulator, the credit return timer.
+ */
+static void init_txe(struct hfi1_devdata *dd)
+{
+       const u64 all_errors = ~0ull;
+       int ctxt, engine;
+
+       /* enable all PIO, SDMA, general, and Egress errors */
+       write_csr(dd, SEND_PIO_ERR_MASK, all_errors);
+       write_csr(dd, SEND_DMA_ERR_MASK, all_errors);
+       write_csr(dd, SEND_ERR_MASK, all_errors);
+       write_csr(dd, SEND_EGRESS_ERR_MASK, all_errors);
+
+       /* enable all per-context errors */
+       for (ctxt = 0; ctxt < dd->chip_send_contexts; ctxt++)
+               write_kctxt_csr(dd, ctxt, SEND_CTXT_ERR_MASK, all_errors);
+
+       /* enable all per-SDMA engine errors */
+       for (engine = 0; engine < dd->chip_sdma_engines; engine++)
+               write_kctxt_csr(dd, engine, SEND_DMA_ENG_ERR_MASK,
+                               all_errors);
+
+       /* set the local CU to AU mapping */
+       assign_local_cm_au_table(dd, dd->vcu);
+
+       /* the simulator chokes on the credit return timer - skip it there */
+       if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
+               return;
+
+       /* set reasonable default for Credit Return Timer */
+       write_csr(dd, SEND_CM_TIMER_CTRL, HFI1_CREDIT_RETURN_RATE);
+}
+
+/*
+ * Program the job key for a receive context and its associated send
+ * context, and enable job key checking.
+ *
+ * @dd - device data
+ * @ctxt - receive context number
+ * @jkey - job key value
+ *
+ * Return 0 on success, -EINVAL if @ctxt is out of range or the context
+ * has no context data or no send context.
+ */
+int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt, u16 jkey)
+{
+       struct hfi1_ctxtdata *rcd;
+       unsigned sctxt;
+       int ret = 0;
+       u64 reg;
+
+       /* validate before indexing dd->rcd[], as the pkey routines do */
+       if (ctxt < dd->num_rcv_contexts) {
+               rcd = dd->rcd[ctxt];
+       } else {
+               ret = -EINVAL;
+               goto done;
+       }
+       if (!rcd || !rcd->sc) {
+               ret = -EINVAL;
+               goto done;
+       }
+       sctxt = rcd->sc->hw_context;
+       reg = SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK | /* mask is always 1's */
+               ((jkey & SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK) <<
+                SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT);
+       /* JOB_KEY_ALLOW_PERMISSIVE is not allowed by default */
+       if (HFI1_CAP_KGET_MASK(rcd->flags, ALLOW_PERM_JKEY))
+               reg |= SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK;
+       write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_JOB_KEY, reg);
+       /*
+        * Enable send-side J_KEY integrity check, unless this is A0 h/w
+        */
+       if (!is_ax(dd)) {
+               reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
+               reg |= SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
+               write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
+       }
+
+       /* Enable J_KEY check on receive context. */
+       reg = RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK |
+               ((jkey & RCV_KEY_CTRL_JOB_KEY_VALUE_MASK) <<
+                RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT);
+       write_kctxt_csr(dd, ctxt, RCV_KEY_CTRL, reg);
+done:
+       return ret;
+}
+
+/*
+ * Clear the job key of a receive context and its associated send
+ * context, and disable job key checking.
+ *
+ * @dd - device data
+ * @ctxt - receive context number
+ *
+ * Return 0 on success, -EINVAL if @ctxt is out of range or the context
+ * has no context data or no send context.
+ */
+int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt)
+{
+       struct hfi1_ctxtdata *rcd;
+       unsigned sctxt;
+       int ret = 0;
+       u64 reg;
+
+       /* validate before indexing dd->rcd[], as the pkey routines do */
+       if (ctxt < dd->num_rcv_contexts) {
+               rcd = dd->rcd[ctxt];
+       } else {
+               ret = -EINVAL;
+               goto done;
+       }
+       if (!rcd || !rcd->sc) {
+               ret = -EINVAL;
+               goto done;
+       }
+       sctxt = rcd->sc->hw_context;
+       write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_JOB_KEY, 0);
+       /*
+        * Disable send-side J_KEY integrity check, unless this is A0 h/w.
+        * This check would not have been enabled for A0 h/w, see
+        * hfi1_set_ctxt_jkey().
+        */
+       if (!is_ax(dd)) {
+               reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
+               reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
+               write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
+       }
+       /* Turn off the J_KEY on the receive side */
+       write_kctxt_csr(dd, ctxt, RCV_KEY_CTRL, 0);
+done:
+       return ret;
+}
+
+/*
+ * Program the partition key of the send context associated with receive
+ * context @ctxt and enable partition key checking on it.  KDETH packets
+ * are allowed on the send context as part of this.
+ *
+ * Return 0 on success, -EINVAL if @ctxt is out of range or the context
+ * has no context data or no send context.
+ */
+int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey)
+{
+       struct hfi1_ctxtdata *rcd;
+       unsigned hw_ctxt;
+       u64 val;
+
+       if (ctxt >= dd->num_rcv_contexts)
+               return -EINVAL;
+
+       rcd = dd->rcd[ctxt];
+       if (!rcd || !rcd->sc)
+               return -EINVAL;
+
+       hw_ctxt = rcd->sc->hw_context;
+
+       /* load the key value first ... */
+       val = ((u64)pkey & SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK) <<
+               SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT;
+       write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_PARTITION_KEY, val);
+
+       /* ... then turn the check on and stop disallowing KDETH packets */
+       val = read_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE);
+       val |= SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
+       val &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK;
+       write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE, val);
+
+       return 0;
+}
+
+/*
+ * Disable partition key checking on the send context associated with
+ * receive context @ctxt and clear its partition key.
+ *
+ * Return 0 on success, -EINVAL if @ctxt is out of range or the context
+ * has no context data or no send context.
+ */
+int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt)
+{
+       struct hfi1_ctxtdata *rcd;
+       unsigned hw_ctxt;
+       u64 val;
+
+       if (ctxt >= dd->num_rcv_contexts)
+               return -EINVAL;
+
+       rcd = dd->rcd[ctxt];
+       if (!rcd || !rcd->sc)
+               return -EINVAL;
+
+       hw_ctxt = rcd->sc->hw_context;
+
+       /* turn the check off before wiping the key */
+       val = read_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE);
+       val &= ~SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
+       write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE, val);
+       write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_PARTITION_KEY, 0);
+
+       return 0;
+}
+
+/*
+ * Start the clean up of the chip.  Our clean up happens in multiple
+ * stages and this is just the first: it exits ASPM, frees the counters
+ * and the receive error data, tears down interrupts and finishes with
+ * the chip resources, in that order.
+ */
+void hfi1_start_cleanup(struct hfi1_devdata *dd)
+{
+       aspm_exit(dd);
+       free_cntrs(dd);
+       free_rcverr(dd);
+       clean_up_interrupts(dd);
+       finish_chip_resources(dd);
+}
+
+/*
+ * The device's base GUID with the bit at GUID_HFI_INDEX_SHIFT cleared.
+ * init_asic_data() compares this value to find the peer device.
+ */
+#define HFI_BASE_GUID(dev) \
+       ((dev)->base_guid & ~(1ULL << GUID_HFI_INDEX_SHIFT))
+
+/*
+ * Information can be shared between the two HFIs on the same ASIC
+ * in the same OS.  This function finds the peer device and sets
+ * up a shared structure.
+ *
+ * If a peer (same HFI_BASE_GUID, different unit) is already on
+ * hfi1_dev_list, its asic_data is shared; otherwise a new zeroed
+ * structure is installed.  Either way this device registers itself in
+ * asic_data->dds[] under its hfi1_id.
+ *
+ * Return 0 on success, -ENOMEM if the structure could not be allocated.
+ */
+static int init_asic_data(struct hfi1_devdata *dd)
+{
+       unsigned long flags;
+       struct hfi1_devdata *tmp, *peer = NULL;
+       void *fresh;
+
+       /*
+        * Pre-allocate in case we are the first device.  GFP_KERNEL may
+        * sleep, so this must happen before taking hfi1_devs_lock with
+        * interrupts disabled.
+        */
+       fresh = kzalloc(sizeof(*dd->asic_data), GFP_KERNEL);
+       if (!fresh)
+               return -ENOMEM;
+
+       spin_lock_irqsave(&hfi1_devs_lock, flags);
+       /* Find our peer device */
+       list_for_each_entry(tmp, &hfi1_dev_list, list) {
+               if ((HFI_BASE_GUID(dd) == HFI_BASE_GUID(tmp)) &&
+                   dd->unit != tmp->unit) {
+                       peer = tmp;
+                       break;
+               }
+       }
+
+       if (peer) {
+               /* share the peer's structure; ours is released below */
+               dd->asic_data = peer->asic_data;
+       } else {
+               dd->asic_data = fresh;
+               fresh = NULL; /* ownership moved to dd */
+               mutex_init(&dd->asic_data->asic_resource_mutex);
+       }
+       dd->asic_data->dds[dd->hfi1_id] = dd; /* self back-pointer */
+       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+
+       /* drop the unused pre-allocation; kfree(NULL) is a no-op */
+       kfree(fresh);
+       return 0;
+}
+
+/*
+ * Set dd->boardname from the "description" EFI variable.  If that read
+ * fails, fall back to a copy of a generic board description.
+ *
+ * Return 0 on success, -ENOMEM if space could not be allocated.
+ */
+static int obtain_boardname(struct hfi1_devdata *dd)
+{
+       /* generic board description */
+       const char fallback[] =
+               "Intel Omni-Path Host Fabric Interface Adapter 100 Series";
+       unsigned long size;
+
+       if (!read_hfi1_efi_var(dd, "description", &size,
+                              (void **)&dd->boardname))
+               return 0;
+
+       dd_dev_info(dd, "Board description not found\n");
+
+       /* use generic description */
+       dd->boardname = kstrdup(fallback, GFP_KERNEL);
+       return dd->boardname ? 0 : -ENOMEM;
+}
+
+/*
+ * Check the interrupt registers to make sure that they are mapped
+ * correctly.  This is intended to help the user identify any mismapping
+ * by the VMM when the driver is running in a VM.  It should only be
+ * called before interrupts are set up properly.
+ *
+ * The test masks all interrupts, then verifies that the status register
+ * follows writes to the clear and force registers.  The original mask
+ * is restored on both the success and the failure path.
+ *
+ * Return 0 on success, -EINVAL on failure.
+ */
+static int check_int_registers(struct hfi1_devdata *dd)
+{
+       const u64 every_bit = ~(u64)0;
+       u64 saved_mask;
+
+       /* Clear CceIntMask[0] to avoid raising any interrupts */
+       saved_mask = read_csr(dd, CCE_INT_MASK);
+       write_csr(dd, CCE_INT_MASK, 0ull);
+       if (read_csr(dd, CCE_INT_MASK) != 0)
+               goto mismapped;
+
+       /* Clearing everything must leave no status bit set */
+       write_csr(dd, CCE_INT_CLEAR, every_bit);
+       if (read_csr(dd, CCE_INT_STATUS) != 0)
+               goto mismapped;
+
+       /* Forcing everything must set every status bit */
+       write_csr(dd, CCE_INT_FORCE, every_bit);
+       if (read_csr(dd, CCE_INT_STATUS) != every_bit)
+               goto mismapped;
+
+       /* Drop the forced bits and restore the interrupt mask */
+       write_csr(dd, CCE_INT_CLEAR, every_bit);
+       write_csr(dd, CCE_INT_MASK, saved_mask);
+
+       return 0;
+
+mismapped:
+       write_csr(dd, CCE_INT_MASK, saved_mask);
+       dd_dev_err(dd, "Interrupt registers not properly mapped by VMM\n");
+       return -EINVAL;
+}
+
+/**
+ * hfi1_init_dd - Allocate and initialize the device structure for the hfi.
+ * @pdev: the pci_dev for hfi1_ib device
+ * @ent: pci_device_id struct for this dev
+ *
+ * Also allocates, initializes, and returns the devdata struct for this
+ * device instance
+ *
+ * This is global, and is called directly at init to set up the
+ * chip-specific function pointers for later use.
+ *
+ * Return: the devdata pointer on success.  If hfi1_alloc_devdata() fails,
+ * its error pointer is returned unchanged.  On any later failure the
+ * error labels at the end unwind and ERR_PTR(ret) is returned.
+ */
+struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
+                                 const struct pci_device_id *ent)
+{
+       struct hfi1_devdata *dd;
+       struct hfi1_pportdata *ppd;
+       u64 reg;
+       int i, ret;
+       static const char * const inames[] = { /* implementation names */
+               "RTL silicon",
+               "RTL VCS simulation",
+               "RTL FPGA emulation",
+               "Functional simulator"
+       };
+       struct pci_dev *parent = pdev->bus->self;
+
+       dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS *
+                               sizeof(struct hfi1_pportdata));
+       /* nothing to unwind yet: hand the error pointer straight back */
+       if (IS_ERR(dd))
+               goto bail;
+       ppd = dd->pport;
+       for (i = 0; i < dd->num_pports; i++, ppd++) {
+               int vl;
+               /* init common fields */
+               hfi1_init_pportdata(pdev, ppd, dd, 0, 1);
+               /* DC supports 4 link widths */
+               ppd->link_width_supported =
+                       OPA_LINK_WIDTH_1X | OPA_LINK_WIDTH_2X |
+                       OPA_LINK_WIDTH_3X | OPA_LINK_WIDTH_4X;
+               ppd->link_width_downgrade_supported =
+                       ppd->link_width_supported;
+               /* start out enabling only 4X */
+               ppd->link_width_enabled = OPA_LINK_WIDTH_4X;
+               ppd->link_width_downgrade_enabled =
+                                       ppd->link_width_downgrade_supported;
+               /* link width active is 0 when link is down */
+               /* link width downgrade active is 0 when link is down */
+
+               /* an out-of-range num_vls is replaced by the maximum */
+               if (num_vls < HFI1_MIN_VLS_SUPPORTED ||
+                   num_vls > HFI1_MAX_VLS_SUPPORTED) {
+                       hfi1_early_err(&pdev->dev,
+                                      "Invalid num_vls %u, using %u VLs\n",
+                                   num_vls, HFI1_MAX_VLS_SUPPORTED);
+                       num_vls = HFI1_MAX_VLS_SUPPORTED;
+               }
+               ppd->vls_supported = num_vls;
+               ppd->vls_operational = ppd->vls_supported;
+               ppd->actual_vls_operational = ppd->vls_supported;
+               /* Set the default MTU. */
+               for (vl = 0; vl < num_vls; vl++)
+                       dd->vld[vl].mtu = hfi1_max_mtu;
+               dd->vld[15].mtu = MAX_MAD_PACKET;
+               /*
+                * Set the initial values to reasonable default, will be set
+                * for real when link is up.
+                */
+               ppd->lstate = IB_PORT_DOWN;
+               ppd->overrun_threshold = 0x4;
+               ppd->phy_error_threshold = 0xf;
+               ppd->port_crc_mode_enabled = link_crc_mask;
+               /* initialize supported LTP CRC mode */
+               ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8;
+               /* initialize enabled LTP CRC mode */
+               ppd->port_ltp_crc_mode |= cap_to_port_ltp(link_crc_mask) << 4;
+               /* start in offline */
+               ppd->host_link_state = HLS_DN_OFFLINE;
+               init_vl_arb_caches(ppd);
+               ppd->last_pstate = 0xff; /* invalid value */
+       }
+
+       dd->link_default = HLS_DN_POLL;
+
+       /*
+        * Do remaining PCIe setup and save PCIe values in dd.
+        * Any error printing is already done by the init code.
+        * On return, we have the chip mapped.
+        */
+       ret = hfi1_pcie_ddinit(dd, pdev, ent);
+       if (ret < 0)
+               goto bail_free;
+
+       /* verify that reads actually work, save revision for reset check */
+       dd->revision = read_csr(dd, CCE_REVISION);
+       if (dd->revision == ~(u64)0) {
+               dd_dev_err(dd, "cannot read chip CSRs\n");
+               ret = -EINVAL;
+               goto bail_cleanup;
+       }
+       dd->majrev = (dd->revision >> CCE_REVISION_CHIP_REV_MAJOR_SHIFT)
+                       & CCE_REVISION_CHIP_REV_MAJOR_MASK;
+       dd->minrev = (dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT)
+                       & CCE_REVISION_CHIP_REV_MINOR_MASK;
+
+       /*
+        * Check interrupt registers mapping if the driver has no access to
+        * the upstream component. In this case, it is likely that the driver
+        * is running in a VM.
+        */
+       if (!parent) {
+               ret = check_int_registers(dd);
+               if (ret)
+                       goto bail_cleanup;
+       }
+
+       /*
+        * obtain the hardware ID - NOT related to unit, which is a
+        * software enumeration
+        */
+       reg = read_csr(dd, CCE_REVISION2);
+       dd->hfi1_id = (reg >> CCE_REVISION2_HFI_ID_SHIFT)
+                                       & CCE_REVISION2_HFI_ID_MASK;
+       /* the variable size will remove unwanted bits */
+       dd->icode = reg >> CCE_REVISION2_IMPL_CODE_SHIFT;
+       dd->irev = reg >> CCE_REVISION2_IMPL_REVISION_SHIFT;
+       dd_dev_info(dd, "Implementation: %s, revision 0x%x\n",
+                   dd->icode < ARRAY_SIZE(inames) ?
+                   inames[dd->icode] : "unknown", (int)dd->irev);
+
+       /* speeds the hardware can support */
+       dd->pport->link_speed_supported = OPA_LINK_SPEED_25G;
+       /* speeds allowed to run at */
+       dd->pport->link_speed_enabled = dd->pport->link_speed_supported;
+       /* give a reasonable active value, will be set on link up */
+       dd->pport->link_speed_active = OPA_LINK_SPEED_25G;
+
+       /* cache the chip's resource counts and memory sizes from its CSRs */
+       dd->chip_rcv_contexts = read_csr(dd, RCV_CONTEXTS);
+       dd->chip_send_contexts = read_csr(dd, SEND_CONTEXTS);
+       dd->chip_sdma_engines = read_csr(dd, SEND_DMA_ENGINES);
+       dd->chip_pio_mem_size = read_csr(dd, SEND_PIO_MEM_SIZE);
+       dd->chip_sdma_mem_size = read_csr(dd, SEND_DMA_MEM_SIZE);
+       /* fix up link widths for emulation _p */
+       ppd = dd->pport;
+       if (dd->icode == ICODE_FPGA_EMULATION && is_emulator_p(dd)) {
+               ppd->link_width_supported =
+                       ppd->link_width_enabled =
+                       ppd->link_width_downgrade_supported =
+                       ppd->link_width_downgrade_enabled =
+                               OPA_LINK_WIDTH_1X;
+       }
+       /* insure num_vls isn't larger than number of sdma engines */
+       if (HFI1_CAP_IS_KSET(SDMA) && num_vls > dd->chip_sdma_engines) {
+               dd_dev_err(dd, "num_vls %u too large, using %u VLs\n",
+                          num_vls, dd->chip_sdma_engines);
+               num_vls = dd->chip_sdma_engines;
+               ppd->vls_supported = dd->chip_sdma_engines;
+               ppd->vls_operational = ppd->vls_supported;
+       }
+
+       /*
+        * Convert the ns parameter to the 64 * cclocks used in the CSR.
+        * Limit the max if larger than the field holds.  If timeout is
+        * non-zero, then the calculated field will be at least 1.
+        *
+        * Must be after icode is set up - the cclock rate depends
+        * on knowing the hardware being used.
+        */
+       dd->rcv_intr_timeout_csr = ns_to_cclock(dd, rcv_intr_timeout) / 64;
+       if (dd->rcv_intr_timeout_csr >
+                       RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK)
+               dd->rcv_intr_timeout_csr =
+                       RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK;
+       else if (dd->rcv_intr_timeout_csr == 0 && rcv_intr_timeout)
+               dd->rcv_intr_timeout_csr = 1;
+
+       /* needs to be done before we look for the peer device */
+       read_guid(dd);
+
+       /* set up shared ASIC data with peer device */
+       ret = init_asic_data(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       /* obtain chip sizes, reset chip CSRs */
+       init_chip(dd);
+
+       /* read in the PCIe link speed information */
+       ret = pcie_speeds(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       /* Needs to be called before hfi1_firmware_init */
+       get_platform_config(dd);
+
+       /* read in firmware */
+       ret = hfi1_firmware_init(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       /*
+        * In general, the PCIe Gen3 transition must occur after the
+        * chip has been idled (so it won't initiate any PCIe transactions
+        * e.g. an interrupt) and before the driver changes any registers
+        * (the transition will reset the registers).
+        *
+        * In particular, place this call after:
+        * - init_chip()     - the chip will not initiate any PCIe transactions
+        * - pcie_speeds()   - reads the current link speed
+        * - hfi1_firmware_init() - the needed firmware is ready to be
+        *                          downloaded
+        */
+       ret = do_pcie_gen3_transition(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       /* start setting dd values and adjusting CSRs */
+       init_early_variables(dd);
+
+       parse_platform_config(dd);
+
+       ret = obtain_boardname(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       snprintf(dd->boardversion, BOARD_VERS_MAX,
+                "ChipABI %u.%u, ChipRev %u.%u, SW Compat %llu\n",
+                HFI1_CHIP_VERS_MAJ, HFI1_CHIP_VERS_MIN,
+                (u32)dd->majrev,
+                (u32)dd->minrev,
+                (dd->revision >> CCE_REVISION_SW_SHIFT)
+                   & CCE_REVISION_SW_MASK);
+
+       /*
+        * The real cpu mask is part of the affinity struct but has to be
+        * initialized earlier than the rest of the affinity struct because it
+        * is needed to calculate the number of user contexts in
+        * set_up_context_variables(). However, hfi1_dev_affinity_init(),
+        * which initializes the rest of the affinity struct members,
+        * depends on set_up_context_variables() for the number of kernel
+        * contexts, so it cannot be called before set_up_context_variables().
+        */
+       ret = init_real_cpu_mask(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       ret = set_up_context_variables(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       /* set initial RXE CSRs */
+       init_rxe(dd);
+       /* set initial TXE CSRs */
+       init_txe(dd);
+       /* set initial non-RXE, non-TXE CSRs */
+       init_other(dd);
+       /* set up KDETH QP prefix in both RX and TX CSRs */
+       init_kdeth_qp(dd);
+
+       hfi1_dev_affinity_init(dd);
+
+       /* send contexts must be set up before receive contexts */
+       ret = init_send_contexts(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       ret = hfi1_create_ctxts(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       dd->rcvhdrsize = DEFAULT_RCVHDRSIZE;
+       /*
+        * rcd[0] is guaranteed to be valid by this point. Also, all
+        * context are using the same value, as per the module parameter.
+        *
+        * Note the precedence: the division binds tighter than the
+        * subtraction, so this is rcvhdrqentsize minus
+        * (sizeof(u64) / sizeof(u32)).
+        */
+       dd->rhf_offset = dd->rcd[0]->rcvhdrqentsize - sizeof(u64) / sizeof(u32);
+
+       ret = init_pervl_scs(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       /* sdma init */
+       for (i = 0; i < dd->num_pports; ++i) {
+               ret = sdma_init(dd, i);
+               if (ret)
+                       goto bail_cleanup;
+       }
+
+       /* use contexts created by hfi1_create_ctxts */
+       ret = set_up_interrupts(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       /* set up LCB access - must be after set_up_interrupts() */
+       init_lcb_access(dd);
+
+       /* the serial string is built from the low 24 bits of the base GUID */
+       snprintf(dd->serial, SERIAL_MAX, "0x%08llx\n",
+                dd->base_guid & 0xFFFFFF);
+
+       /* the OUI bytes are the top three bytes of the base GUID */
+       dd->oui1 = dd->base_guid >> 56 & 0xFF;
+       dd->oui2 = dd->base_guid >> 48 & 0xFF;
+       dd->oui3 = dd->base_guid >> 40 & 0xFF;
+
+       ret = load_firmware(dd); /* asymmetric with dispose_firmware() */
+       if (ret)
+               goto bail_clear_intr;
+       check_fabric_firmware_versions(dd);
+
+       thermal_init(dd);
+
+       ret = init_cntrs(dd);
+       if (ret)
+               goto bail_clear_intr;
+
+       ret = init_rcverr(dd);
+       if (ret)
+               goto bail_free_cntrs;
+
+       ret = eprom_init(dd);
+       if (ret)
+               goto bail_free_rcverr;
+
+       /* success: skip the unwind labels and return dd */
+       goto bail;
+
+       /* error unwind: each label falls through to the ones below it */
+bail_free_rcverr:
+       free_rcverr(dd);
+bail_free_cntrs:
+       free_cntrs(dd);
+bail_clear_intr:
+       clean_up_interrupts(dd);
+bail_cleanup:
+       hfi1_pcie_ddcleanup(dd);
+bail_free:
+       hfi1_free_devdata(dd);
+       dd = ERR_PTR(ret);
+bail:
+       return dd;
+}
+
+/*
+ * Return the difference between egress_cycles() for a packet of dw_len
+ * dwords at desired_egress_rate and at the port's current egress rate,
+ * truncated to 16 bits.  Returns 0 when the desired rate is -1 or is not
+ * below the current rate.
+ */
+static u16 delay_cycles(struct hfi1_pportdata *ppd, u32 desired_egress_rate,
+                       u32 dw_len)
+{
+       /* rates here are in units of 10^6 bits/sec */
+       u32 port_rate = ppd->current_egress_rate;
+       u32 len = dw_len * 4;
+       u32 extra;
+
+       /*
+        * -1 shouldn't happen; for anything at or above the current rate
+        * there is nothing to do: we can't help go faster, only slower.
+        */
+       if (desired_egress_rate == -1 || desired_egress_rate >= port_rate)
+               return 0;
+
+       extra = egress_cycles(len, desired_egress_rate) -
+               egress_cycles(len, port_rate);
+
+       return (u16)extra;
+}
+
+/**
+ * create_pbc - build a pbc for transmission
+ * @ppd: port data; handed to delay_cycles() when @srate_mbs is non-zero
+ * @flags: special case flags or-ed in built pbc
+ * @srate_mbs: static rate; when 0 no static rate delay is placed in the pbc
+ * @vl: vl
+ * @dw_len: dword length (header words + data words + pbc words)
+ *
+ * Create a PBC with the given flags, rate, VL, and length.
+ *
+ * NOTE: The PBC created will not insert any HCRC - all callers but one are
+ * for verbs, which does not use this PSM feature.  The lone other caller
+ * is for the diagnostic interface which calls this if the user does not
+ * supply their own PBC.
+ *
+ * Return: the assembled 64-bit PBC
+ */
+u64 create_pbc(struct hfi1_pportdata *ppd, u64 flags, int srate_mbs, u32 vl,
+              u32 dw_len)
+{
+       u64 rate_delay = 0;
+       u64 pbc = flags;
+
+       /* a static rate is the uncommon case */
+       if (unlikely(srate_mbs))
+               rate_delay = delay_cycles(ppd, srate_mbs, dw_len);
+
+       /* fold in each field at its position within the PBC */
+       pbc |= rate_delay << PBC_STATIC_RATE_CONTROL_COUNT_SHIFT;
+       pbc |= (u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT;
+       pbc |= (vl & PBC_VL_MASK) << PBC_VL_SHIFT;
+       pbc |= (dw_len & PBC_LENGTH_DWS_MASK) << PBC_LENGTH_DWS_SHIFT;
+
+       return pbc;
+}
+
+/* SBus receiver address passed to sbus_request_slow() by thermal_init() */
+#define SBUS_THERMAL    0x4f
+/* value written by thermal_init() when selecting temperature mode */
+#define SBUS_THERM_MONITOR_MODE 0x1
+
+/*
+ * Report a thermal sensor initialization failure through dd_dev_err().
+ *   dev    - device data handed to dd_dev_err()
+ *   ret    - error code, printed with %d
+ *   reason - short string naming the step that failed, printed with %s
+ *
+ * The expansion uses the dev argument itself rather than reaching for a
+ * variable named dd in the caller's scope.
+ */
+#define THERM_FAILURE(dev, ret, reason) \
+       dd_dev_err((dev),                                               \
+                  "Thermal sensor initialization failed: %s (%d)\n",   \
+                  (reason), (ret))
+
+/*
+ * Initialize the thermal sensor, then enable polling of it through the
+ * SBus interface.
+ *
+ * The HW polling logic uses SBus interrupts, which are not supported
+ * with default firmware, so the SBus Master firmware has to be loaded
+ * for this to work.  Otherwise, no data will be returned through the
+ * ASIC_STS_THERM CSR.
+ *
+ * Does nothing and returns 0 when icode is not ICODE_RTL_SILICON or when
+ * check_chip_resource() returns true for CR_THERM_INIT.  Otherwise
+ * returns 0 on success or the non-zero result of the step that failed.
+ */
+static int thermal_init(struct hfi1_devdata *dd)
+{
+       /* Thermal Sensor Initialization: SBus requests, issued in order */
+       static const struct {
+               u8 data_addr;
+               u8 command;
+               u32 data_in;
+               const char *what;       /* reported if the request fails */
+       } steps[] = {
+               /* Step 1: Reset the Thermal SBus Receiver */
+               { 0x0, RESET_SBUS_RECEIVER, 0, "Bus Reset" },
+               /* Step 2: Set Reset bit in Thermal block */
+               { 0x0, WRITE_SBUS_RECEIVER, 0x1, "Therm Block Reset" },
+               /* Step 3: Write clock divider value (100MHz -> 2MHz) */
+               { 0x1, WRITE_SBUS_RECEIVER, 0x32, "Write Clock Div" },
+               /* Step 4: Select temperature mode */
+               { 0x3, WRITE_SBUS_RECEIVER, SBUS_THERM_MONITOR_MODE,
+                 "Write Mode Sel" },
+               /* Step 5: De-assert block reset and start conversion */
+               { 0x0, WRITE_SBUS_RECEIVER, 0x2, "Write Reset Deassert" },
+       };
+       unsigned int i;
+       int ret;
+
+       if (dd->icode != ICODE_RTL_SILICON ||
+           check_chip_resource(dd, CR_THERM_INIT, NULL))
+               return 0;
+
+       ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT);
+       if (ret) {
+               /* CR_SBUS was not acquired, so there is nothing to release */
+               THERM_FAILURE(dd, ret, "Acquire SBus");
+               return ret;
+       }
+
+       dd_dev_info(dd, "Initializing thermal sensor\n");
+       /* Disable polling of thermal readings */
+       write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x0);
+       msleep(100);
+
+       for (i = 0; i < sizeof(steps) / sizeof(steps[0]); i++) {
+               ret = sbus_request_slow(dd, SBUS_THERMAL, steps[i].data_addr,
+                                       steps[i].command, steps[i].data_in);
+               if (ret) {
+                       THERM_FAILURE(dd, ret, steps[i].what);
+                       goto done;
+               }
+       }
+
+       /* Step 5.1: Wait for first conversion (21.5ms per spec) */
+       msleep(22);
+
+       /* Enable polling of thermal readings */
+       write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1);
+
+       /* Set initialized flag */
+       ret = acquire_chip_resource(dd, CR_THERM_INIT, 0);
+       if (ret)
+               THERM_FAILURE(dd, ret, "Unable to set thermal init flag");
+
+done:
+       release_chip_resource(dd, CR_SBUS);
+       return ret;
+}
+
+/*
+ * Thermal Critical Interrupt
+ *
+ * Put the device into forced freeze mode, take link down to offline,
+ * and put DC into reset.  This is not a graceful shutdown but rather an
+ * attempt to save the chip, so DC is shut down as much and as quickly
+ * as possible.
+ */
+static void handle_temp_err(struct hfi1_devdata *dd)
+{
+       struct hfi1_pportdata *ppd = dd->pport;
+
+       dd_dev_emerg(dd,
+                    "Critical temperature reached! Forcing device into freeze mode!\n");
+       dd->flags |= HFI1_FORCED_FREEZE;
+       start_freeze_handling(ppd, FREEZE_ABORT | FREEZE_SELF);
+
+       /*
+        * Step 1: Take the link down to OFFLINE. This will cause the
+        *         8051 to put the Serdes in reset. We do not go through
+        *         the entire link state machine since we want to shutdown
+        *         ASAP.
+        *         Code below is almost the same as quiet_serdes() but avoids
+        *         all the extra work and the sleeps.
+        */
+       ppd->driver_link_ready = 0;
+       ppd->link_enabled = 0;
+       set_physical_link_state(dd, (OPA_LINKDOWN_REASON_SMA_DISABLED << 8) |
+                               PLS_OFFLINE);
+
+       /*
+        * Step 2: Shutdown LCB and 8051
+        *         After shutdown, do not restore DC_CFG_RESET value.
+        */
+       dc_shutdown(dd);
+}
diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h
new file mode 100644 (file)
index 0000000..66a3279
--- /dev/null
@@ -0,0 +1,1374 @@
+#ifndef _CHIP_H
+#define _CHIP_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * This file contains all of the defines that are specific to the HFI chip
+ */
+
+/* sizes */
+#define CCE_NUM_MSIX_VECTORS 256
+#define CCE_NUM_INT_CSRS 12
+#define CCE_NUM_INT_MAP_CSRS 96
+#define NUM_INTERRUPT_SOURCES 768
+#define RXE_NUM_CONTEXTS 160
+#define RXE_PER_CONTEXT_SIZE 0x1000    /* 4k */
+#define RXE_NUM_TID_FLOWS 32
+#define RXE_NUM_DATA_VL 8
+#define TXE_NUM_CONTEXTS 160
+#define TXE_NUM_SDMA_ENGINES 16
+#define NUM_CONTEXTS_PER_SET 8
+#define VL_ARB_HIGH_PRIO_TABLE_SIZE 16
+#define VL_ARB_LOW_PRIO_TABLE_SIZE 16
+#define VL_ARB_TABLE_SIZE 16
+#define TXE_NUM_32_BIT_COUNTER 7
+#define TXE_NUM_64_BIT_COUNTER 30
+#define TXE_NUM_DATA_VL 8
+#define TXE_PIO_SIZE (32 * 0x100000)   /* 32 MB */
+#define PIO_BLOCK_SIZE 64                      /* bytes */
+#define SDMA_BLOCK_SIZE 64                     /* bytes */
+#define RCV_BUF_BLOCK_SIZE 64               /* bytes */
+#define PIO_CMASK 0x7ff        /* counter mask for free and fill counters */
+#define MAX_EAGER_ENTRIES    2048      /* max receive eager entries */
+#define MAX_TID_PAIR_ENTRIES 1024      /* max receive expected pairs */
+/*
+ * Virtual? Allocation Unit, defined as AU = 8*2^vAU bytes.  With vAU = 3
+ * this gives 64 bytes; AU is fixed at 64 bytes for all generation one
+ * devices.
+ */
+#define CM_VAU 3
+/* HFI link credit count, AKA receive buffer depth (RBUF_DEPTH) */
+#define CM_GLOBAL_CREDITS 0x940
+/* Number of PKey entries in the HW */
+#define MAX_PKEY_VALUES 16
+
+#include "chip_registers.h"
+
+#define RXE_PER_CONTEXT_USER   (RXE + RXE_PER_CONTEXT_OFFSET)
+#define TXE_PIO_SEND (TXE + TXE_PIO_SEND_OFFSET)
+
+/* PBC flags */
+#define PBC_INTR               BIT_ULL(31)
+#define PBC_DC_INFO_SHIFT      (30)
+#define PBC_DC_INFO            BIT_ULL(PBC_DC_INFO_SHIFT)
+#define PBC_TEST_EBP           BIT_ULL(29)
+#define PBC_PACKET_BYPASS      BIT_ULL(28)
+#define PBC_CREDIT_RETURN      BIT_ULL(25)
+#define PBC_INSERT_BYPASS_ICRC BIT_ULL(24)
+#define PBC_TEST_BAD_ICRC      BIT_ULL(23)
+#define PBC_FECN               BIT_ULL(22)
+
+/* PbcInsertHcrc field settings */
+#define PBC_IHCRC_LKDETH 0x0   /* insert @ local KDETH offset */
+#define PBC_IHCRC_GKDETH 0x1   /* insert @ global KDETH offset */
+#define PBC_IHCRC_NONE   0x2   /* no HCRC inserted */
+
+/* PBC fields */
+#define PBC_STATIC_RATE_CONTROL_COUNT_SHIFT 32
+#define PBC_STATIC_RATE_CONTROL_COUNT_MASK 0xffffull
+#define PBC_STATIC_RATE_CONTROL_COUNT_SMASK \
+       (PBC_STATIC_RATE_CONTROL_COUNT_MASK << \
+       PBC_STATIC_RATE_CONTROL_COUNT_SHIFT)
+
+#define PBC_INSERT_HCRC_SHIFT 26
+#define PBC_INSERT_HCRC_MASK 0x3ull
+#define PBC_INSERT_HCRC_SMASK \
+       (PBC_INSERT_HCRC_MASK << PBC_INSERT_HCRC_SHIFT)
+
+#define PBC_VL_SHIFT 12
+#define PBC_VL_MASK 0xfull
+#define PBC_VL_SMASK (PBC_VL_MASK << PBC_VL_SHIFT)
+
+#define PBC_LENGTH_DWS_SHIFT 0
+#define PBC_LENGTH_DWS_MASK 0xfffull
+#define PBC_LENGTH_DWS_SMASK \
+       (PBC_LENGTH_DWS_MASK << PBC_LENGTH_DWS_SHIFT)
+
+/* Credit Return Fields */
+#define CR_COUNTER_SHIFT 0
+#define CR_COUNTER_MASK 0x7ffull
+#define CR_COUNTER_SMASK (CR_COUNTER_MASK << CR_COUNTER_SHIFT)
+
+#define CR_STATUS_SHIFT 11
+#define CR_STATUS_MASK 0x1ull
+#define CR_STATUS_SMASK (CR_STATUS_MASK << CR_STATUS_SHIFT)
+
+#define CR_CREDIT_RETURN_DUE_TO_PBC_SHIFT 12
+#define CR_CREDIT_RETURN_DUE_TO_PBC_MASK 0x1ull
+#define CR_CREDIT_RETURN_DUE_TO_PBC_SMASK \
+       (CR_CREDIT_RETURN_DUE_TO_PBC_MASK << \
+       CR_CREDIT_RETURN_DUE_TO_PBC_SHIFT)
+
+#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SHIFT 13
+#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_MASK 0x1ull
+#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SMASK \
+       (CR_CREDIT_RETURN_DUE_TO_THRESHOLD_MASK << \
+       CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SHIFT)
+
+#define CR_CREDIT_RETURN_DUE_TO_ERR_SHIFT 14
+#define CR_CREDIT_RETURN_DUE_TO_ERR_MASK 0x1ull
+#define CR_CREDIT_RETURN_DUE_TO_ERR_SMASK \
+       (CR_CREDIT_RETURN_DUE_TO_ERR_MASK << \
+       CR_CREDIT_RETURN_DUE_TO_ERR_SHIFT)
+
+#define CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT 15
+#define CR_CREDIT_RETURN_DUE_TO_FORCE_MASK 0x1ull
+#define CR_CREDIT_RETURN_DUE_TO_FORCE_SMASK \
+       (CR_CREDIT_RETURN_DUE_TO_FORCE_MASK << \
+       CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT)
+
+/* interrupt source numbers */
+#define IS_GENERAL_ERR_START     0
+#define IS_SDMAENG_ERR_START    16
+#define IS_SENDCTXT_ERR_START   32
+#define IS_SDMA_START          192 /* includes SDmaProgress,SDmaIdle */
+#define IS_VARIOUS_START               240
+#define IS_DC_START                    248
+#define IS_RCVAVAIL_START              256
+#define IS_RCVURGENT_START             416
+#define IS_SENDCREDIT_START            576
+#define IS_RESERVED_START              736
+#define IS_MAX_SOURCES         768
+
+/* derived interrupt source values */
+#define IS_GENERAL_ERR_END             IS_SDMAENG_ERR_START
+#define IS_SDMAENG_ERR_END             IS_SENDCTXT_ERR_START
+#define IS_SENDCTXT_ERR_END            IS_SDMA_START
+#define IS_SDMA_END                    IS_VARIOUS_START
+#define IS_VARIOUS_END         IS_DC_START
+#define IS_DC_END                      IS_RCVAVAIL_START
+#define IS_RCVAVAIL_END                IS_RCVURGENT_START
+#define IS_RCVURGENT_END               IS_SENDCREDIT_START
+#define IS_SENDCREDIT_END              IS_RESERVED_START
+#define IS_RESERVED_END                IS_MAX_SOURCES
+
+/* absolute interrupt numbers for QSFP1Int and QSFP2Int */
+#define QSFP1_INT              242
+#define QSFP2_INT              243
+
+/* DCC_CFG_PORT_CONFIG logical link states */
+#define LSTATE_DOWN    0x1
+#define LSTATE_INIT    0x2
+#define LSTATE_ARMED   0x3
+#define LSTATE_ACTIVE  0x4
+
+/* DC8051_STS_CUR_STATE port values (physical link states) */
+#define PLS_DISABLED                      0x30
+#define PLS_OFFLINE                               0x90
+#define PLS_OFFLINE_QUIET                         0x90
+#define PLS_OFFLINE_PLANNED_DOWN_INFORM           0x91
+#define PLS_OFFLINE_READY_TO_QUIET_LT     0x92
+#define PLS_OFFLINE_REPORT_FAILURE                0x93
+#define PLS_OFFLINE_READY_TO_QUIET_BCC    0x94
+#define PLS_POLLING                               0x20
+#define PLS_POLLING_QUIET                         0x20
+#define PLS_POLLING_ACTIVE                        0x21
+#define PLS_CONFIGPHY                     0x40
+#define PLS_CONFIGPHY_DEBOUCE             0x40
+#define PLS_CONFIGPHY_ESTCOMM             0x41
+#define PLS_CONFIGPHY_ESTCOMM_TXRX_HUNT           0x42
+#define PLS_CONFIGPHY_ESTCOMM_LOCAL_COMPLETE   0x43
+#define PLS_CONFIGPHY_OPTEQ                       0x44
+#define PLS_CONFIGPHY_OPTEQ_OPTIMIZING    0x44
+#define PLS_CONFIGPHY_OPTEQ_LOCAL_COMPLETE        0x45
+#define PLS_CONFIGPHY_VERIFYCAP                   0x46
+#define PLS_CONFIGPHY_VERIFYCAP_EXCHANGE          0x46
+#define PLS_CONFIGPHY_VERIFYCAP_LOCAL_COMPLETE 0x47
+#define PLS_CONFIGLT                      0x48
+#define PLS_CONFIGLT_CONFIGURE            0x48
+#define PLS_CONFIGLT_LINK_TRANSFER_ACTIVE         0x49
+#define PLS_LINKUP                                0x50
+#define PLS_PHYTEST                               0xB0
+#define PLS_INTERNAL_SERDES_LOOPBACK      0xe1
+#define PLS_QUICK_LINKUP                          0xe2
+
+/* DC_DC8051_CFG_HOST_CMD_0.REQ_TYPE - 8051 host commands */
+#define HCMD_LOAD_CONFIG_DATA  0x01
+#define HCMD_READ_CONFIG_DATA  0x02
+#define HCMD_CHANGE_PHY_STATE  0x03
+#define HCMD_SEND_LCB_IDLE_MSG 0x04
+#define HCMD_MISC                 0x05
+#define HCMD_READ_LCB_IDLE_MSG 0x06
+#define HCMD_READ_LCB_CSR      0x07
+#define HCMD_WRITE_LCB_CSR     0x08
+#define HCMD_INTERFACE_TEST       0xff
+
+/* DC_DC8051_CFG_HOST_CMD_1.RETURN_CODE - 8051 host command return */
+#define HCMD_SUCCESS 2
+
+/* DC_DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR - error flags */
+#define SPICO_ROM_FAILED               BIT(0)
+#define UNKNOWN_FRAME                  BIT(1)
+#define TARGET_BER_NOT_MET             BIT(2)
+#define FAILED_SERDES_INTERNAL_LOOPBACK        BIT(3)
+#define FAILED_SERDES_INIT             BIT(4)
+#define FAILED_LNI_POLLING             BIT(5)
+#define FAILED_LNI_DEBOUNCE            BIT(6)
+#define FAILED_LNI_ESTBCOMM            BIT(7)
+#define FAILED_LNI_OPTEQ               BIT(8)
+#define FAILED_LNI_VERIFY_CAP1         BIT(9)
+#define FAILED_LNI_VERIFY_CAP2         BIT(10)
+#define FAILED_LNI_CONFIGLT            BIT(11)
+#define HOST_HANDSHAKE_TIMEOUT         BIT(12)
+
+#define FAILED_LNI (FAILED_LNI_POLLING | FAILED_LNI_DEBOUNCE \
+                       | FAILED_LNI_ESTBCOMM | FAILED_LNI_OPTEQ \
+                       | FAILED_LNI_VERIFY_CAP1 \
+                       | FAILED_LNI_VERIFY_CAP2 \
+                       | FAILED_LNI_CONFIGLT | HOST_HANDSHAKE_TIMEOUT)
+
+/* DC_DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG - host message flags */
+#define HOST_REQ_DONE          BIT(0)
+#define BC_PWR_MGM_MSG         BIT(1)
+#define BC_SMA_MSG             BIT(2)
+#define BC_BCC_UNKNOWN_MSG     BIT(3)
+#define BC_IDLE_UNKNOWN_MSG    BIT(4)
+#define EXT_DEVICE_CFG_REQ     BIT(5)
+#define VERIFY_CAP_FRAME       BIT(6)
+#define LINKUP_ACHIEVED                BIT(7)
+#define LINK_GOING_DOWN                BIT(8)
+#define LINK_WIDTH_DOWNGRADED  BIT(9)
+
+/* DC_DC8051_CFG_EXT_DEV_1.REQ_TYPE - 8051 host requests */
+#define HREQ_LOAD_CONFIG       0x01
+#define HREQ_SAVE_CONFIG       0x02
+#define HREQ_READ_CONFIG       0x03
+#define HREQ_SET_TX_EQ_ABS     0x04
+#define HREQ_SET_TX_EQ_REL     0x05
+#define HREQ_ENABLE            0x06
+#define HREQ_CONFIG_DONE       0xfe
+#define HREQ_INTERFACE_TEST    0xff
+
+/* DC_DC8051_CFG_EXT_DEV_0.RETURN_CODE - 8051 host request return codes */
+#define HREQ_INVALID           0x01
+#define HREQ_SUCCESS           0x02
+#define HREQ_NOT_SUPPORTED             0x03
+#define HREQ_FEATURE_NOT_SUPPORTED     0x04 /* request specific feature */
+#define HREQ_REQUEST_REJECTED  0xfe
+#define HREQ_EXECUTION_ONGOING 0xff
+
+/* MISC host command functions */
+#define HCMD_MISC_REQUEST_LCB_ACCESS 0x1
+#define HCMD_MISC_GRANT_LCB_ACCESS   0x2
+
+/* idle flit message types */
+#define IDLE_PHYSICAL_LINK_MGMT 0x1
+#define IDLE_CRU                   0x2
+#define IDLE_SMA                   0x3
+#define IDLE_POWER_MGMT            0x4
+
+/* idle flit message send fields (both send and read) */
+#define IDLE_PAYLOAD_MASK 0xffffffffffull /* 40 bits */
+#define IDLE_PAYLOAD_SHIFT 8
+#define IDLE_MSG_TYPE_MASK 0xf
+#define IDLE_MSG_TYPE_SHIFT 0
+
+/* idle flit message read fields */
+#define READ_IDLE_MSG_TYPE_MASK 0xf
+#define READ_IDLE_MSG_TYPE_SHIFT 0
+
+/* SMA idle flit payload commands */
+#define SMA_IDLE_ARM   1
+#define SMA_IDLE_ACTIVE 2
+
+/* DC_DC8051_CFG_MODE.GENERAL bits */
+#define DISABLE_SELF_GUID_CHECK 0x2
+
+/*
+ * Eager buffer minimum and maximum sizes supported by the hardware.
+ * All power-of-two sizes in between are supported as well.
+ * MAX_EAGER_BUFFER_TOTAL is the maximum size of memory
+ * allocatable for Eager buffer to a single context. All others
+ * are limits for the RcvArray entries.
+ */
+#define MIN_EAGER_BUFFER       (4 * 1024)
+#define MAX_EAGER_BUFFER       (256 * 1024)
+#define MAX_EAGER_BUFFER_TOTAL (64 * (1 << 20)) /* max per ctxt 64MB */
+#define MAX_EXPECTED_BUFFER    (2048 * 1024)
+
+/*
+ * Receive expected base and count and eager base and count increment -
+ * the CSR fields hold multiples of this value.
+ */
+#define RCV_SHIFT 3
+#define RCV_INCREMENT BIT(RCV_SHIFT)
+
+/*
+ * Receive header queue entry increment - the CSR holds multiples of
+ * this value.
+ */
+#define HDRQ_SIZE_SHIFT 5
+#define HDRQ_INCREMENT BIT(HDRQ_SIZE_SHIFT)
+
+/*
+ * Freeze handling flags
+ */
+#define FREEZE_ABORT     0x01  /* do not do recovery */
+#define FREEZE_SELF         0x02       /* initiate the freeze */
+#define FREEZE_LINK_DOWN 0x04  /* link is down */
+
+/*
+ * Chip implementation codes.
+ */
+#define ICODE_RTL_SILICON              0x00
+#define ICODE_RTL_VCS_SIMULATION       0x01
+#define ICODE_FPGA_EMULATION   0x02
+#define ICODE_FUNCTIONAL_SIMULATOR     0x03
+
+/*
+ * 8051 data memory size.
+ */
+#define DC8051_DATA_MEM_SIZE 0x1000
+
+/*
+ * 8051 firmware registers
+ */
+#define NUM_GENERAL_FIELDS 0x17
+#define NUM_LANE_FIELDS    0x8
+
+/* 8051 general register Field IDs */
+#define LINK_OPTIMIZATION_SETTINGS   0x00
+#define LINK_TUNING_PARAMETERS      0x02
+#define DC_HOST_COMM_SETTINGS       0x03
+#define TX_SETTINGS                 0x06
+#define VERIFY_CAP_LOCAL_PHY        0x07
+#define VERIFY_CAP_LOCAL_FABRIC             0x08
+#define VERIFY_CAP_LOCAL_LINK_WIDTH  0x09
+#define LOCAL_DEVICE_ID                     0x0a
+#define LOCAL_LNI_INFO              0x0c
+#define REMOTE_LNI_INFO              0x0d
+#define MISC_STATUS                 0x0e
+#define VERIFY_CAP_REMOTE_PHY       0x0f
+#define VERIFY_CAP_REMOTE_FABRIC     0x10
+#define VERIFY_CAP_REMOTE_LINK_WIDTH 0x11
+#define LAST_LOCAL_STATE_COMPLETE    0x12
+#define LAST_REMOTE_STATE_COMPLETE   0x13
+#define LINK_QUALITY_INFO            0x14
+#define REMOTE_DEVICE_ID            0x15
+#define LINK_DOWN_REASON            0x16
+
+/* 8051 lane specific register field IDs */
+#define TX_EQ_SETTINGS         0x00
+#define CHANNEL_LOSS_SETTINGS  0x05
+
+/* Lane ID for general configuration registers */
+#define GENERAL_CONFIG 4
+
+/* LINK_TUNING_PARAMETERS fields */
+#define TUNING_METHOD_SHIFT 24
+
+/* LINK_OPTIMIZATION_SETTINGS fields */
+#define ENABLE_EXT_DEV_CONFIG_SHIFT 24
+
+/* LOAD_DATA 8051 command shifts and fields */
+#define LOAD_DATA_FIELD_ID_SHIFT 40
+#define LOAD_DATA_FIELD_ID_MASK 0xfull
+#define LOAD_DATA_LANE_ID_SHIFT 32
+#define LOAD_DATA_LANE_ID_MASK 0xfull
+#define LOAD_DATA_DATA_SHIFT   0x0
+#define LOAD_DATA_DATA_MASK   0xffffffffull
+
+/* READ_DATA 8051 command shifts and fields */
+#define READ_DATA_FIELD_ID_SHIFT 40
+#define READ_DATA_FIELD_ID_MASK 0xffull
+#define READ_DATA_LANE_ID_SHIFT 32
+#define READ_DATA_LANE_ID_MASK 0xffull
+#define READ_DATA_DATA_SHIFT   0x0
+#define READ_DATA_DATA_MASK   0xffffffffull
+
+/* TX settings fields */
+#define ENABLE_LANE_TX_SHIFT           0
+#define ENABLE_LANE_TX_MASK            0xff
+#define TX_POLARITY_INVERSION_SHIFT    8
+#define TX_POLARITY_INVERSION_MASK     0xff
+#define RX_POLARITY_INVERSION_SHIFT    16
+#define RX_POLARITY_INVERSION_MASK     0xff
+#define MAX_RATE_SHIFT                 24
+#define MAX_RATE_MASK                  0xff
+
+/* verify capability PHY fields */
+#define CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT 0x4
+#define CONTINIOUS_REMOTE_UPDATE_SUPPORT_MASK  0x1
+#define POWER_MANAGEMENT_SHIFT                 0x0
+#define POWER_MANAGEMENT_MASK                  0xf
+
+/* 8051 lane register Field IDs */
+#define SPICO_FW_VERSION 0x7   /* SPICO firmware version */
+
+/* SPICO firmware version fields */
+#define SPICO_ROM_VERSION_SHIFT 0
+#define SPICO_ROM_VERSION_MASK 0xffff
+#define SPICO_ROM_PROD_ID_SHIFT 16
+#define SPICO_ROM_PROD_ID_MASK 0xffff
+
+/* verify capability fabric fields */
+#define VAU_SHIFT      0
+#define VAU_MASK       0x0007
+#define Z_SHIFT                3
+#define Z_MASK         0x0001
+#define VCU_SHIFT      4
+#define VCU_MASK       0x0007
+#define VL15BUF_SHIFT  8
+#define VL15BUF_MASK   0x0fff
+#define CRC_SIZES_SHIFT 20
+#define CRC_SIZES_MASK 0x7
+
+/* verify capability local link width fields */
+#define LINK_WIDTH_SHIFT 0             /* also for remote link width */
+#define LINK_WIDTH_MASK 0xffff         /* also for remote link width */
+#define LOCAL_FLAG_BITS_SHIFT 16
+#define LOCAL_FLAG_BITS_MASK 0xff
+#define MISC_CONFIG_BITS_SHIFT 24
+#define MISC_CONFIG_BITS_MASK 0xff
+
+/* verify capability remote link width fields */
+#define REMOTE_TX_RATE_SHIFT 16
+#define REMOTE_TX_RATE_MASK 0xff
+
+/* LOCAL_DEVICE_ID fields */
+#define LOCAL_DEVICE_REV_SHIFT 0
+#define LOCAL_DEVICE_REV_MASK 0xff
+#define LOCAL_DEVICE_ID_SHIFT 8
+#define LOCAL_DEVICE_ID_MASK 0xffff
+
+/* REMOTE_DEVICE_ID fields */
+#define REMOTE_DEVICE_REV_SHIFT 0
+#define REMOTE_DEVICE_REV_MASK 0xff
+#define REMOTE_DEVICE_ID_SHIFT 8
+#define REMOTE_DEVICE_ID_MASK 0xffff
+
+/* local LNI link width fields */
+#define ENABLE_LANE_RX_SHIFT 16
+#define ENABLE_LANE_RX_MASK  0xff
+
+/* mask, shift for reading 'mgmt_enabled' value from REMOTE_LNI_INFO field */
+#define MGMT_ALLOWED_SHIFT 23
+#define MGMT_ALLOWED_MASK 0x1
+
+/* mask, shift for 'link_quality' within LINK_QUALITY_INFO field */
+#define LINK_QUALITY_SHIFT 24
+#define LINK_QUALITY_MASK  0x7
+
+/*
+ * mask, shift for reading 'planned_down_remote_reason_code'
+ * from LINK_QUALITY_INFO field
+ */
+#define DOWN_REMOTE_REASON_SHIFT 16
+#define DOWN_REMOTE_REASON_MASK  0xff
+
+/* verify capability PHY power management bits */
+#define PWRM_BER_CONTROL       0x1
+#define PWRM_BANDWIDTH_CONTROL 0x2
+
+/* 8051 link down reasons */
+#define LDR_LINK_TRANSFER_ACTIVE_LOW   0xa
+#define LDR_RECEIVED_LINKDOWN_IDLE_MSG 0xb
+#define LDR_RECEIVED_HOST_OFFLINE_REQ  0xc
+
+/* verify capability fabric CRC size bits */
+enum {
+       CAP_CRC_14B = (1 << 0), /* 14b CRC */
+       CAP_CRC_48B = (1 << 1), /* 48b CRC */
+       CAP_CRC_12B_16B_PER_LANE = (1 << 2) /* 12b-16b per lane CRC */
+};
+
+#define SUPPORTED_CRCS (CAP_CRC_14B | CAP_CRC_48B)
+
+/* misc status version fields */
+#define STS_FM_VERSION_A_SHIFT 16
+#define STS_FM_VERSION_A_MASK  0xff
+#define STS_FM_VERSION_B_SHIFT 24
+#define STS_FM_VERSION_B_MASK  0xff
+
+/* LCB_CFG_CRC_MODE TX_VAL and RX_VAL CRC mode values */
+#define LCB_CRC_16B                    0x0     /* 16b CRC */
+#define LCB_CRC_14B                    0x1     /* 14b CRC */
+#define LCB_CRC_48B                    0x2     /* 48b CRC */
+#define LCB_CRC_12B_16B_PER_LANE       0x3     /* 12b-16b per lane CRC */
+
+/*
+ * the following enum is (almost) a copy/paste of the definition
+ * in the OPA spec, section 20.2.2.6.8 (PortInfo)
+ */
+enum {
+       PORT_LTP_CRC_MODE_NONE = 0,
+       PORT_LTP_CRC_MODE_14 = 1, /* 14-bit LTP CRC mode (optional) */
+       PORT_LTP_CRC_MODE_16 = 2, /* 16-bit LTP CRC mode */
+       PORT_LTP_CRC_MODE_48 = 4,
+               /* 48-bit overlapping LTP CRC mode (optional) */
+       PORT_LTP_CRC_MODE_PER_LANE = 8
+               /* 12 to 16 bit per lane LTP CRC mode (optional) */
+};
+
+/* timeouts */
+#define LINK_RESTART_DELAY 1000                /* link restart delay, in ms */
+#define TIMEOUT_8051_START 5000         /* 8051 start timeout, in ms */
+#define DC8051_COMMAND_TIMEOUT 20000   /* DC8051 command timeout, in ms */
+#define FREEZE_STATUS_TIMEOUT 20       /* wait for freeze indicators, in ms */
+#define VL_STATUS_CLEAR_TIMEOUT 5000   /* per-VL status clear, in ms */
+#define CCE_STATUS_TIMEOUT 10          /* time to clear CCE Status, in ms */
+
+/* cclock tick time, in picoseconds per tick: 1/speed * 10^12  */
+#define ASIC_CCLOCK_PS  1242   /* 805 MHz */
+#define FPGA_CCLOCK_PS 30300   /*  33 MHz */
+
+/*
+ * Mask of enabled MISC errors.  Do not enable the two RSA engine errors -
+ * see firmware.c:run_rsa() for details.
+ */
+#define DRIVER_MISC_MASK \
+       (~(MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK \
+               | MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK))
+
+/* valid values for the loopback module parameter */
+#define LOOPBACK_NONE  0       /* no loopback - default */
+#define LOOPBACK_SERDES 1
+#define LOOPBACK_LCB   2
+#define LOOPBACK_CABLE 3       /* external cable */
+
+/* read and write hardware registers */
+u64 read_csr(const struct hfi1_devdata *dd, u32 offset);
+void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value);
+
+/*
+ * The *_kctxt_* flavors of the CSR read/write functions are for
+ * per-context or per-SDMA CSRs that are not mappable to user-space.
+ * Their spacing is not a PAGE_SIZE multiple.
+ *
+ * Read the CSR located at offset0 plus 0x100 times ctxt.
+ */
+static inline u64 read_kctxt_csr(const struct hfi1_devdata *dd, int ctxt,
+                                u32 offset0)
+{
+       /* kernel per-context CSRs are separated by 0x100 */
+       u32 offset = offset0 + (0x100 * ctxt);
+
+       return read_csr(dd, offset);
+}
+
+/* Write value to the CSR located at offset0 plus 0x100 times ctxt. */
+static inline void write_kctxt_csr(struct hfi1_devdata *dd, int ctxt,
+                                  u32 offset0, u64 value)
+{
+       /* kernel per-context CSRs are separated by 0x100 */
+       u32 offset = offset0 + (0x100 * ctxt);
+
+       write_csr(dd, offset, value);
+}
+
+int read_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 *data);
+int write_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 data);
+
+void __iomem *get_csr_addr(
+       struct hfi1_devdata *dd,
+       u32 offset);
+
+/*
+ * Return get_csr_addr() for offset0 plus 0x100 times ctxt, the same
+ * per-context stride used by read_kctxt_csr() and write_kctxt_csr().
+ */
+static inline void __iomem *get_kctxt_csr_addr(
+       struct hfi1_devdata *dd,
+       int ctxt,
+       u32 offset0)
+{
+       u32 offset = offset0 + (0x100 * ctxt);
+
+       return get_csr_addr(dd, offset);
+}
+
+/*
+ * The *_uctxt_* flavors of the CSR read/write functions are for
+ * per-context CSRs that are mappable to user space. All these CSRs
+ * are spaced by a PAGE_SIZE multiple in order to be mappable to
+ * different processes without exposing other contexts' CSRs.
+ *
+ * Read the CSR located at offset0 plus 0x1000 times ctxt.
+ */
+static inline u64 read_uctxt_csr(const struct hfi1_devdata *dd, int ctxt,
+                                u32 offset0)
+{
+       /* user per-context CSRs are separated by 0x1000 */
+       u32 offset = offset0 + (0x1000 * ctxt);
+
+       return read_csr(dd, offset);
+}
+
+/* Write value to the CSR located at offset0 plus 0x1000 times ctxt. */
+static inline void write_uctxt_csr(struct hfi1_devdata *dd, int ctxt,
+                                  u32 offset0, u64 value)
+{
+       /* user per-context CSRs are separated by 0x1000 */
+       u32 offset = offset0 + (0x1000 * ctxt);
+
+       write_csr(dd, offset, value);
+}
+
+/* build a PBC from the given flags, static rate, VL, and dword length */
+u64 create_pbc(struct hfi1_pportdata *ppd, u64 flags, int srate_mbs, u32 vl,
+              u32 dw_len);
+
+/* firmware.c */
+#define SBUS_MASTER_BROADCAST 0xfd
+#define NUM_PCIE_SERDES 16     /* number of PCIe serdes on the SBus */
+extern const u8 pcie_serdes_broadcast[];
+extern const u8 pcie_pcs_addrs[2][NUM_PCIE_SERDES];
+extern uint platform_config_load;
+
+/* SBus commands */
+#define RESET_SBUS_RECEIVER 0x20
+#define WRITE_SBUS_RECEIVER 0x21
+/*
+ * Both request routines take a receiver address, a data address, a command
+ * byte and 32 bits of data.  sbus_request() returns nothing, while
+ * sbus_request_slow() returns an int.
+ * NOTE(review): presumably @command is one of the values above and the
+ * int is 0 on success - confirm in firmware.c.
+ */
+void sbus_request(struct hfi1_devdata *dd,
+                 u8 receiver_addr, u8 data_addr, u8 command, u32 data_in);
+int sbus_request_slow(struct hfi1_devdata *dd,
+                     u8 receiver_addr, u8 data_addr, u8 command, u32 data_in);
+void set_sbus_fast_mode(struct hfi1_devdata *dd);
+void clear_sbus_fast_mode(struct hfi1_devdata *dd);
+/* firmware setup and load entry points; each int return is a status code */
+int hfi1_firmware_init(struct hfi1_devdata *dd);
+int load_pcie_firmware(struct hfi1_devdata *dd);
+int load_firmware(struct hfi1_devdata *dd);
+/* takes no device argument, unlike the other firmware routines here */
+void dispose_firmware(void);
+/* paired acquire/release; only the acquire side can report failure */
+int acquire_hw_mutex(struct hfi1_devdata *dd);
+void release_hw_mutex(struct hfi1_devdata *dd);
+
+/*
+ * Bitmask of dynamic access for ASIC block chip resources.  Each HFI has its
+ * own range of bits for the resource so it can clear its own bits on
+ * starting and exiting.  If either HFI has the resource bit set, the
+ * resource is in use.  The separate bit ranges are:
+ *     HFI0 bits  7:0
+ *     HFI1 bits 15:8
+ */
+#define CR_SBUS  0x01  /* SBUS, THERM, and PCIE registers */
+#define CR_EPROM 0x02  /* EEP, GPIO registers */
+#define CR_I2C1  0x04  /* QSFP1_OE register */
+#define CR_I2C2  0x08  /* QSFP2_OE register */
+#define CR_DYN_SHIFT 8 /* dynamic flag shift */
+/* the low CR_DYN_SHIFT bits, i.e. 0xff: one HFI's worth of dynamic flags */
+#define CR_DYN_MASK  ((1ull << CR_DYN_SHIFT) - 1)
+
+/*
+ * Bitmask of static ASIC states.  These lie outside the dynamic ASIC block
+ * chip resource bits above (0x010000 is above bits 15:0).  They are to be
+ * set once and never cleared.  The SBus dynamic flag must be held when
+ * setting them.
+ */
+#define CR_THERM_INIT  0x010000
+
+/*
+ * Chip resource API (defined outside this header).  @resource is a u32
+ * bitmask and @mswait a u32 wait time.
+ * NOTE(review): presumably @resource takes the CR_* values above and
+ * @mswait is in milliseconds - confirm against the definitions.
+ */
+int acquire_chip_resource(struct hfi1_devdata *dd, u32 resource, u32 mswait);
+void release_chip_resource(struct hfi1_devdata *dd, u32 resource);
+/* @func is a caller-supplied string; presumably used for reporting */
+bool check_chip_resource(struct hfi1_devdata *dd, u32 resource,
+                        const char *func);
+void init_chip_resources(struct hfi1_devdata *dd);
+void finish_chip_resources(struct hfi1_devdata *dd);
+
+/* ms wait time for access to an SBus resource */
+#define SBUS_TIMEOUT 4000 /* long enough for a FW download and SBR */
+
+/* ms wait time for a qsfp (i2c) chain to become available */
+#define QSFP_WAIT 20000 /* long enough for FW update to the F4 uc */
+
+void fabric_serdes_reset(struct hfi1_devdata *dd);
+/*
+ * Takes an address, a length and a u64 pointer, @result, and returns an
+ * int.  NOTE(review): units of @len and the layout written to @result are
+ * not visible here - check the definition before relying on them.
+ */
+int read_8051_data(struct hfi1_devdata *dd, u32 addr, u32 len, u64 *result);
+
+/* chip.c */
+/* fills in two u8 version values through @ver_a and @ver_b */
+void read_misc_status(struct hfi1_devdata *dd, u8 *ver_a, u8 *ver_b);
+void read_guid(struct hfi1_devdata *dd);
+int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout);
+void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason,
+                         u8 neigh_reason, u8 rem_reason);
+int set_link_state(struct hfi1_pportdata *, u32 state);
+int port_ltp_to_cap(int port_ltp);
+/*
+ * The handlers below all share the void (*)(struct work_struct *)
+ * signature.  NOTE(review): presumably they are queued as work items -
+ * confirm where they are initialized.
+ */
+void handle_verify_cap(struct work_struct *work);
+void handle_freeze(struct work_struct *work);
+void handle_link_up(struct work_struct *work);
+void handle_link_down(struct work_struct *work);
+void handle_link_downgrade(struct work_struct *work);
+void handle_link_bounce(struct work_struct *work);
+void handle_sma_message(struct work_struct *work);
+void reset_qsfp(struct hfi1_pportdata *ppd);
+void qsfp_event(struct work_struct *work);
+void start_freeze_handling(struct hfi1_pportdata *ppd, int flags);
+int send_idle_sma(struct hfi1_devdata *dd, u64 message);
+/*
+ * NOTE(review): the two prototypes below leave every parameter unnamed;
+ * see the definitions for the meaning of the u8/u8/u32 arguments.  The
+ * read variant takes a u32 pointer where the load variant takes a u32.
+ */
+int load_8051_config(struct hfi1_devdata *, u8, u8, u32);
+int read_8051_config(struct hfi1_devdata *, u8, u8, u32 *);
+int start_link(struct hfi1_pportdata *ppd);
+int bringup_serdes(struct hfi1_pportdata *ppd);
+void set_intr_state(struct hfi1_devdata *dd, u32 enable);
+void apply_link_downgrade_policy(struct hfi1_pportdata *ppd,
+                                int refresh_widths);
+/* NOTE(review): five unnamed u32 parameters - order matters, see definition */
+void update_usrhead(struct hfi1_ctxtdata *, u32, u32, u32, u32, u32);
+int stop_drain_data_vls(struct hfi1_devdata *dd);
+int open_fill_data_vls(struct hfi1_devdata *dd);
+/* paired conversions between a u32 @ns value and a u32 @cclock value */
+u32 ns_to_cclock(struct hfi1_devdata *dd, u32 ns);
+u32 cclock_to_ns(struct hfi1_devdata *dd, u32 cclock);
+void get_linkup_link_widths(struct hfi1_pportdata *ppd);
+void read_ltp_rtt(struct hfi1_devdata *dd);
+void clear_linkup_counters(struct hfi1_devdata *dd);
+u32 hdrqempty(struct hfi1_ctxtdata *rcd);
+/* int-returning predicates on the device */
+int is_ax(struct hfi1_devdata *dd);
+int is_bx(struct hfi1_devdata *dd);
+u32 read_physical_state(struct hfi1_devdata *dd);
+u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate);
+u32 get_logical_state(struct hfi1_pportdata *ppd);
+/* map a u32 logical/physical state value to a constant string */
+const char *opa_lstate_name(u32 lstate);
+const char *opa_pstate_name(u32 pstate);
+u32 driver_physical_state(struct hfi1_pportdata *ppd);
+u32 driver_logical_state(struct hfi1_pportdata *ppd);
+
+/*
+ * Paired LCB access calls; both return an int and take a @sleep_ok flag.
+ * NOTE(review): presumably @sleep_ok says whether the call may sleep -
+ * confirm against the definitions.
+ */
+int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok);
+int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok);
+/* bounds of the LCB CSR block: LCB_START inclusive, LCB_END exclusive */
+#define LCB_START DC_LCB_CSRS
+#define LCB_END   DC_8051_CSRS /* next block is 8051 */
+/*
+ * Return 1 when @offset lies in the LCB CSR block, i.e. in the half-open
+ * range [LCB_START, LCB_END), and 0 otherwise.
+ */
+static inline int is_lcb_offset(u32 offset)
+{
+       if (offset < LCB_START)
+               return 0;
+
+       return offset < LCB_END;
+}
+
+/* unsigned settings defined outside this header */
+extern uint num_vls;
+
+extern uint disable_integrity;
+/*
+ * Counter accessors.  The device flavor takes a struct hfi1_devdata, the
+ * port flavor a struct hfi1_pportdata; both take a counter @index and a
+ * @vl and return a u64.  Note that the write variants also return a u64.
+ * NOTE(review): presumably @index is one of the C_* enumerators declared
+ * in this header - confirm against the definitions.
+ */
+u64 read_dev_cntr(struct hfi1_devdata *dd, int index, int vl);
+u64 write_dev_cntr(struct hfi1_devdata *dd, int index, int vl, u64 data);
+u64 read_port_cntr(struct hfi1_pportdata *ppd, int index, int vl);
+u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data);
+u32 read_logical_state(struct hfi1_devdata *dd);
+void force_recv_intr(struct hfi1_ctxtdata *rcd);
+
+/*
+ * Per VL indexes.  VLs 0-7 use their own number as the index; VL 15 is
+ * packed in directly after them at index 8.
+ */
+enum {
+       C_VL_0  = 0,
+       C_VL_1  = 1,
+       C_VL_2  = 2,
+       C_VL_3  = 3,
+       C_VL_4  = 4,
+       C_VL_5  = 5,
+       C_VL_6  = 6,
+       C_VL_7  = 7,
+       C_VL_15 = 8,
+       C_VL_COUNT      /* number of per-VL indexes; must stay last */
+};
+
+/*
+ * Convert a per-VL index to a VL number: C_VL_15 becomes 15, every other
+ * index is returned unchanged.
+ */
+static inline int vl_from_idx(int idx)
+{
+       if (idx == C_VL_15)
+               return 15;
+
+       return idx;
+}
+
+/*
+ * Convert a VL number to a per-VL index: 15 becomes C_VL_15, every other
+ * value is returned unchanged.
+ */
+static inline int idx_from_vl(int vl)
+{
+       if (vl == 15)
+               return C_VL_15;
+
+       return vl;
+}
+
+/*
+ * Per device counter indexes.
+ *
+ * The enumerators take consecutive values starting at 0, so an entry's
+ * position in this list is its index and DEV_CNTR_LAST equals the number
+ * of entries.  Inserting or reordering entries renumbers everything after
+ * the change.
+ *
+ * The register-name comments inside the list (MISC_ERR_STATUS,
+ * CceErrStatus, ...) head groups of entries.
+ * NOTE(review): presumably each group mirrors the bits of that error
+ * status register, as stated for CceErrStatus - confirm for the others.
+ *
+ * NOTE(review): a few names look misspelled (..._PARTIY_ERR,
+ * C_TX_EGRESS_FIFI_UNC_ERR, C_SMDA_RESERVED_9,
+ * C_SEND_CSR_READ_BAD_ADD_ERR).  They are left as-is because users of
+ * these identifiers are outside this header.
+ */
+enum {
+       C_RCV_OVF = 0,
+       C_RX_TID_FULL,
+       C_RX_TID_INVALID,
+       C_RX_TID_FLGMS,
+       C_RX_CTX_EGRS,
+       C_RCV_TID_FLSMS,
+       C_CCE_PCI_CR_ST,
+       C_CCE_PCI_TR_ST,
+       C_CCE_PIO_WR_ST,
+       C_CCE_ERR_INT,
+       C_CCE_SDMA_INT,
+       C_CCE_MISC_INT,
+       C_CCE_RCV_AV_INT,
+       C_CCE_RCV_URG_INT,
+       C_CCE_SEND_CR_INT,
+       C_DC_UNC_ERR,
+       C_DC_RCV_ERR,
+       C_DC_FM_CFG_ERR,
+       C_DC_RMT_PHY_ERR,
+       C_DC_DROPPED_PKT,
+       C_DC_MC_XMIT_PKTS,
+       C_DC_MC_RCV_PKTS,
+       C_DC_XMIT_CERR,
+       C_DC_RCV_CERR,
+       C_DC_RCV_FCC,
+       C_DC_XMIT_FCC,
+       C_DC_XMIT_FLITS,
+       C_DC_RCV_FLITS,
+       C_DC_XMIT_PKTS,
+       C_DC_RCV_PKTS,
+       C_DC_RX_FLIT_VL,
+       C_DC_RX_PKT_VL,
+       C_DC_RCV_FCN,
+       C_DC_RCV_FCN_VL,
+       C_DC_RCV_BCN,
+       C_DC_RCV_BCN_VL,
+       C_DC_RCV_BBL,
+       C_DC_RCV_BBL_VL,
+       C_DC_MARK_FECN,
+       C_DC_MARK_FECN_VL,
+       C_DC_TOTAL_CRC,
+       C_DC_CRC_LN0,
+       C_DC_CRC_LN1,
+       C_DC_CRC_LN2,
+       C_DC_CRC_LN3,
+       C_DC_CRC_MULT_LN,
+       C_DC_TX_REPLAY,
+       C_DC_RX_REPLAY,
+       C_DC_SEQ_CRC_CNT,
+       C_DC_ESC0_ONLY_CNT,
+       C_DC_ESC0_PLUS1_CNT,
+       C_DC_ESC0_PLUS2_CNT,
+       C_DC_REINIT_FROM_PEER_CNT,
+       C_DC_SBE_CNT,
+       C_DC_MISC_FLG_CNT,
+       C_DC_PRF_GOOD_LTP_CNT,
+       C_DC_PRF_ACCEPTED_LTP_CNT,
+       C_DC_PRF_RX_FLIT_CNT,
+       C_DC_PRF_TX_FLIT_CNT,
+       C_DC_PRF_CLK_CNTR,
+       C_DC_PG_DBG_FLIT_CRDTS_CNT,
+       C_DC_PG_STS_PAUSE_COMPLETE_CNT,
+       C_DC_PG_STS_TX_SBE_CNT,
+       C_DC_PG_STS_TX_MBE_CNT,
+       C_SW_CPU_INTR,
+       C_SW_CPU_RCV_LIM,
+       C_SW_VTX_WAIT,
+       C_SW_PIO_WAIT,
+       C_SW_PIO_DRAIN,
+       C_SW_KMEM_WAIT,
+       C_SW_SEND_SCHED,
+       C_SDMA_DESC_FETCHED_CNT,
+       C_SDMA_INT_CNT,
+       C_SDMA_ERR_CNT,
+       C_SDMA_IDLE_INT_CNT,
+       C_SDMA_PROGRESS_INT_CNT,
+/* MISC_ERR_STATUS */
+       C_MISC_PLL_LOCK_FAIL_ERR,
+       C_MISC_MBIST_FAIL_ERR,
+       C_MISC_INVALID_EEP_CMD_ERR,
+       C_MISC_EFUSE_DONE_PARITY_ERR,
+       C_MISC_EFUSE_WRITE_ERR,
+       C_MISC_EFUSE_READ_BAD_ADDR_ERR,
+       C_MISC_EFUSE_CSR_PARITY_ERR,
+       C_MISC_FW_AUTH_FAILED_ERR,
+       C_MISC_KEY_MISMATCH_ERR,
+       C_MISC_SBUS_WRITE_FAILED_ERR,
+       C_MISC_CSR_WRITE_BAD_ADDR_ERR,
+       C_MISC_CSR_READ_BAD_ADDR_ERR,
+       C_MISC_CSR_PARITY_ERR,
+/* CceErrStatus */
+       /*
+        * A special counter that is the aggregate count
+        * of all the cce_err_status errors.  The remainder
+        * are actual bits in the CceErrStatus register.
+        */
+       C_CCE_ERR_STATUS_AGGREGATED_CNT,
+       C_CCE_MSIX_CSR_PARITY_ERR,
+       C_CCE_INT_MAP_UNC_ERR,
+       C_CCE_INT_MAP_COR_ERR,
+       C_CCE_MSIX_TABLE_UNC_ERR,
+       C_CCE_MSIX_TABLE_COR_ERR,
+       C_CCE_RXDMA_CONV_FIFO_PARITY_ERR,
+       C_CCE_RCPL_ASYNC_FIFO_PARITY_ERR,
+       C_CCE_SEG_WRITE_BAD_ADDR_ERR,
+       C_CCE_SEG_READ_BAD_ADDR_ERR,
+       C_LA_TRIGGERED,
+       C_CCE_TRGT_CPL_TIMEOUT_ERR,
+       C_PCIC_RECEIVE_PARITY_ERR,
+       C_PCIC_TRANSMIT_BACK_PARITY_ERR,
+       C_PCIC_TRANSMIT_FRONT_PARITY_ERR,
+       C_PCIC_CPL_DAT_Q_UNC_ERR,
+       C_PCIC_CPL_HD_Q_UNC_ERR,
+       C_PCIC_POST_DAT_Q_UNC_ERR,
+       C_PCIC_POST_HD_Q_UNC_ERR,
+       C_PCIC_RETRY_SOT_MEM_UNC_ERR,
+       C_PCIC_RETRY_MEM_UNC_ERR,
+       C_PCIC_N_POST_DAT_Q_PARITY_ERR,
+       C_PCIC_N_POST_H_Q_PARITY_ERR,
+       C_PCIC_CPL_DAT_Q_COR_ERR,
+       C_PCIC_CPL_HD_Q_COR_ERR,
+       C_PCIC_POST_DAT_Q_COR_ERR,
+       C_PCIC_POST_HD_Q_COR_ERR,
+       C_PCIC_RETRY_SOT_MEM_COR_ERR,
+       C_PCIC_RETRY_MEM_COR_ERR,
+       C_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERR,
+       C_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERR,
+       C_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR,
+       C_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR,
+       C_CCE_CLI2_ASYNC_FIFO_PARITY_ERR,
+       C_CCE_CSR_CFG_BUS_PARITY_ERR,
+       C_CCE_CLI0_ASYNC_FIFO_PARTIY_ERR,
+       C_CCE_RSPD_DATA_PARITY_ERR,
+       C_CCE_TRGT_ACCESS_ERR,
+       C_CCE_TRGT_ASYNC_FIFO_PARITY_ERR,
+       C_CCE_CSR_WRITE_BAD_ADDR_ERR,
+       C_CCE_CSR_READ_BAD_ADDR_ERR,
+       C_CCE_CSR_PARITY_ERR,
+/* RcvErrStatus */
+       C_RX_CSR_PARITY_ERR,
+       C_RX_CSR_WRITE_BAD_ADDR_ERR,
+       C_RX_CSR_READ_BAD_ADDR_ERR,
+       C_RX_DMA_CSR_UNC_ERR,
+       C_RX_DMA_DQ_FSM_ENCODING_ERR,
+       C_RX_DMA_EQ_FSM_ENCODING_ERR,
+       C_RX_DMA_CSR_PARITY_ERR,
+       C_RX_RBUF_DATA_COR_ERR,
+       C_RX_RBUF_DATA_UNC_ERR,
+       C_RX_DMA_DATA_FIFO_RD_COR_ERR,
+       C_RX_DMA_DATA_FIFO_RD_UNC_ERR,
+       C_RX_DMA_HDR_FIFO_RD_COR_ERR,
+       C_RX_DMA_HDR_FIFO_RD_UNC_ERR,
+       C_RX_RBUF_DESC_PART2_COR_ERR,
+       C_RX_RBUF_DESC_PART2_UNC_ERR,
+       C_RX_RBUF_DESC_PART1_COR_ERR,
+       C_RX_RBUF_DESC_PART1_UNC_ERR,
+       C_RX_HQ_INTR_FSM_ERR,
+       C_RX_HQ_INTR_CSR_PARITY_ERR,
+       C_RX_LOOKUP_CSR_PARITY_ERR,
+       C_RX_LOOKUP_RCV_ARRAY_COR_ERR,
+       C_RX_LOOKUP_RCV_ARRAY_UNC_ERR,
+       C_RX_LOOKUP_DES_PART2_PARITY_ERR,
+       C_RX_LOOKUP_DES_PART1_UNC_COR_ERR,
+       C_RX_LOOKUP_DES_PART1_UNC_ERR,
+       C_RX_RBUF_NEXT_FREE_BUF_COR_ERR,
+       C_RX_RBUF_NEXT_FREE_BUF_UNC_ERR,
+       C_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR,
+       C_RX_RBUF_FL_INITDONE_PARITY_ERR,
+       C_RX_RBUF_FL_WRITE_ADDR_PARITY_ERR,
+       C_RX_RBUF_FL_RD_ADDR_PARITY_ERR,
+       C_RX_RBUF_EMPTY_ERR,
+       C_RX_RBUF_FULL_ERR,
+       C_RX_RBUF_BAD_LOOKUP_ERR,
+       C_RX_RBUF_CTX_ID_PARITY_ERR,
+       C_RX_RBUF_CSR_QEOPDW_PARITY_ERR,
+       C_RX_RBUF_CSR_Q_NUM_OF_PKT_PARITY_ERR,
+       C_RX_RBUF_CSR_Q_T1_PTR_PARITY_ERR,
+       C_RX_RBUF_CSR_Q_HD_PTR_PARITY_ERR,
+       C_RX_RBUF_CSR_Q_VLD_BIT_PARITY_ERR,
+       C_RX_RBUF_CSR_Q_NEXT_BUF_PARITY_ERR,
+       C_RX_RBUF_CSR_Q_ENT_CNT_PARITY_ERR,
+       C_RX_RBUF_CSR_Q_HEAD_BUF_NUM_PARITY_ERR,
+       C_RX_RBUF_BLOCK_LIST_READ_COR_ERR,
+       C_RX_RBUF_BLOCK_LIST_READ_UNC_ERR,
+       C_RX_RBUF_LOOKUP_DES_COR_ERR,
+       C_RX_RBUF_LOOKUP_DES_UNC_ERR,
+       C_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR,
+       C_RX_RBUF_LOOKUP_DES_REG_UNC_ERR,
+       C_RX_RBUF_FREE_LIST_COR_ERR,
+       C_RX_RBUF_FREE_LIST_UNC_ERR,
+       C_RX_RCV_FSM_ENCODING_ERR,
+       C_RX_DMA_FLAG_COR_ERR,
+       C_RX_DMA_FLAG_UNC_ERR,
+       C_RX_DC_SOP_EOP_PARITY_ERR,
+       C_RX_RCV_CSR_PARITY_ERR,
+       C_RX_RCV_QP_MAP_TABLE_COR_ERR,
+       C_RX_RCV_QP_MAP_TABLE_UNC_ERR,
+       C_RX_RCV_DATA_COR_ERR,
+       C_RX_RCV_DATA_UNC_ERR,
+       C_RX_RCV_HDR_COR_ERR,
+       C_RX_RCV_HDR_UNC_ERR,
+       C_RX_DC_INTF_PARITY_ERR,
+       C_RX_DMA_CSR_COR_ERR,
+/* SendPioErrStatus */
+       C_PIO_PEC_SOP_HEAD_PARITY_ERR,
+       C_PIO_PCC_SOP_HEAD_PARITY_ERR,
+       C_PIO_LAST_RETURNED_CNT_PARITY_ERR,
+       C_PIO_CURRENT_FREE_CNT_PARITY_ERR,
+       C_PIO_RSVD_31_ERR,
+       C_PIO_RSVD_30_ERR,
+       C_PIO_PPMC_SOP_LEN_ERR,
+       C_PIO_PPMC_BQC_MEM_PARITY_ERR,
+       C_PIO_VL_FIFO_PARITY_ERR,
+       C_PIO_VLF_SOP_PARITY_ERR,
+       C_PIO_VLF_V1_LEN_PARITY_ERR,
+       C_PIO_BLOCK_QW_COUNT_PARITY_ERR,
+       C_PIO_WRITE_QW_VALID_PARITY_ERR,
+       C_PIO_STATE_MACHINE_ERR,
+       C_PIO_WRITE_DATA_PARITY_ERR,
+       C_PIO_HOST_ADDR_MEM_COR_ERR,
+       C_PIO_HOST_ADDR_MEM_UNC_ERR,
+       C_PIO_PKT_EVICT_SM_OR_ARM_SM_ERR,
+       C_PIO_INIT_SM_IN_ERR,
+       C_PIO_PPMC_PBL_FIFO_ERR,
+       C_PIO_CREDIT_RET_FIFO_PARITY_ERR,
+       C_PIO_V1_LEN_MEM_BANK1_COR_ERR,
+       C_PIO_V1_LEN_MEM_BANK0_COR_ERR,
+       C_PIO_V1_LEN_MEM_BANK1_UNC_ERR,
+       C_PIO_V1_LEN_MEM_BANK0_UNC_ERR,
+       C_PIO_SM_PKT_RESET_PARITY_ERR,
+       C_PIO_PKT_EVICT_FIFO_PARITY_ERR,
+       C_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR,
+       C_PIO_SBRDCTL_CRREL_PARITY_ERR,
+       C_PIO_PEC_FIFO_PARITY_ERR,
+       C_PIO_PCC_FIFO_PARITY_ERR,
+       C_PIO_SB_MEM_FIFO1_ERR,
+       C_PIO_SB_MEM_FIFO0_ERR,
+       C_PIO_CSR_PARITY_ERR,
+       C_PIO_WRITE_ADDR_PARITY_ERR,
+       C_PIO_WRITE_BAD_CTXT_ERR,
+/* SendDmaErrStatus */
+       C_SDMA_PCIE_REQ_TRACKING_COR_ERR,
+       C_SDMA_PCIE_REQ_TRACKING_UNC_ERR,
+       C_SDMA_CSR_PARITY_ERR,
+       C_SDMA_RPY_TAG_ERR,
+/* SendEgressErrStatus */
+       C_TX_READ_PIO_MEMORY_CSR_UNC_ERR,
+       C_TX_READ_SDMA_MEMORY_CSR_UNC_ERR,
+       C_TX_EGRESS_FIFO_COR_ERR,
+       C_TX_READ_PIO_MEMORY_COR_ERR,
+       C_TX_READ_SDMA_MEMORY_COR_ERR,
+       C_TX_SB_HDR_COR_ERR,
+       C_TX_CREDIT_OVERRUN_ERR,
+       C_TX_LAUNCH_FIFO8_COR_ERR,
+       C_TX_LAUNCH_FIFO7_COR_ERR,
+       C_TX_LAUNCH_FIFO6_COR_ERR,
+       C_TX_LAUNCH_FIFO5_COR_ERR,
+       C_TX_LAUNCH_FIFO4_COR_ERR,
+       C_TX_LAUNCH_FIFO3_COR_ERR,
+       C_TX_LAUNCH_FIFO2_COR_ERR,
+       C_TX_LAUNCH_FIFO1_COR_ERR,
+       C_TX_LAUNCH_FIFO0_COR_ERR,
+       C_TX_CREDIT_RETURN_VL_ERR,
+       C_TX_HCRC_INSERTION_ERR,
+       C_TX_EGRESS_FIFI_UNC_ERR,
+       C_TX_READ_PIO_MEMORY_UNC_ERR,
+       C_TX_READ_SDMA_MEMORY_UNC_ERR,
+       C_TX_SB_HDR_UNC_ERR,
+       C_TX_CREDIT_RETURN_PARITY_ERR,
+       C_TX_LAUNCH_FIFO8_UNC_OR_PARITY_ERR,
+       C_TX_LAUNCH_FIFO7_UNC_OR_PARITY_ERR,
+       C_TX_LAUNCH_FIFO6_UNC_OR_PARITY_ERR,
+       C_TX_LAUNCH_FIFO5_UNC_OR_PARITY_ERR,
+       C_TX_LAUNCH_FIFO4_UNC_OR_PARITY_ERR,
+       C_TX_LAUNCH_FIFO3_UNC_OR_PARITY_ERR,
+       C_TX_LAUNCH_FIFO2_UNC_OR_PARITY_ERR,
+       C_TX_LAUNCH_FIFO1_UNC_OR_PARITY_ERR,
+       C_TX_LAUNCH_FIFO0_UNC_OR_PARITY_ERR,
+       C_TX_SDMA15_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA14_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA13_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA12_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA11_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA10_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA9_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA8_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA7_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA6_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA5_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA4_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA3_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA2_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA1_DISALLOWED_PACKET_ERR,
+       C_TX_SDMA0_DISALLOWED_PACKET_ERR,
+       C_TX_CONFIG_PARITY_ERR,
+       C_TX_SBRD_CTL_CSR_PARITY_ERR,
+       C_TX_LAUNCH_CSR_PARITY_ERR,
+       C_TX_ILLEGAL_CL_ERR,
+       C_TX_SBRD_CTL_STATE_MACHINE_PARITY_ERR,
+       C_TX_RESERVED_10,
+       C_TX_RESERVED_9,
+       C_TX_SDMA_LAUNCH_INTF_PARITY_ERR,
+       C_TX_PIO_LAUNCH_INTF_PARITY_ERR,
+       C_TX_RESERVED_6,
+       C_TX_INCORRECT_LINK_STATE_ERR,
+       C_TX_LINK_DOWN_ERR,
+       C_TX_EGRESS_FIFO_UNDERRUN_OR_PARITY_ERR,
+       C_TX_RESERVED_2,
+       C_TX_PKT_INTEGRITY_MEM_UNC_ERR,
+       C_TX_PKT_INTEGRITY_MEM_COR_ERR,
+/* SendErrStatus */
+       C_SEND_CSR_WRITE_BAD_ADDR_ERR,
+       C_SEND_CSR_READ_BAD_ADD_ERR,
+       C_SEND_CSR_PARITY_ERR,
+/* SendCtxtErrStatus */
+       C_PIO_WRITE_OUT_OF_BOUNDS_ERR,
+       C_PIO_WRITE_OVERFLOW_ERR,
+       C_PIO_WRITE_CROSSES_BOUNDARY_ERR,
+       C_PIO_DISALLOWED_PACKET_ERR,
+       C_PIO_INCONSISTENT_SOP_ERR,
+/* SendDmaEngErrStatus */
+       C_SDMA_HEADER_REQUEST_FIFO_COR_ERR,
+       C_SDMA_HEADER_STORAGE_COR_ERR,
+       C_SDMA_PACKET_TRACKING_COR_ERR,
+       C_SDMA_ASSEMBLY_COR_ERR,
+       C_SDMA_DESC_TABLE_COR_ERR,
+       C_SDMA_HEADER_REQUEST_FIFO_UNC_ERR,
+       C_SDMA_HEADER_STORAGE_UNC_ERR,
+       C_SDMA_PACKET_TRACKING_UNC_ERR,
+       C_SDMA_ASSEMBLY_UNC_ERR,
+       C_SDMA_DESC_TABLE_UNC_ERR,
+       C_SDMA_TIMEOUT_ERR,
+       C_SDMA_HEADER_LENGTH_ERR,
+       C_SDMA_HEADER_ADDRESS_ERR,
+       C_SDMA_HEADER_SELECT_ERR,
+       C_SMDA_RESERVED_9,
+       C_SDMA_PACKET_DESC_OVERFLOW_ERR,
+       C_SDMA_LENGTH_MISMATCH_ERR,
+       C_SDMA_HALT_ERR,
+       C_SDMA_MEM_READ_ERR,
+       C_SDMA_FIRST_DESC_ERR,
+       C_SDMA_TAIL_OUT_OF_BOUNDS_ERR,
+       C_SDMA_TOO_LONG_ERR,
+       C_SDMA_GEN_MISMATCH_ERR,
+       C_SDMA_WRONG_DW_ERR,
+       DEV_CNTR_LAST  /* Must be kept last */
+};
+
+/*
+ * Per port counter indexes.
+ *
+ * The enumerators take consecutive values starting at 0, so an entry's
+ * position in this list is its index and PORT_CNTR_LAST equals the number
+ * of entries.  Inserting or reordering entries renumbers everything after
+ * the change.
+ *
+ * C_RCV_HDR_OVF_0 .. C_RCV_HDR_OVF_159 are 160 consecutive indexes, so
+ * entry N can be reached as C_RCV_HDR_OVF_0 + N.
+ * NOTE(review): presumably one per receive context - confirm against the
+ * code that fills in the counter table.
+ */
+enum {
+       C_TX_UNSUP_VL = 0,
+       C_TX_INVAL_LEN,
+       C_TX_MM_LEN_ERR,
+       C_TX_UNDERRUN,
+       C_TX_FLOW_STALL,
+       C_TX_DROPPED,
+       C_TX_HDR_ERR,
+       C_TX_PKT,
+       C_TX_WORDS,
+       C_TX_WAIT,
+       C_TX_FLIT_VL,
+       C_TX_PKT_VL,
+       C_TX_WAIT_VL,
+       C_RX_PKT,
+       C_RX_WORDS,
+       C_SW_LINK_DOWN,
+       C_SW_LINK_UP,
+       C_SW_UNKNOWN_FRAME,
+       C_SW_XMIT_DSCD,
+       C_SW_XMIT_DSCD_VL,
+       C_SW_XMIT_CSTR_ERR,
+       C_SW_RCV_CSTR_ERR,
+       C_SW_IBP_LOOP_PKTS,
+       C_SW_IBP_RC_RESENDS,
+       C_SW_IBP_RNR_NAKS,
+       C_SW_IBP_OTHER_NAKS,
+       C_SW_IBP_RC_TIMEOUTS,
+       C_SW_IBP_PKT_DROPS,
+       C_SW_IBP_DMA_WAIT,
+       C_SW_IBP_RC_SEQNAK,
+       C_SW_IBP_RC_DUPREQ,
+       C_SW_IBP_RDMA_SEQ,
+       C_SW_IBP_UNALIGNED,
+       C_SW_IBP_SEQ_NAK,
+       C_SW_CPU_RC_ACKS,
+       C_SW_CPU_RC_QACKS,
+       C_SW_CPU_RC_DELAYED_COMP,
+       C_RCV_HDR_OVF_0,
+       C_RCV_HDR_OVF_1,
+       C_RCV_HDR_OVF_2,
+       C_RCV_HDR_OVF_3,
+       C_RCV_HDR_OVF_4,
+       C_RCV_HDR_OVF_5,
+       C_RCV_HDR_OVF_6,
+       C_RCV_HDR_OVF_7,
+       C_RCV_HDR_OVF_8,
+       C_RCV_HDR_OVF_9,
+       C_RCV_HDR_OVF_10,
+       C_RCV_HDR_OVF_11,
+       C_RCV_HDR_OVF_12,
+       C_RCV_HDR_OVF_13,
+       C_RCV_HDR_OVF_14,
+       C_RCV_HDR_OVF_15,
+       C_RCV_HDR_OVF_16,
+       C_RCV_HDR_OVF_17,
+       C_RCV_HDR_OVF_18,
+       C_RCV_HDR_OVF_19,
+       C_RCV_HDR_OVF_20,
+       C_RCV_HDR_OVF_21,
+       C_RCV_HDR_OVF_22,
+       C_RCV_HDR_OVF_23,
+       C_RCV_HDR_OVF_24,
+       C_RCV_HDR_OVF_25,
+       C_RCV_HDR_OVF_26,
+       C_RCV_HDR_OVF_27,
+       C_RCV_HDR_OVF_28,
+       C_RCV_HDR_OVF_29,
+       C_RCV_HDR_OVF_30,
+       C_RCV_HDR_OVF_31,
+       C_RCV_HDR_OVF_32,
+       C_RCV_HDR_OVF_33,
+       C_RCV_HDR_OVF_34,
+       C_RCV_HDR_OVF_35,
+       C_RCV_HDR_OVF_36,
+       C_RCV_HDR_OVF_37,
+       C_RCV_HDR_OVF_38,
+       C_RCV_HDR_OVF_39,
+       C_RCV_HDR_OVF_40,
+       C_RCV_HDR_OVF_41,
+       C_RCV_HDR_OVF_42,
+       C_RCV_HDR_OVF_43,
+       C_RCV_HDR_OVF_44,
+       C_RCV_HDR_OVF_45,
+       C_RCV_HDR_OVF_46,
+       C_RCV_HDR_OVF_47,
+       C_RCV_HDR_OVF_48,
+       C_RCV_HDR_OVF_49,
+       C_RCV_HDR_OVF_50,
+       C_RCV_HDR_OVF_51,
+       C_RCV_HDR_OVF_52,
+       C_RCV_HDR_OVF_53,
+       C_RCV_HDR_OVF_54,
+       C_RCV_HDR_OVF_55,
+       C_RCV_HDR_OVF_56,
+       C_RCV_HDR_OVF_57,
+       C_RCV_HDR_OVF_58,
+       C_RCV_HDR_OVF_59,
+       C_RCV_HDR_OVF_60,
+       C_RCV_HDR_OVF_61,
+       C_RCV_HDR_OVF_62,
+       C_RCV_HDR_OVF_63,
+       C_RCV_HDR_OVF_64,
+       C_RCV_HDR_OVF_65,
+       C_RCV_HDR_OVF_66,
+       C_RCV_HDR_OVF_67,
+       C_RCV_HDR_OVF_68,
+       C_RCV_HDR_OVF_69,
+       C_RCV_HDR_OVF_70,
+       C_RCV_HDR_OVF_71,
+       C_RCV_HDR_OVF_72,
+       C_RCV_HDR_OVF_73,
+       C_RCV_HDR_OVF_74,
+       C_RCV_HDR_OVF_75,
+       C_RCV_HDR_OVF_76,
+       C_RCV_HDR_OVF_77,
+       C_RCV_HDR_OVF_78,
+       C_RCV_HDR_OVF_79,
+       C_RCV_HDR_OVF_80,
+       C_RCV_HDR_OVF_81,
+       C_RCV_HDR_OVF_82,
+       C_RCV_HDR_OVF_83,
+       C_RCV_HDR_OVF_84,
+       C_RCV_HDR_OVF_85,
+       C_RCV_HDR_OVF_86,
+       C_RCV_HDR_OVF_87,
+       C_RCV_HDR_OVF_88,
+       C_RCV_HDR_OVF_89,
+       C_RCV_HDR_OVF_90,
+       C_RCV_HDR_OVF_91,
+       C_RCV_HDR_OVF_92,
+       C_RCV_HDR_OVF_93,
+       C_RCV_HDR_OVF_94,
+       C_RCV_HDR_OVF_95,
+       C_RCV_HDR_OVF_96,
+       C_RCV_HDR_OVF_97,
+       C_RCV_HDR_OVF_98,
+       C_RCV_HDR_OVF_99,
+       C_RCV_HDR_OVF_100,
+       C_RCV_HDR_OVF_101,
+       C_RCV_HDR_OVF_102,
+       C_RCV_HDR_OVF_103,
+       C_RCV_HDR_OVF_104,
+       C_RCV_HDR_OVF_105,
+       C_RCV_HDR_OVF_106,
+       C_RCV_HDR_OVF_107,
+       C_RCV_HDR_OVF_108,
+       C_RCV_HDR_OVF_109,
+       C_RCV_HDR_OVF_110,
+       C_RCV_HDR_OVF_111,
+       C_RCV_HDR_OVF_112,
+       C_RCV_HDR_OVF_113,
+       C_RCV_HDR_OVF_114,
+       C_RCV_HDR_OVF_115,
+       C_RCV_HDR_OVF_116,
+       C_RCV_HDR_OVF_117,
+       C_RCV_HDR_OVF_118,
+       C_RCV_HDR_OVF_119,
+       C_RCV_HDR_OVF_120,
+       C_RCV_HDR_OVF_121,
+       C_RCV_HDR_OVF_122,
+       C_RCV_HDR_OVF_123,
+       C_RCV_HDR_OVF_124,
+       C_RCV_HDR_OVF_125,
+       C_RCV_HDR_OVF_126,
+       C_RCV_HDR_OVF_127,
+       C_RCV_HDR_OVF_128,
+       C_RCV_HDR_OVF_129,
+       C_RCV_HDR_OVF_130,
+       C_RCV_HDR_OVF_131,
+       C_RCV_HDR_OVF_132,
+       C_RCV_HDR_OVF_133,
+       C_RCV_HDR_OVF_134,
+       C_RCV_HDR_OVF_135,
+       C_RCV_HDR_OVF_136,
+       C_RCV_HDR_OVF_137,
+       C_RCV_HDR_OVF_138,
+       C_RCV_HDR_OVF_139,
+       C_RCV_HDR_OVF_140,
+       C_RCV_HDR_OVF_141,
+       C_RCV_HDR_OVF_142,
+       C_RCV_HDR_OVF_143,
+       C_RCV_HDR_OVF_144,
+       C_RCV_HDR_OVF_145,
+       C_RCV_HDR_OVF_146,
+       C_RCV_HDR_OVF_147,
+       C_RCV_HDR_OVF_148,
+       C_RCV_HDR_OVF_149,
+       C_RCV_HDR_OVF_150,
+       C_RCV_HDR_OVF_151,
+       C_RCV_HDR_OVF_152,
+       C_RCV_HDR_OVF_153,
+       C_RCV_HDR_OVF_154,
+       C_RCV_HDR_OVF_155,
+       C_RCV_HDR_OVF_156,
+       C_RCV_HDR_OVF_157,
+       C_RCV_HDR_OVF_158,
+       C_RCV_HDR_OVF_159,
+       PORT_CNTR_LAST /* Must be kept last */
+};
+
+/* takes a per-CPU u64 counter and returns a single u64 */
+u64 get_all_cpu_total(u64 __percpu *cntr);
+void hfi1_start_cleanup(struct hfi1_devdata *dd);
+void hfi1_clear_tids(struct hfi1_ctxtdata *rcd);
+struct hfi1_message_header *hfi1_get_msgheader(
+                               struct hfi1_devdata *dd, __le32 *rhf_addr);
+int hfi1_get_base_kinfo(struct hfi1_ctxtdata *rcd,
+                       struct hfi1_ctxt_info *kinfo);
+u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir,
+                 u32 mask);
+int hfi1_init_ctxt(struct send_context *sc);
+void hfi1_put_tid(struct hfi1_devdata *dd, u32 index,
+                 u32 type, unsigned long pa, u16 order);
+void hfi1_quiet_serdes(struct hfi1_pportdata *ppd);
+void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt);
+/*
+ * Both counter readers take char ** and u64 ** arguments and return a u32.
+ * NOTE(review): presumably @namep/@cntrp are pointed at the name and
+ * value buffers and the return is a size - confirm against the callers.
+ */
+u32 hfi1_read_cntrs(struct hfi1_devdata *dd, char **namep, u64 **cntrp);
+u32 hfi1_read_portcntrs(struct hfi1_pportdata *ppd, char **namep, u64 **cntrp);
+u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd);
+/* @which selects the setting to get or set; its values are declared elsewhere */
+int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which);
+int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val);
+/* per-context jkey/pkey set and clear pairs; keys are 16 bits wide */
+int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt, u16 jkey);
+int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt);
+int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey);
+int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt);
+/* takes a u8 pointer, @link_quality, and returns nothing directly */
+void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality);
+
+/*
+ * Interrupt source table.
+ *
+ * Each entry is an interrupt source "type".  It is ordered by increasing
+ * number.
+ *
+ * NOTE(review): whether @end is inclusive or exclusive is not visible from
+ * this header - check the table definition and its lookup code.
+ */
+struct is_table {
+       int start;       /* interrupt source type start */
+       int end;         /* interrupt source type end */
+       /*
+        * routine that returns the name of the interrupt source; it is
+        * handed a caller buffer (@name, @size) and the source number
+        */
+       char *(*is_name)(char *name, size_t size, unsigned int source);
+       /*
+        * routine to call when receiving an interrupt; it is handed the
+        * device and the source number
+        */
+       void (*is_int)(struct hfi1_devdata *dd, unsigned int source);
+};
+
+#endif /* _CHIP_H */
diff --git a/drivers/infiniband/hw/hfi1/chip_registers.h b/drivers/infiniband/hw/hfi1/chip_registers.h
new file mode 100644 (file)
index 0000000..8744de6
--- /dev/null
@@ -0,0 +1,1307 @@
+#ifndef DEF_CHIP_REG
+#define DEF_CHIP_REG
+
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define CORE           0x000000000000 /* origin of the offsets below */
+#define CCE                    (CORE + 0x000000000000)
+#define ASIC           (CORE + 0x000000400000)
+#define MISC           (CORE + 0x000000500000)
+#define DC_TOP_CSRS            (CORE + 0x000000600000)
+#define CHIP_DEBUG             (CORE + 0x000000700000)
+#define RXE                    (CORE + 0x000001000000)
+#define TXE                    (CORE + 0x000001800000)
+#define DCC_CSRS               (DC_TOP_CSRS + 0x000000000000)
+#define DC_LCB_CSRS            (DC_TOP_CSRS + 0x000000001000)
+#define DC_8051_CSRS           (DC_TOP_CSRS + 0x000000002000)
+#define PCIE           0 /* origin of PCI_CFG_x / PCIE_CFG_x offsets */
+
+#define ASIC_NUM_SCRATCH 4
+#define CCE_ERR_INT_CNT 0
+#define CCE_MISC_INT_CNT 2
+#define CCE_NUM_32_BIT_COUNTERS 3
+#define CCE_NUM_32_BIT_INT_COUNTERS 6
+#define CCE_NUM_INT_CSRS 12
+#define CCE_NUM_INT_MAP_CSRS 96
+#define CCE_NUM_MSIX_PBAS 4
+#define CCE_NUM_MSIX_VECTORS 256
+#define CCE_NUM_SCRATCH 4
+#define CCE_PCIE_POSTED_CRDT_STALL_CNT 2
+#define CCE_PCIE_TRGT_STALL_CNT 0
+#define CCE_PIO_WR_STALL_CNT 1
+#define CCE_RCV_AVAIL_INT_CNT 3
+#define CCE_RCV_URGENT_INT_CNT 4
+#define CCE_SDMA_INT_CNT 1
+#define CCE_SEND_CREDIT_INT_CNT 5
+#define DCC_CFG_LED_CNTRL (DCC_CSRS + 0x000000000040)
+#define DCC_CFG_LED_CNTRL_LED_CNTRL_SMASK 0x10ull
+#define DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SHIFT 0
+#define DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SMASK 0xFull
+#define DCC_CFG_PORT_CONFIG (DCC_CSRS + 0x000000000008)
+#define DCC_CFG_PORT_CONFIG1 (DCC_CSRS + 0x000000000010)
+#define DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK 0xFFFFull
+#define DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT 16
+#define DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK 0xFFFF0000ull
+#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK 0xFFFFull
+#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT 0
+#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK 0xFFFFull
+#define DCC_CFG_PORT_CONFIG_LINK_STATE_MASK 0x7ull
+#define DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT 48
+#define DCC_CFG_PORT_CONFIG_LINK_STATE_SMASK 0x7000000000000ull
+#define DCC_CFG_PORT_CONFIG_MTU_CAP_MASK 0x7ull
+#define DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT 32
+#define DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK 0x700000000ull
+#define DCC_CFG_RESET (DCC_CSRS + 0x000000000000)
+#define DCC_CFG_RESET_RESET_LCB_SHIFT 0
+#define DCC_CFG_RESET_RESET_RX_FPE_SHIFT 2
+#define DCC_CFG_SC_VL_TABLE_15_0 (DCC_CSRS + 0x000000000028)
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY0_SHIFT 0
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY10_SHIFT 40
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY11_SHIFT 44
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY12_SHIFT 48
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY13_SHIFT 52
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY14_SHIFT 56
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY15_SHIFT 60
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY1_SHIFT 4
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY2_SHIFT 8
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY3_SHIFT 12
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY4_SHIFT 16
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY5_SHIFT 20
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY6_SHIFT 24
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY7_SHIFT 28
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY8_SHIFT 32
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY9_SHIFT 36
+#define DCC_CFG_SC_VL_TABLE_31_16 (DCC_CSRS + 0x000000000030)
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY16_SHIFT 0
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY17_SHIFT 4
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY18_SHIFT 8
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY19_SHIFT 12
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY20_SHIFT 16
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY21_SHIFT 20
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY22_SHIFT 24
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY23_SHIFT 28
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY24_SHIFT 32
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY25_SHIFT 36
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY26_SHIFT 40
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY27_SHIFT 44
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY28_SHIFT 48
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY29_SHIFT 52
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY30_SHIFT 56
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY31_SHIFT 60
+#define DCC_ERR_DROPPED_PKT_CNT (DCC_CSRS + 0x000000000120)
+#define DCC_ERR_FLG (DCC_CSRS + 0x000000000050)
+#define DCC_ERR_FLG_BAD_CRDT_ACK_ERR_SMASK 0x4000ull
+#define DCC_ERR_FLG_BAD_CTRL_DIST_ERR_SMASK 0x200000ull
+#define DCC_ERR_FLG_BAD_CTRL_FLIT_ERR_SMASK 0x10000ull
+#define DCC_ERR_FLG_BAD_DLID_TARGET_ERR_SMASK 0x200ull
+#define DCC_ERR_FLG_BAD_HEAD_DIST_ERR_SMASK 0x800000ull
+#define DCC_ERR_FLG_BAD_L2_ERR_SMASK 0x2ull
+#define DCC_ERR_FLG_BAD_LVER_ERR_SMASK 0x400ull
+#define DCC_ERR_FLG_BAD_MID_TAIL_ERR_SMASK 0x8ull
+#define DCC_ERR_FLG_BAD_PKT_LENGTH_ERR_SMASK 0x4000000ull
+#define DCC_ERR_FLG_BAD_PREEMPTION_ERR_SMASK 0x10ull
+#define DCC_ERR_FLG_BAD_SC_ERR_SMASK 0x4ull
+#define DCC_ERR_FLG_BAD_TAIL_DIST_ERR_SMASK 0x400000ull
+#define DCC_ERR_FLG_BAD_VL_MARKER_ERR_SMASK 0x80ull
+#define DCC_ERR_FLG_CLR (DCC_CSRS + 0x000000000060)
+#define DCC_ERR_FLG_CSR_ACCESS_BLOCKED_HOST_SMASK 0x8000000000ull
+#define DCC_ERR_FLG_CSR_ACCESS_BLOCKED_UC_SMASK 0x10000000000ull
+#define DCC_ERR_FLG_CSR_INVAL_ADDR_SMASK 0x400000000000ull
+#define DCC_ERR_FLG_CSR_PARITY_ERR_SMASK 0x200000000000ull
+#define DCC_ERR_FLG_DLID_ZERO_ERR_SMASK 0x40000000ull
+#define DCC_ERR_FLG_EN (DCC_CSRS + 0x000000000058)
+#define DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK 0x8000000000ull
+#define DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK 0x10000000000ull
+#define DCC_ERR_FLG_EVENT_CNTR_PARITY_ERR_SMASK 0x20000ull
+#define DCC_ERR_FLG_EVENT_CNTR_ROLLOVER_ERR_SMASK 0x40000ull
+#define DCC_ERR_FLG_FMCONFIG_ERR_SMASK 0x40000000000000ull
+#define DCC_ERR_FLG_FPE_TX_FIFO_OVFLW_ERR_SMASK 0x2000000000ull
+#define DCC_ERR_FLG_FPE_TX_FIFO_UNFLW_ERR_SMASK 0x4000000000ull
+#define DCC_ERR_FLG_LATE_EBP_ERR_SMASK 0x1000000000ull
+#define DCC_ERR_FLG_LATE_LONG_ERR_SMASK 0x800000000ull
+#define DCC_ERR_FLG_LATE_SHORT_ERR_SMASK 0x400000000ull
+#define DCC_ERR_FLG_LENGTH_MTU_ERR_SMASK 0x80000000ull
+#define DCC_ERR_FLG_LINK_ERR_SMASK 0x80000ull
+#define DCC_ERR_FLG_MISC_CNTR_ROLLOVER_ERR_SMASK 0x100000ull
+#define DCC_ERR_FLG_NONVL15_STATE_ERR_SMASK 0x1000000ull
+#define DCC_ERR_FLG_PERM_NVL15_ERR_SMASK 0x10000000ull
+#define DCC_ERR_FLG_PREEMPTION_ERR_SMASK 0x20ull
+#define DCC_ERR_FLG_PREEMPTIONVL15_ERR_SMASK 0x40ull
+#define DCC_ERR_FLG_RCVPORT_ERR_SMASK 0x80000000000000ull
+#define DCC_ERR_FLG_RX_BYTE_SHFT_PARITY_ERR_SMASK 0x1000000000000ull
+#define DCC_ERR_FLG_RX_CTRL_PARITY_MBE_ERR_SMASK 0x100000000000ull
+#define DCC_ERR_FLG_RX_EARLY_DROP_ERR_SMASK 0x200000000ull
+#define DCC_ERR_FLG_SLID_ZERO_ERR_SMASK 0x20000000ull
+#define DCC_ERR_FLG_TX_BYTE_SHFT_PARITY_ERR_SMASK 0x800000000000ull
+#define DCC_ERR_FLG_TX_CTRL_PARITY_ERR_SMASK 0x20000000000ull
+#define DCC_ERR_FLG_TX_CTRL_PARITY_MBE_ERR_SMASK 0x40000000000ull
+#define DCC_ERR_FLG_TX_SC_PARITY_ERR_SMASK 0x80000000000ull
+#define DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK 0x2000ull
+#define DCC_ERR_FLG_UNSUP_PKT_TYPE_SMASK 0x8000ull
+#define DCC_ERR_FLG_UNSUP_VL_ERR_SMASK 0x8000000ull
+#define DCC_ERR_FLG_VL15_MULTI_ERR_SMASK 0x2000000ull
+#define DCC_ERR_FMCONFIG_ERR_CNT (DCC_CSRS + 0x000000000110)
+#define DCC_ERR_INFO_FMCONFIG (DCC_CSRS + 0x000000000090)
+#define DCC_ERR_INFO_PORTRCV (DCC_CSRS + 0x000000000078)
+#define DCC_ERR_INFO_PORTRCV_HDR0 (DCC_CSRS + 0x000000000080)
+#define DCC_ERR_INFO_PORTRCV_HDR1 (DCC_CSRS + 0x000000000088)
+#define DCC_ERR_INFO_UNCORRECTABLE (DCC_CSRS + 0x000000000098)
+#define DCC_ERR_PORTRCV_ERR_CNT (DCC_CSRS + 0x000000000108)
+#define DCC_ERR_RCVREMOTE_PHY_ERR_CNT (DCC_CSRS + 0x000000000118)
+#define DCC_ERR_UNCORRECTABLE_CNT (DCC_CSRS + 0x000000000100)
+#define DCC_PRF_PORT_MARK_FECN_CNT (DCC_CSRS + 0x000000000330)
+#define DCC_PRF_PORT_RCV_BECN_CNT (DCC_CSRS + 0x000000000290)
+#define DCC_PRF_PORT_RCV_BUBBLE_CNT (DCC_CSRS + 0x0000000002E0)
+#define DCC_PRF_PORT_RCV_CORRECTABLE_CNT (DCC_CSRS + 0x000000000140)
+#define DCC_PRF_PORT_RCV_DATA_CNT (DCC_CSRS + 0x000000000198)
+#define DCC_PRF_PORT_RCV_FECN_CNT (DCC_CSRS + 0x000000000240)
+#define DCC_PRF_PORT_RCV_MULTICAST_PKT_CNT (DCC_CSRS + 0x000000000130)
+#define DCC_PRF_PORT_RCV_PKTS_CNT (DCC_CSRS + 0x0000000001A8)
+#define DCC_PRF_PORT_VL_MARK_FECN_CNT (DCC_CSRS + 0x000000000338)
+#define DCC_PRF_PORT_VL_RCV_BECN_CNT (DCC_CSRS + 0x000000000298)
+#define DCC_PRF_PORT_VL_RCV_BUBBLE_CNT (DCC_CSRS + 0x0000000002E8)
+#define DCC_PRF_PORT_VL_RCV_DATA_CNT (DCC_CSRS + 0x0000000001B0)
+#define DCC_PRF_PORT_VL_RCV_FECN_CNT (DCC_CSRS + 0x000000000248)
+#define DCC_PRF_PORT_VL_RCV_PKTS_CNT (DCC_CSRS + 0x0000000001F8)
+#define DCC_PRF_PORT_XMIT_CORRECTABLE_CNT (DCC_CSRS + 0x000000000138)
+#define DCC_PRF_PORT_XMIT_DATA_CNT (DCC_CSRS + 0x000000000190)
+#define DCC_PRF_PORT_XMIT_MULTICAST_CNT (DCC_CSRS + 0x000000000128)
+#define DCC_PRF_PORT_XMIT_PKTS_CNT (DCC_CSRS + 0x0000000001A0)
+#define DCC_PRF_RX_FLOW_CRTL_CNT (DCC_CSRS + 0x000000000180)
+#define DCC_PRF_TX_FLOW_CRTL_CNT (DCC_CSRS + 0x000000000188)
+#define DC_DC8051_CFG_CSR_ACCESS_SEL (DC_8051_CSRS + 0x000000000110)
+#define DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK 0x2ull
+#define DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK 0x1ull
+#define DC_DC8051_CFG_EXT_DEV_0 (DC_8051_CSRS + 0x000000000118)
+#define DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK 0x1ull
+#define DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT 8
+#define DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT 16
+#define DC_DC8051_CFG_EXT_DEV_1 (DC_8051_CSRS + 0x000000000120)
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_MASK 0xFFFFull
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT 16
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SMASK 0xFFFF0000ull
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK 0x1ull
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_MASK 0xFFull
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_SHIFT 8
+#define DC_DC8051_CFG_HOST_CMD_0 (DC_8051_CSRS + 0x000000000028)
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_MASK 0xFFFFFFFFFFFFull
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_SHIFT 16
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_NEW_SMASK 0x1ull
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_MASK 0xFFull
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_SHIFT 8
+#define DC_DC8051_CFG_HOST_CMD_1 (DC_8051_CSRS + 0x000000000030)
+#define DC_DC8051_CFG_HOST_CMD_1_COMPLETED_SMASK 0x1ull
+#define DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_MASK 0xFFull
+#define DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_SHIFT 8
+#define DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_MASK 0xFFFFFFFFFFFFull
+#define DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_SHIFT 16
+#define DC_DC8051_CFG_LOCAL_GUID (DC_8051_CSRS + 0x000000000038)
+#define DC_DC8051_CFG_MODE (DC_8051_CSRS + 0x000000000070)
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL (DC_8051_CSRS + 0x000000000008)
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK 0x7FFFull
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT 0
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL_WRITE_ENA_SMASK 0x1000000ull
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL_READ_ENA_SMASK 0x10000ull
+#define DC_DC8051_CFG_RAM_ACCESS_SETUP (DC_8051_CSRS + 0x000000000000)
+#define DC_DC8051_CFG_RAM_ACCESS_SETUP_AUTO_INCR_ADDR_SMASK 0x100ull
+#define DC_DC8051_CFG_RAM_ACCESS_SETUP_RAM_SEL_SMASK 0x1ull
+#define DC_DC8051_CFG_RAM_ACCESS_STATUS (DC_8051_CSRS + 0x000000000018)
+#define DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK 0x10000ull
+#define DC_DC8051_CFG_RAM_ACCESS_WR_DATA (DC_8051_CSRS + 0x000000000010)
+#define DC_DC8051_CFG_RAM_ACCESS_RD_DATA (DC_8051_CSRS + 0x000000000020)
+#define DC_DC8051_CFG_RST (DC_8051_CSRS + 0x000000000068)
+#define DC_DC8051_CFG_RST_CRAM_SMASK 0x2ull
+#define DC_DC8051_CFG_RST_DRAM_SMASK 0x4ull
+#define DC_DC8051_CFG_RST_IRAM_SMASK 0x8ull
+#define DC_DC8051_CFG_RST_M8051W_SMASK 0x1ull
+#define DC_DC8051_CFG_RST_SFR_SMASK 0x10ull
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051 (DC_8051_CSRS + 0x0000000000D8)
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_MASK 0xFFFFFFFFull
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_SHIFT 16
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_MASK 0xFFFFull
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_SHIFT 0
+#define DC_DC8051_ERR_CLR (DC_8051_CSRS + 0x0000000000E8)
+#define DC_DC8051_ERR_EN (DC_8051_CSRS + 0x0000000000F0)
+#define DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK 0x2ull
+#define DC_DC8051_ERR_FLG (DC_8051_CSRS + 0x0000000000E0)
+#define DC_DC8051_ERR_FLG_CRAM_MBE_SMASK 0x4ull
+#define DC_DC8051_ERR_FLG_CRAM_SBE_SMASK 0x8ull
+#define DC_DC8051_ERR_FLG_DRAM_MBE_SMASK 0x10ull
+#define DC_DC8051_ERR_FLG_DRAM_SBE_SMASK 0x20ull
+#define DC_DC8051_ERR_FLG_INVALID_CSR_ADDR_SMASK 0x400ull
+#define DC_DC8051_ERR_FLG_IRAM_MBE_SMASK 0x40ull
+#define DC_DC8051_ERR_FLG_IRAM_SBE_SMASK 0x80ull
+#define DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK 0x2ull
+#define DC_DC8051_ERR_FLG_SET_BY_8051_SMASK 0x1ull
+#define DC_DC8051_ERR_FLG_UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES_SMASK 0x100ull
+#define DC_DC8051_STS_CUR_STATE (DC_8051_CSRS + 0x000000000060)
+#define DC_DC8051_STS_CUR_STATE_FIRMWARE_MASK 0xFFull
+#define DC_DC8051_STS_CUR_STATE_FIRMWARE_SHIFT 16
+#define DC_DC8051_STS_CUR_STATE_PORT_MASK 0xFFull
+#define DC_DC8051_STS_CUR_STATE_PORT_SHIFT 0
+#define DC_DC8051_STS_LOCAL_FM_SECURITY (DC_8051_CSRS + 0x000000000050)
+#define DC_DC8051_STS_LOCAL_FM_SECURITY_DISABLED_MASK 0x1ull
+#define DC_DC8051_STS_REMOTE_FM_SECURITY (DC_8051_CSRS + 0x000000000058)
+#define DC_DC8051_STS_REMOTE_GUID (DC_8051_CSRS + 0x000000000040)
+#define DC_DC8051_STS_REMOTE_NODE_TYPE (DC_8051_CSRS + 0x000000000048)
+#define DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK 0x3ull
+#define DC_DC8051_STS_REMOTE_PORT_NO (DC_8051_CSRS + 0x000000000130)
+#define DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK 0xFFull
+#define DC_LCB_CFG_ALLOW_LINK_UP (DC_LCB_CSRS + 0x000000000128)
+#define DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT 0
+#define DC_LCB_CFG_CRC_MODE (DC_LCB_CSRS + 0x000000000058)
+#define DC_LCB_CFG_CRC_MODE_TX_VAL_SHIFT 0
+#define DC_LCB_CFG_IGNORE_LOST_RCLK (DC_LCB_CSRS + 0x000000000020)
+#define DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK 0x1ull
+#define DC_LCB_CFG_LANE_WIDTH (DC_LCB_CSRS + 0x000000000100)
+#define DC_LCB_CFG_LINK_KILL_EN (DC_LCB_CSRS + 0x000000000120)
+#define DC_LCB_CFG_LINK_KILL_EN_FLIT_INPUT_BUF_MBE_SMASK 0x100000ull
+#define DC_LCB_CFG_LINK_KILL_EN_REPLAY_BUF_MBE_SMASK 0x400000ull
+#define DC_LCB_CFG_LN_DCLK (DC_LCB_CSRS + 0x000000000060)
+#define DC_LCB_CFG_LOOPBACK (DC_LCB_CSRS + 0x0000000000F8)
+#define DC_LCB_CFG_LOOPBACK_VAL_SHIFT 0
+#define DC_LCB_CFG_RUN (DC_LCB_CSRS + 0x000000000000)
+#define DC_LCB_CFG_RUN_EN_SHIFT 0
+#define DC_LCB_CFG_RX_FIFOS_RADR (DC_LCB_CSRS + 0x000000000018)
+#define DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT 8
+#define DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT 4
+#define DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT 0
+#define DC_LCB_CFG_TX_FIFOS_RADR (DC_LCB_CSRS + 0x000000000010)
+#define DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT 0
+#define DC_LCB_CFG_TX_FIFOS_RESET (DC_LCB_CSRS + 0x000000000008)
+#define DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT 0
+#define DC_LCB_CFG_REINIT_AS_SLAVE (DC_LCB_CSRS + 0x000000000030)
+#define DC_LCB_CFG_CNT_FOR_SKIP_STALL (DC_LCB_CSRS + 0x000000000040)
+#define DC_LCB_CFG_CLK_CNTR (DC_LCB_CSRS + 0x000000000110)
+#define DC_LCB_ERR_CLR (DC_LCB_CSRS + 0x000000000308)
+#define DC_LCB_ERR_EN (DC_LCB_CSRS + 0x000000000310)
+#define DC_LCB_ERR_FLG (DC_LCB_CSRS + 0x000000000300)
+#define DC_LCB_ERR_FLG_REDUNDANT_FLIT_PARITY_ERR_SMASK 0x20000000ull
+#define DC_LCB_ERR_FLG_NEG_EDGE_LINK_TRANSFER_ACTIVE_SMASK 0x10000000ull
+#define DC_LCB_ERR_FLG_HOLD_REINIT_SMASK 0x8000000ull
+#define DC_LCB_ERR_FLG_RST_FOR_INCOMPLT_RND_TRIP_SMASK 0x4000000ull
+#define DC_LCB_ERR_FLG_RST_FOR_LINK_TIMEOUT_SMASK 0x2000000ull
+#define DC_LCB_ERR_FLG_CREDIT_RETURN_FLIT_MBE_SMASK 0x1000000ull
+#define DC_LCB_ERR_FLG_REPLAY_BUF_SBE_SMASK 0x800000ull
+#define DC_LCB_ERR_FLG_REPLAY_BUF_MBE_SMASK 0x400000ull
+#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_SBE_SMASK 0x200000ull
+#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_MBE_SMASK 0x100000ull
+#define DC_LCB_ERR_FLG_VL_ACK_INPUT_WRONG_CRC_MODE_SMASK 0x80000ull
+#define DC_LCB_ERR_FLG_VL_ACK_INPUT_PARITY_ERR_SMASK 0x40000ull
+#define DC_LCB_ERR_FLG_VL_ACK_INPUT_BUF_OFLW_SMASK 0x20000ull
+#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_OFLW_SMASK 0x10000ull
+#define DC_LCB_ERR_FLG_ILLEGAL_FLIT_ENCODING_SMASK 0x8000ull
+#define DC_LCB_ERR_FLG_ILLEGAL_NULL_LTP_SMASK 0x4000ull
+#define DC_LCB_ERR_FLG_UNEXPECTED_ROUND_TRIP_MARKER_SMASK 0x2000ull
+#define DC_LCB_ERR_FLG_UNEXPECTED_REPLAY_MARKER_SMASK 0x1000ull
+#define DC_LCB_ERR_FLG_RCLK_STOPPED_SMASK 0x800ull
+#define DC_LCB_ERR_FLG_CRC_ERR_CNT_HIT_LIMIT_SMASK 0x400ull
+#define DC_LCB_ERR_FLG_REINIT_FOR_LN_DEGRADE_SMASK 0x200ull
+#define DC_LCB_ERR_FLG_REINIT_FROM_PEER_SMASK 0x100ull
+#define DC_LCB_ERR_FLG_SEQ_CRC_ERR_SMASK 0x80ull
+#define DC_LCB_ERR_FLG_RX_LESS_THAN_FOUR_LNS_SMASK 0x40ull
+#define DC_LCB_ERR_FLG_TX_LESS_THAN_FOUR_LNS_SMASK 0x20ull
+#define DC_LCB_ERR_FLG_LOST_REINIT_STALL_OR_TOS_SMASK 0x10ull
+#define DC_LCB_ERR_FLG_ALL_LNS_FAILED_REINIT_TEST_SMASK 0x8ull
+#define DC_LCB_ERR_FLG_RST_FOR_FAILED_DESKEW_SMASK 0x4ull
+#define DC_LCB_ERR_FLG_INVALID_CSR_ADDR_SMASK 0x2ull
+#define DC_LCB_ERR_FLG_CSR_PARITY_ERR_SMASK 0x1ull
+#define DC_LCB_ERR_INFO_CRC_ERR_LN0 (DC_LCB_CSRS + 0x000000000328)
+#define DC_LCB_ERR_INFO_CRC_ERR_LN1 (DC_LCB_CSRS + 0x000000000330)
+#define DC_LCB_ERR_INFO_CRC_ERR_LN2 (DC_LCB_CSRS + 0x000000000338)
+#define DC_LCB_ERR_INFO_CRC_ERR_LN3 (DC_LCB_CSRS + 0x000000000340)
+#define DC_LCB_ERR_INFO_CRC_ERR_MULTI_LN (DC_LCB_CSRS + 0x000000000348)
+#define DC_LCB_ERR_INFO_ESCAPE_0_ONLY_CNT (DC_LCB_CSRS + 0x000000000368)
+#define DC_LCB_ERR_INFO_ESCAPE_0_PLUS1_CNT (DC_LCB_CSRS + 0x000000000370)
+#define DC_LCB_ERR_INFO_ESCAPE_0_PLUS2_CNT (DC_LCB_CSRS + 0x000000000378)
+#define DC_LCB_ERR_INFO_MISC_FLG_CNT (DC_LCB_CSRS + 0x000000000390)
+#define DC_LCB_ERR_INFO_REINIT_FROM_PEER_CNT (DC_LCB_CSRS + 0x000000000380)
+#define DC_LCB_ERR_INFO_RX_REPLAY_CNT (DC_LCB_CSRS + 0x000000000358)
+#define DC_LCB_ERR_INFO_SBE_CNT (DC_LCB_CSRS + 0x000000000388)
+#define DC_LCB_ERR_INFO_SEQ_CRC_CNT (DC_LCB_CSRS + 0x000000000360)
+#define DC_LCB_ERR_INFO_TOTAL_CRC_ERR (DC_LCB_CSRS + 0x000000000320)
+#define DC_LCB_ERR_INFO_TX_REPLAY_CNT (DC_LCB_CSRS + 0x000000000350)
+#define DC_LCB_PG_DBG_FLIT_CRDTS_CNT (DC_LCB_CSRS + 0x000000000580)
+#define DC_LCB_PG_STS_PAUSE_COMPLETE_CNT (DC_LCB_CSRS + 0x0000000005F8)
+#define DC_LCB_PG_STS_TX_MBE_CNT (DC_LCB_CSRS + 0x000000000608)
+#define DC_LCB_PG_STS_TX_SBE_CNT (DC_LCB_CSRS + 0x000000000600)
+#define DC_LCB_PRF_ACCEPTED_LTP_CNT (DC_LCB_CSRS + 0x000000000408)
+#define DC_LCB_PRF_CLK_CNTR (DC_LCB_CSRS + 0x000000000420)
+#define DC_LCB_PRF_GOOD_LTP_CNT (DC_LCB_CSRS + 0x000000000400)
+#define DC_LCB_PRF_RX_FLIT_CNT (DC_LCB_CSRS + 0x000000000410)
+#define DC_LCB_PRF_TX_FLIT_CNT (DC_LCB_CSRS + 0x000000000418)
+#define DC_LCB_STS_LINK_TRANSFER_ACTIVE (DC_LCB_CSRS + 0x000000000468)
+#define DC_LCB_STS_ROUND_TRIP_LTP_CNT (DC_LCB_CSRS + 0x0000000004B0)
+#define RCV_BUF_OVFL_CNT 10
+#define RCV_CONTEXT_EGR_STALL 22
+#define RCV_DATA_PKT_CNT 0
+#define RCV_DWORD_CNT 1
+#define RCV_TID_FLOW_GEN_MISMATCH_CNT 20
+#define RCV_TID_FLOW_SEQ_MISMATCH_CNT 23
+#define RCV_TID_FULL_ERR_CNT 18
+#define RCV_TID_VALID_ERR_CNT 19
+#define RXE_NUM_32_BIT_COUNTERS 24
+#define RXE_NUM_64_BIT_COUNTERS 2
+#define RXE_NUM_RSM_INSTANCES 4
+#define RXE_NUM_TID_FLOWS 32
+#define RXE_PER_CONTEXT_OFFSET 0x0300000
+#define SEND_DATA_PKT_CNT 0
+#define SEND_DATA_PKT_VL0_CNT 12
+#define SEND_DATA_VL0_CNT 3
+#define SEND_DROPPED_PKT_CNT 5
+#define SEND_DWORD_CNT 1
+#define SEND_FLOW_STALL_CNT 4
+#define SEND_HEADERS_ERR_CNT 6
+#define SEND_LEN_ERR_CNT 1
+#define SEND_MAX_MIN_LEN_ERR_CNT 2
+#define SEND_UNDERRUN_CNT 3
+#define SEND_UNSUP_VL_ERR_CNT 0
+#define SEND_WAIT_CNT 2
+#define SEND_WAIT_VL0_CNT 21
+#define TXE_PIO_SEND_OFFSET 0x0800000
+#define ASIC_CFG_DRV_STR (ASIC + 0x000000000048)
+#define ASIC_CFG_MUTEX (ASIC + 0x000000000040)
+#define ASIC_CFG_SBUS_EXECUTE (ASIC + 0x000000000008)
+#define ASIC_CFG_SBUS_EXECUTE_EXECUTE_SMASK 0x1ull
+#define ASIC_CFG_SBUS_EXECUTE_FAST_MODE_SMASK 0x2ull
+#define ASIC_CFG_SBUS_REQUEST (ASIC + 0x000000000000)
+#define ASIC_CFG_SBUS_REQUEST_COMMAND_SHIFT 16
+#define ASIC_CFG_SBUS_REQUEST_DATA_ADDR_SHIFT 8
+#define ASIC_CFG_SBUS_REQUEST_DATA_IN_SHIFT 32
+#define ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT 0
+#define ASIC_CFG_SCRATCH (ASIC + 0x000000000020)
+#define ASIC_CFG_THERM_POLL_EN (ASIC + 0x000000000050)
+#define ASIC_EEP_ADDR_CMD (ASIC + 0x000000000308)
+#define ASIC_EEP_ADDR_CMD_EP_ADDR_MASK 0xFFFFFFull
+#define ASIC_EEP_CTL_STAT (ASIC + 0x000000000300)
+#define ASIC_EEP_CTL_STAT_EP_RESET_SMASK 0x4ull
+#define ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT 8
+#define ASIC_EEP_CTL_STAT_RESETCSR 0x0000000083818000ull
+#define ASIC_EEP_DATA (ASIC + 0x000000000310)
+#define ASIC_GPIO_CLEAR (ASIC + 0x000000000230)
+#define ASIC_GPIO_FORCE (ASIC + 0x000000000238)
+#define ASIC_GPIO_IN (ASIC + 0x000000000200)
+#define ASIC_GPIO_INVERT (ASIC + 0x000000000210)
+#define ASIC_GPIO_MASK (ASIC + 0x000000000220)
+#define ASIC_GPIO_OE (ASIC + 0x000000000208)
+#define ASIC_GPIO_OUT (ASIC + 0x000000000218)
+#define ASIC_PCIE_SD_HOST_CMD (ASIC + 0x000000000100)
+#define ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT 0
+#define ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK 0x400ull
+#define ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT 2
+#define ASIC_PCIE_SD_HOST_CMD_TIMER_MASK 0xFFFFFull
+#define ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT 12
+#define ASIC_PCIE_SD_HOST_STATUS (ASIC + 0x000000000108)
+#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_MASK 0x7ull
+#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_SHIFT 2
+#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_MASK 0x3ull
+#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_SHIFT 0
+#define ASIC_PCIE_SD_INTRPT_DATA_CODE (ASIC + 0x000000000110)
+#define ASIC_PCIE_SD_INTRPT_ENABLE (ASIC + 0x000000000118)
+#define ASIC_PCIE_SD_INTRPT_LIST (ASIC + 0x000000000180)
+#define ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT 16
+#define ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT 0
+#define ASIC_PCIE_SD_INTRPT_STATUS (ASIC + 0x000000000128)
+#define ASIC_QSFP1_CLEAR (ASIC + 0x000000000270)
+#define ASIC_QSFP1_FORCE (ASIC + 0x000000000278)
+#define ASIC_QSFP1_IN (ASIC + 0x000000000240)
+#define ASIC_QSFP1_INVERT (ASIC + 0x000000000250)
+#define ASIC_QSFP1_MASK (ASIC + 0x000000000260)
+#define ASIC_QSFP1_OE (ASIC + 0x000000000248)
+#define ASIC_QSFP1_OUT (ASIC + 0x000000000258)
+#define ASIC_QSFP1_STATUS (ASIC + 0x000000000268)
+#define ASIC_QSFP2_CLEAR (ASIC + 0x0000000002B0)
+#define ASIC_QSFP2_FORCE (ASIC + 0x0000000002B8)
+#define ASIC_QSFP2_IN (ASIC + 0x000000000280)
+#define ASIC_QSFP2_INVERT (ASIC + 0x000000000290)
+#define ASIC_QSFP2_MASK (ASIC + 0x0000000002A0)
+#define ASIC_QSFP2_OE (ASIC + 0x000000000288)
+#define ASIC_QSFP2_OUT (ASIC + 0x000000000298)
+#define ASIC_QSFP2_STATUS (ASIC + 0x0000000002A8)
+#define ASIC_STS_SBUS_COUNTERS (ASIC + 0x000000000018)
+#define ASIC_STS_SBUS_COUNTERS_EXECUTE_CNT_MASK 0xFFFFull
+#define ASIC_STS_SBUS_COUNTERS_EXECUTE_CNT_SHIFT 0
+#define ASIC_STS_SBUS_COUNTERS_RCV_DATA_VALID_CNT_MASK 0xFFFFull
+#define ASIC_STS_SBUS_COUNTERS_RCV_DATA_VALID_CNT_SHIFT 16
+#define ASIC_STS_SBUS_RESULT (ASIC + 0x000000000010)
+#define ASIC_STS_SBUS_RESULT_DONE_SMASK 0x1ull
+#define ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK 0x2ull
+#define ASIC_STS_THERM (ASIC + 0x000000000058)
+#define ASIC_STS_THERM_CRIT_TEMP_MASK 0x7FFull
+#define ASIC_STS_THERM_CRIT_TEMP_SHIFT 18
+#define ASIC_STS_THERM_CURR_TEMP_MASK 0x7FFull
+#define ASIC_STS_THERM_CURR_TEMP_SHIFT 2
+#define ASIC_STS_THERM_HI_TEMP_MASK 0x7FFull
+#define ASIC_STS_THERM_HI_TEMP_SHIFT 50
+#define ASIC_STS_THERM_LO_TEMP_MASK 0x7FFull
+#define ASIC_STS_THERM_LO_TEMP_SHIFT 34
+#define ASIC_STS_THERM_LOW_SHIFT 13
+#define CCE_COUNTER_ARRAY32 (CCE + 0x000000000060)
+#define CCE_CTRL (CCE + 0x000000000010)
+#define CCE_CTRL_RXE_RESUME_SMASK 0x800ull
+#define CCE_CTRL_SPC_FREEZE_SMASK 0x100ull
+#define CCE_CTRL_SPC_UNFREEZE_SMASK 0x200ull
+#define CCE_CTRL_TXE_RESUME_SMASK 0x2000ull
+#define CCE_DC_CTRL (CCE + 0x0000000000B8)
+#define CCE_DC_CTRL_DC_RESET_SMASK 0x1ull
+#define CCE_DC_CTRL_RESETCSR 0x0000000000000001ull
+#define CCE_ERR_CLEAR (CCE + 0x000000000050)
+#define CCE_ERR_MASK (CCE + 0x000000000048)
+#define CCE_ERR_STATUS (CCE + 0x000000000040)
+#define CCE_ERR_STATUS_CCE_CLI0_ASYNC_FIFO_PARITY_ERR_SMASK 0x40ull
+#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERROR_SMASK 0x1000ull
+#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR_SMASK \
+               0x200ull
+#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERROR_SMASK \
+               0x800ull
+#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR_SMASK \
+               0x400ull
+#define CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK 0x100ull
+#define CCE_ERR_STATUS_CCE_CSR_CFG_BUS_PARITY_ERR_SMASK 0x80ull
+#define CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK 0x1ull
+#define CCE_ERR_STATUS_CCE_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull
+#define CCE_ERR_STATUS_CCE_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull
+#define CCE_ERR_STATUS_CCE_INT_MAP_COR_ERR_SMASK 0x4000000000ull
+#define CCE_ERR_STATUS_CCE_INT_MAP_UNC_ERR_SMASK 0x8000000000ull
+#define CCE_ERR_STATUS_CCE_MSIX_CSR_PARITY_ERR_SMASK 0x10000000000ull
+#define CCE_ERR_STATUS_CCE_MSIX_TABLE_COR_ERR_SMASK 0x1000000000ull
+#define CCE_ERR_STATUS_CCE_MSIX_TABLE_UNC_ERR_SMASK 0x2000000000ull
+#define CCE_ERR_STATUS_CCE_RCPL_ASYNC_FIFO_PARITY_ERR_SMASK 0x400000000ull
+#define CCE_ERR_STATUS_CCE_RSPD_DATA_PARITY_ERR_SMASK 0x20ull
+#define CCE_ERR_STATUS_CCE_RXDMA_CONV_FIFO_PARITY_ERR_SMASK 0x800000000ull
+#define CCE_ERR_STATUS_CCE_SEG_READ_BAD_ADDR_ERR_SMASK 0x100000000ull
+#define CCE_ERR_STATUS_CCE_SEG_WRITE_BAD_ADDR_ERR_SMASK 0x200000000ull
+#define CCE_ERR_STATUS_CCE_TRGT_ACCESS_ERR_SMASK 0x10ull
+#define CCE_ERR_STATUS_CCE_TRGT_ASYNC_FIFO_PARITY_ERR_SMASK 0x8ull
+#define CCE_ERR_STATUS_CCE_TRGT_CPL_TIMEOUT_ERR_SMASK 0x40000000ull
+#define CCE_ERR_STATUS_LA_TRIGGERED_SMASK 0x80000000ull
+#define CCE_ERR_STATUS_PCIC_CPL_DAT_QCOR_ERR_SMASK 0x40000ull
+#define CCE_ERR_STATUS_PCIC_CPL_DAT_QUNC_ERR_SMASK 0x4000000ull
+#define CCE_ERR_STATUS_PCIC_CPL_HD_QCOR_ERR_SMASK 0x20000ull
+#define CCE_ERR_STATUS_PCIC_CPL_HD_QUNC_ERR_SMASK 0x2000000ull
+#define CCE_ERR_STATUS_PCIC_NPOST_DAT_QPARITY_ERR_SMASK 0x100000ull
+#define CCE_ERR_STATUS_PCIC_NPOST_HQ_PARITY_ERR_SMASK 0x80000ull
+#define CCE_ERR_STATUS_PCIC_POST_DAT_QCOR_ERR_SMASK 0x10000ull
+#define CCE_ERR_STATUS_PCIC_POST_DAT_QUNC_ERR_SMASK 0x1000000ull
+#define CCE_ERR_STATUS_PCIC_POST_HD_QCOR_ERR_SMASK 0x8000ull
+#define CCE_ERR_STATUS_PCIC_POST_HD_QUNC_ERR_SMASK 0x800000ull
+#define CCE_ERR_STATUS_PCIC_RECEIVE_PARITY_ERR_SMASK 0x20000000ull
+#define CCE_ERR_STATUS_PCIC_RETRY_MEM_COR_ERR_SMASK 0x2000ull
+#define CCE_ERR_STATUS_PCIC_RETRY_MEM_UNC_ERR_SMASK 0x200000ull
+#define CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_COR_ERR_SMASK 0x4000ull
+#define CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_UNC_ERR_SMASK 0x400000ull
+#define CCE_ERR_STATUS_PCIC_TRANSMIT_BACK_PARITY_ERR_SMASK 0x10000000ull
+#define CCE_ERR_STATUS_PCIC_TRANSMIT_FRONT_PARITY_ERR_SMASK 0x8000000ull
+#define CCE_INT_CLEAR (CCE + 0x000000110A00)
+#define CCE_INT_COUNTER_ARRAY32 (CCE + 0x000000110D00)
+#define CCE_INT_FORCE (CCE + 0x000000110B00)
+#define CCE_INT_MAP (CCE + 0x000000110500)
+#define CCE_INT_MASK (CCE + 0x000000110900)
+#define CCE_INT_STATUS (CCE + 0x000000110800)
+#define CCE_MSIX_INT_GRANTED (CCE + 0x000000110200)
+#define CCE_MSIX_TABLE_LOWER (CCE + 0x000000100000)
+#define CCE_MSIX_TABLE_UPPER (CCE + 0x000000100008)
+#define CCE_MSIX_TABLE_UPPER_RESETCSR 0x0000000100000000ull
+#define CCE_MSIX_VEC_CLR_WITHOUT_INT (CCE + 0x000000110400)
+#define CCE_PCIE_CTRL (CCE + 0x0000000000C0)
+#define CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_MASK 0x3ull
+#define CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_SHIFT 0
+#define CCE_PCIE_CTRL_PCIE_LANE_DELAY_MASK 0xFull
+#define CCE_PCIE_CTRL_PCIE_LANE_DELAY_SHIFT 2
+#define CCE_PCIE_CTRL_XMT_MARGIN_OVERWRITE_ENABLE_SHIFT 8
+#define CCE_PCIE_CTRL_XMT_MARGIN_SHIFT 9
+#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_MASK 0x1ull
+#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_SHIFT 12
+#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_MASK 0x7ull
+#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_SHIFT 13
+#define CCE_REVISION (CCE + 0x000000000000)
+#define CCE_REVISION2 (CCE + 0x000000000008)
+#define CCE_REVISION2_HFI_ID_MASK 0x1ull
+#define CCE_REVISION2_HFI_ID_SHIFT 0
+#define CCE_REVISION2_IMPL_CODE_SHIFT 8
+#define CCE_REVISION2_IMPL_REVISION_SHIFT 16
+#define CCE_REVISION_BOARD_ID_LOWER_NIBBLE_MASK 0xFull
+#define CCE_REVISION_BOARD_ID_LOWER_NIBBLE_SHIFT 32
+#define CCE_REVISION_CHIP_REV_MAJOR_MASK 0xFFull
+#define CCE_REVISION_CHIP_REV_MAJOR_SHIFT 8
+#define CCE_REVISION_CHIP_REV_MINOR_MASK 0xFFull
+#define CCE_REVISION_CHIP_REV_MINOR_SHIFT 0
+#define CCE_REVISION_SW_MASK 0xFFull
+#define CCE_REVISION_SW_SHIFT 24
+#define CCE_SCRATCH (CCE + 0x000000000020)
+#define CCE_STATUS (CCE + 0x000000000018)
+#define CCE_STATUS_RXE_FROZE_SMASK 0x2ull
+#define CCE_STATUS_RXE_PAUSED_SMASK 0x20ull
+#define CCE_STATUS_SDMA_FROZE_SMASK 0x1ull
+#define CCE_STATUS_SDMA_PAUSED_SMASK 0x10ull
+#define CCE_STATUS_TXE_FROZE_SMASK 0x4ull
+#define CCE_STATUS_TXE_PAUSED_SMASK 0x40ull
+#define CCE_STATUS_TXE_PIO_FROZE_SMASK 0x8ull
+#define CCE_STATUS_TXE_PIO_PAUSED_SMASK 0x80ull
+#define MISC_CFG_FW_CTRL (MISC + 0x000000001000) /* MISC-relative offsets and flag masks follow */
+#define MISC_CFG_FW_CTRL_FW_8051_LOADED_SMASK 0x2ull
+#define MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT 2
+#define MISC_CFG_FW_CTRL_RSA_STATUS_SMASK 0xCull /* == 0x3 << RSA_STATUS_SHIFT, i.e. a two-bit field at bits 3:2 */
+#define MISC_CFG_RSA_CMD (MISC + 0x000000000A08)
+#define MISC_CFG_RSA_MODULUS (MISC + 0x000000000400)
+#define MISC_CFG_RSA_MU (MISC + 0x000000000A10)
+#define MISC_CFG_RSA_R2 (MISC + 0x000000000000)
+#define MISC_CFG_RSA_SIGNATURE (MISC + 0x000000000200)
+#define MISC_CFG_SHA_PRELOAD (MISC + 0x000000000A00)
+#define MISC_ERR_CLEAR (MISC + 0x000000002010)
+#define MISC_ERR_MASK (MISC + 0x000000002008)
+#define MISC_ERR_STATUS (MISC + 0x000000002000) /* masks below are single bits, listed from bit 12 (0x1000) down to bit 0 */
+#define MISC_ERR_STATUS_MISC_PLL_LOCK_FAIL_ERR_SMASK 0x1000ull
+#define MISC_ERR_STATUS_MISC_MBIST_FAIL_ERR_SMASK 0x800ull
+#define MISC_ERR_STATUS_MISC_INVALID_EEP_CMD_ERR_SMASK 0x400ull
+#define MISC_ERR_STATUS_MISC_EFUSE_DONE_PARITY_ERR_SMASK 0x200ull
+#define MISC_ERR_STATUS_MISC_EFUSE_WRITE_ERR_SMASK 0x100ull
+#define MISC_ERR_STATUS_MISC_EFUSE_READ_BAD_ADDR_ERR_SMASK 0x80ull
+#define MISC_ERR_STATUS_MISC_EFUSE_CSR_PARITY_ERR_SMASK 0x40ull
+#define MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK 0x20ull
+#define MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK 0x10ull
+#define MISC_ERR_STATUS_MISC_SBUS_WRITE_FAILED_ERR_SMASK 0x8ull
+#define MISC_ERR_STATUS_MISC_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull
+#define MISC_ERR_STATUS_MISC_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull
+#define MISC_ERR_STATUS_MISC_CSR_PARITY_ERR_SMASK 0x1ull
+#define PCI_CFG_MSIX0 (PCIE + 0x0000000000B0) /* note the mixed PCI_CFG_ / PCIE_CFG_ prefixes: all six macros are PCIE-relative offsets */
+#define PCI_CFG_REG1 (PCIE + 0x000000000004)
+#define PCI_CFG_REG11 (PCIE + 0x00000000002C)
+#define PCIE_CFG_SPCIE1 (PCIE + 0x00000000014C)
+#define PCIE_CFG_SPCIE2 (PCIE + 0x000000000150)
+#define PCIE_CFG_TPH2 (PCIE + 0x000000000180)
+#define RCV_ARRAY (RXE + 0x000000200000) /* RXE-relative register offsets with their field masks/shifts follow */
+#define RCV_ARRAY_CNT (RXE + 0x000000000018)
+#define RCV_ARRAY_RT_ADDR_MASK 0xFFFFFFFFFull /* 36 one-bits */
+#define RCV_ARRAY_RT_ADDR_SHIFT 0
+#define RCV_ARRAY_RT_BUF_SIZE_SHIFT 36 /* starts directly above the 36-bit RT_ADDR field */
+#define RCV_ARRAY_RT_WRITE_ENABLE_SMASK 0x8000000000000000ull /* bit 63 */
+#define RCV_AVAIL_TIME_OUT (RXE + 0x000000100050)
+#define RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK 0xFFull
+#define RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT 0
+#define RCV_BTH_QP (RXE + 0x000000000028)
+#define RCV_BTH_QP_KDETH_QP_MASK 0xFFull
+#define RCV_BTH_QP_KDETH_QP_SHIFT 16
+#define RCV_BYPASS (RXE + 0x000000000038)
+#define RCV_CONTEXTS (RXE + 0x000000000010)
+#define RCV_COUNTER_ARRAY32 (RXE + 0x000000000400)
+#define RCV_COUNTER_ARRAY64 (RXE + 0x000000000500)
+#define RCV_CTRL (RXE + 0x000000000000)
+#define RCV_CTRL_RCV_BYPASS_ENABLE_SMASK 0x10ull
+#define RCV_CTRL_RCV_EXTENDED_PSN_ENABLE_SMASK 0x40ull
+#define RCV_CTRL_RCV_PARTITION_KEY_ENABLE_SMASK 0x4ull
+#define RCV_CTRL_RCV_PORT_ENABLE_SMASK 0x1ull
+#define RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK 0x2ull
+#define RCV_CTRL_RCV_RSM_ENABLE_SMASK 0x20ull
+#define RCV_CTRL_RX_RBUF_INIT_SMASK 0x200ull
+#define RCV_CTXT_CTRL (RXE + 0x000000100000)
+#define RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK 0x4ull
+#define RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK 0x8ull
+#define RCV_CTXT_CTRL_EGR_BUF_SIZE_MASK 0x7ull
+#define RCV_CTXT_CTRL_EGR_BUF_SIZE_SHIFT 8
+#define RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK 0x700ull /* == EGR_BUF_SIZE_MASK << EGR_BUF_SIZE_SHIFT: *_SMASK is the mask already in register position */
+#define RCV_CTXT_CTRL_ENABLE_SMASK 0x1ull
+#define RCV_CTXT_CTRL_INTR_AVAIL_SMASK 0x20ull
+#define RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK 0x2ull
+#define RCV_CTXT_CTRL_TAIL_UPD_SMASK 0x40ull
+#define RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK 0x10ull
+#define RCV_CTXT_STATUS (RXE + 0x000000100008)
+#define RCV_EGR_CTRL (RXE + 0x000000100010)
+#define RCV_EGR_CTRL_EGR_BASE_INDEX_MASK 0x1FFFull
+#define RCV_EGR_CTRL_EGR_BASE_INDEX_SHIFT 0
+#define RCV_EGR_CTRL_EGR_CNT_MASK 0x1FFull
+#define RCV_EGR_CTRL_EGR_CNT_SHIFT 32 /* shift count of 32: the shifted operand must be 64 bits wide */
+#define RCV_EGR_INDEX_HEAD (RXE + 0x000000300018)
+#define RCV_EGR_INDEX_HEAD_HEAD_MASK 0x7FFull
+#define RCV_EGR_INDEX_HEAD_HEAD_SHIFT 0
+#define RCV_ERR_CLEAR (RXE + 0x000000000070) /* receive error CLEAR/INFO/MASK/STATUS offsets and their flag masks follow */
+#define RCV_ERR_INFO (RXE + 0x000000000050)
+#define RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SC_SMASK 0x1Full /* five-bit field, bits 4:0 */
+#define RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK 0x20ull /* single bit 5, directly above the SC field */
+#define RCV_ERR_MASK (RXE + 0x000000000068)
+#define RCV_ERR_STATUS (RXE + 0x000000000060) /* the RCV_ERR_STATUS_* single-bit masks below are sorted by name, not by bit position */
+#define RCV_ERR_STATUS_RX_CSR_PARITY_ERR_SMASK 0x8000000000000000ull /* bit 63 */
+#define RCV_ERR_STATUS_RX_CSR_READ_BAD_ADDR_ERR_SMASK 0x2000000000000000ull
+#define RCV_ERR_STATUS_RX_CSR_WRITE_BAD_ADDR_ERR_SMASK \
+               0x4000000000000000ull
+#define RCV_ERR_STATUS_RX_DC_INTF_PARITY_ERR_SMASK 0x2ull
+#define RCV_ERR_STATUS_RX_DC_SOP_EOP_PARITY_ERR_SMASK 0x200ull
+#define RCV_ERR_STATUS_RX_DMA_CSR_COR_ERR_SMASK 0x1ull /* bit 0 */
+#define RCV_ERR_STATUS_RX_DMA_CSR_PARITY_ERR_SMASK 0x200000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK 0x1000000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_COR_ERR_SMASK \
+               0x40000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK \
+               0x20000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_DQ_FSM_ENCODING_ERR_SMASK \
+               0x800000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_EQ_FSM_ENCODING_ERR_SMASK \
+               0x400000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_FLAG_COR_ERR_SMASK 0x800ull
+#define RCV_ERR_STATUS_RX_DMA_FLAG_UNC_ERR_SMASK 0x400ull
+#define RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_COR_ERR_SMASK 0x10000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK 0x8000000000000ull
+#define RCV_ERR_STATUS_RX_HQ_INTR_CSR_PARITY_ERR_SMASK 0x200000000000ull
+#define RCV_ERR_STATUS_RX_HQ_INTR_FSM_ERR_SMASK 0x400000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_CSR_PARITY_ERR_SMASK 0x100000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_COR_ERR_SMASK \
+               0x10000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_ERR_SMASK 0x8000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART2_PARITY_ERR_SMASK \
+               0x20000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_COR_ERR_SMASK 0x80000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_UNC_ERR_SMASK 0x40000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_BAD_LOOKUP_ERR_SMASK 0x40000000ull
+#define RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_COR_ERR_SMASK 0x100000ull
+#define RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_UNC_ERR_SMASK 0x80000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QENT_CNT_PARITY_ERR_SMASK 0x400000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QEOPDW_PARITY_ERR_SMASK 0x10000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QHD_PTR_PARITY_ERR_SMASK 0x2000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QHEAD_BUF_NUM_PARITY_ERR_SMASK \
+               0x200000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QNEXT_BUF_PARITY_ERR_SMASK 0x800000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QNUM_OF_PKT_PARITY_ERR_SMASK \
+               0x8000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QTL_PTR_PARITY_ERR_SMASK 0x4000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QVLD_BIT_PARITY_ERR_SMASK 0x1000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CTX_ID_PARITY_ERR_SMASK 0x20000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DATA_COR_ERR_SMASK 0x100000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DATA_UNC_ERR_SMASK 0x80000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DESC_PART1_COR_ERR_SMASK 0x1000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DESC_PART1_UNC_ERR_SMASK 0x800000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DESC_PART2_COR_ERR_SMASK 0x4000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DESC_PART2_UNC_ERR_SMASK 0x2000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_EMPTY_ERR_SMASK 0x100000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FL_INITDONE_PARITY_ERR_SMASK 0x800000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR_SMASK \
+               0x1000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FL_RD_ADDR_PARITY_ERR_SMASK 0x200000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FL_WR_ADDR_PARITY_ERR_SMASK 0x400000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FREE_LIST_COR_ERR_SMASK 0x4000ull
+#define RCV_ERR_STATUS_RX_RBUF_FREE_LIST_UNC_ERR_SMASK 0x2000ull
+#define RCV_ERR_STATUS_RX_RBUF_FULL_ERR_SMASK 0x80000000ull
+#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_COR_ERR_SMASK 0x40000ull
+#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR_SMASK 0x10000ull
+#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_ERR_SMASK 0x8000ull
+#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_UNC_ERR_SMASK 0x20000ull
+#define RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_COR_ERR_SMASK 0x4000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_UNC_ERR_SMASK 0x2000000000ull
+#define RCV_ERR_STATUS_RX_RCV_CSR_PARITY_ERR_SMASK 0x100ull
+#define RCV_ERR_STATUS_RX_RCV_DATA_COR_ERR_SMASK 0x20ull
+#define RCV_ERR_STATUS_RX_RCV_DATA_UNC_ERR_SMASK 0x10ull
+#define RCV_ERR_STATUS_RX_RCV_FSM_ENCODING_ERR_SMASK 0x1000ull
+#define RCV_ERR_STATUS_RX_RCV_HDR_COR_ERR_SMASK 0x8ull
+#define RCV_ERR_STATUS_RX_RCV_HDR_UNC_ERR_SMASK 0x4ull
+#define RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_COR_ERR_SMASK 0x80ull
+#define RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_UNC_ERR_SMASK 0x40ull
+#define RCV_HDR_ADDR (RXE + 0x000000100028) /* RXE-relative offsets for the RCV_HDR_*, key, RSM, status and TID registers, with field masks/shifts */
+#define RCV_HDR_CNT (RXE + 0x000000100030)
+#define RCV_HDR_CNT_CNT_MASK 0x1FFull
+#define RCV_HDR_CNT_CNT_SHIFT 0
+#define RCV_HDR_ENT_SIZE (RXE + 0x000000100038)
+#define RCV_HDR_ENT_SIZE_ENT_SIZE_MASK 0x7ull
+#define RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT 0
+#define RCV_HDR_HEAD (RXE + 0x000000300008)
+#define RCV_HDR_HEAD_COUNTER_MASK 0xFFull
+#define RCV_HDR_HEAD_COUNTER_SHIFT 32 /* shift count of 32: the shifted operand must be 64 bits wide */
+#define RCV_HDR_HEAD_HEAD_MASK 0x7FFFFull
+#define RCV_HDR_HEAD_HEAD_SHIFT 0
+#define RCV_HDR_HEAD_HEAD_SMASK 0x7FFFFull /* equals HEAD_MASK because HEAD_SHIFT is 0 */
+#define RCV_HDR_OVFL_CNT (RXE + 0x000000100058)
+#define RCV_HDR_SIZE (RXE + 0x000000100040)
+#define RCV_HDR_SIZE_HDR_SIZE_MASK 0x1Full
+#define RCV_HDR_SIZE_HDR_SIZE_SHIFT 0
+#define RCV_HDR_TAIL (RXE + 0x000000300000)
+#define RCV_HDR_TAIL_ADDR (RXE + 0x000000100048)
+#define RCV_KEY_CTRL (RXE + 0x000000100020)
+#define RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK 0x200000000ull /* bit 33 */
+#define RCV_KEY_CTRL_JOB_KEY_VALUE_MASK 0xFFFFull
+#define RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT 0
+#define RCV_MULTICAST (RXE + 0x000000000030)
+#define RCV_PARTITION_KEY (RXE + 0x000000000200)
+#define RCV_PARTITION_KEY_PARTITION_KEY_A_MASK 0xFFFFull
+#define RCV_PARTITION_KEY_PARTITION_KEY_B_SHIFT 16 /* directly above the 16-bit PARTITION_KEY_A mask */
+#define RCV_QP_MAP_TABLE (RXE + 0x000000000100)
+#define RCV_RSM_CFG (RXE + 0x000000000600)
+#define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK 0x1ull
+#define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT 0
+#define RCV_RSM_CFG_PACKET_TYPE_SHIFT 60
+#define RCV_RSM_CFG_OFFSET_SHIFT 32
+#define RCV_RSM_MAP_TABLE (RXE + 0x000000000900)
+#define RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK 0xFFull
+#define RCV_RSM_MATCH (RXE + 0x000000000800)
+#define RCV_RSM_MATCH_MASK1_SHIFT 0
+#define RCV_RSM_MATCH_MASK2_SHIFT 16
+#define RCV_RSM_MATCH_VALUE1_SHIFT 8
+#define RCV_RSM_MATCH_VALUE2_SHIFT 24
+#define RCV_RSM_SELECT (RXE + 0x000000000700)
+#define RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT 0
+#define RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT 16
+#define RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT 32
+#define RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT 44
+#define RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT 48
+#define RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT 60
+#define RCV_STATUS (RXE + 0x000000000008)
+#define RCV_STATUS_RX_PKT_IN_PROGRESS_SMASK 0x1ull
+#define RCV_STATUS_RX_RBUF_INIT_DONE_SMASK 0x200ull
+#define RCV_STATUS_RX_RBUF_PKT_PENDING_SMASK 0x40ull
+#define RCV_TID_CTRL (RXE + 0x000000100018)
+#define RCV_TID_CTRL_TID_BASE_INDEX_MASK 0x1FFFull
+#define RCV_TID_CTRL_TID_BASE_INDEX_SHIFT 0
+#define RCV_TID_CTRL_TID_PAIR_CNT_MASK 0x1FFull
+#define RCV_TID_CTRL_TID_PAIR_CNT_SHIFT 32 /* shift count of 32: the shifted operand must be 64 bits wide */
+#define RCV_TID_FLOW_TABLE (RXE + 0x000000300800)
+#define RCV_VL15 (RXE + 0x000000000048)
+#define SEND_BTH_QP (TXE + 0x0000000000A0) /* TXE-relative offsets: SEND_BTH_QP and the SEND_CM_* credit registers */
+#define SEND_BTH_QP_KDETH_QP_MASK 0xFFull
+#define SEND_BTH_QP_KDETH_QP_SHIFT 16
+#define SEND_CM_CREDIT_USED_STATUS (TXE + 0x000000000510) /* the VL0..VL7 flags below are bits 48..55; the VL15 flag is bit 63 */
+#define SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK \
+               0x1000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL15_RETURN_CREDIT_STATUS_SMASK \
+               0x8000000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL1_RETURN_CREDIT_STATUS_SMASK \
+               0x2000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL2_RETURN_CREDIT_STATUS_SMASK \
+               0x4000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL3_RETURN_CREDIT_STATUS_SMASK \
+               0x8000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL4_RETURN_CREDIT_STATUS_SMASK \
+               0x10000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL5_RETURN_CREDIT_STATUS_SMASK \
+               0x20000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL6_RETURN_CREDIT_STATUS_SMASK \
+               0x40000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL7_RETURN_CREDIT_STATUS_SMASK \
+               0x80000000000000ull
+#define SEND_CM_CREDIT_VL (TXE + 0x000000000600)
+#define SEND_CM_CREDIT_VL15 (TXE + 0x000000000678)
+#define SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT 0
+#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_MASK 0xFFFFull
+#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT 0
+#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SMASK 0xFFFFull
+#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_MASK 0xFFFFull
+#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT 16
+#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SMASK 0xFFFF0000ull /* == SHARED_LIMIT_VL_MASK << SHARED_LIMIT_VL_SHIFT */
+#define SEND_CM_CTRL (TXE + 0x000000000500)
+#define SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK 0x8ull
+#define SEND_CM_CTRL_RESETCSR 0x0000000000000020ull /* NOTE(review): presumably the register's reset value - confirm */
+#define SEND_CM_GLOBAL_CREDIT (TXE + 0x000000000508)
+#define SEND_CM_GLOBAL_CREDIT_AU_SHIFT 16
+#define SEND_CM_GLOBAL_CREDIT_RESETCSR 0x0000094000030000ull /* NOTE(review): presumably the register's reset value - confirm */
+#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_MASK 0xFFFFull
+#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT 0
+#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK 0xFFFFull
+#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_MASK 0xFFFFull
+#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT 32
+#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK 0xFFFF00000000ull /* == TOTAL_CREDIT_LIMIT_MASK << 32 */
+#define SEND_CM_LOCAL_AU_TABLE0_TO3 (TXE + 0x000000000520)
+#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT 0 /* four entries per register, 16 bits apart (shifts 0/16/32/48) */
+#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT 16
+#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT 32
+#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT 48
+#define SEND_CM_LOCAL_AU_TABLE4_TO7 (TXE + 0x000000000528)
+#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT 0
+#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT 16
+#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT 32
+#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT 48
+#define SEND_CM_REMOTE_AU_TABLE0_TO3 (TXE + 0x000000000530)
+#define SEND_CM_REMOTE_AU_TABLE4_TO7 (TXE + 0x000000000538)
+#define SEND_CM_TIMER_CTRL (TXE + 0x000000000518)
+#define SEND_CONTEXTS (TXE + 0x000000000010) /* TXE-relative offsets: global send control plus the SEND_CTXT_* registers */
+#define SEND_CONTEXT_SET_CTRL (TXE + 0x000000000200)
+#define SEND_COUNTER_ARRAY32 (TXE + 0x000000000300)
+#define SEND_COUNTER_ARRAY64 (TXE + 0x000000000400)
+#define SEND_CTRL (TXE + 0x000000000000)
+#define SEND_CTRL_CM_RESET_SMASK 0x4ull
+#define SEND_CTRL_SEND_ENABLE_SMASK 0x1ull
+#define SEND_CTRL_VL_ARBITER_ENABLE_SMASK 0x2ull
+#define SEND_CTXT_CHECK_ENABLE (TXE + 0x000000100080) /* the flags below are single bits, sorted by name rather than by bit position */
+#define SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK 0x1ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK 0x4ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_OPCODE_SMASK 0x20ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK 0x8ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_SLID_SMASK 0x10ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK 0x40ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_VL_SMASK 0x2ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK 0x20000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK \
+               0x200000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK 0x800ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK 0x400ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK 0x1000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK 0x2000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK \
+               0x100000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK 0x10000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK 0x200ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_SMASK 0x100ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK \
+               0x80000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK \
+               0x40000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK \
+               0x8000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK \
+               0x4000ull
+#define SEND_CTXT_CHECK_JOB_KEY (TXE + 0x000000100090)
+#define SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK 0x100000000ull /* bit 32 */
+#define SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK 0xFFFF0000ull /* bits 31:16, directly above the 16-bit VALUE field */
+#define SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK 0xFFFFull
+#define SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT 0
+#define SEND_CTXT_CHECK_OPCODE (TXE + 0x0000001000A8)
+#define SEND_CTXT_CHECK_OPCODE_MASK_SHIFT 8
+#define SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT 0
+#define SEND_CTXT_CHECK_PARTITION_KEY (TXE + 0x000000100098)
+#define SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK 0xFFFFull
+#define SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT 0
+#define SEND_CTXT_CHECK_SLID (TXE + 0x0000001000A0)
+#define SEND_CTXT_CHECK_SLID_MASK_MASK 0xFFFFull
+#define SEND_CTXT_CHECK_SLID_MASK_SHIFT 16
+#define SEND_CTXT_CHECK_SLID_VALUE_MASK 0xFFFFull
+#define SEND_CTXT_CHECK_SLID_VALUE_SHIFT 0
+#define SEND_CTXT_CHECK_VL (TXE + 0x000000100088)
+#define SEND_CTXT_CREDIT_CTRL (TXE + 0x000000100010)
+#define SEND_CTXT_CREDIT_CTRL_CREDIT_INTR_SMASK 0x20000ull
+#define SEND_CTXT_CREDIT_CTRL_EARLY_RETURN_SMASK 0x10000ull
+#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_MASK 0x7FFull
+#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SHIFT 0
+#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SMASK 0x7FFull /* equals THRESHOLD_MASK because THRESHOLD_SHIFT is 0 */
+#define SEND_CTXT_CREDIT_FORCE (TXE + 0x000000100028)
+#define SEND_CTXT_CREDIT_FORCE_FORCE_RETURN_SMASK 0x1ull
+#define SEND_CTXT_CREDIT_RETURN_ADDR (TXE + 0x000000100020)
+#define SEND_CTXT_CREDIT_RETURN_ADDR_ADDRESS_SMASK 0xFFFFFFFFFFC0ull /* bits 47:6; the low six bits are outside the field */
+#define SEND_CTXT_CTRL (TXE + 0x000000100000)
+#define SEND_CTXT_CTRL_CTXT_BASE_MASK 0x3FFFull
+#define SEND_CTXT_CTRL_CTXT_BASE_SHIFT 32
+#define SEND_CTXT_CTRL_CTXT_DEPTH_MASK 0x7FFull
+#define SEND_CTXT_CTRL_CTXT_DEPTH_SHIFT 48
+#define SEND_CTXT_CTRL_CTXT_ENABLE_SMASK 0x1ull
+#define SEND_CTXT_ERR_CLEAR (TXE + 0x000000100050)
+#define SEND_CTXT_ERR_MASK (TXE + 0x000000100048)
+#define SEND_CTXT_ERR_STATUS (TXE + 0x000000100040) /* five single-bit error flags, 0x1 through 0x10 */
+#define SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK 0x2ull
+#define SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK 0x1ull
+#define SEND_CTXT_ERR_STATUS_PIO_WRITE_CROSSES_BOUNDARY_ERR_SMASK 0x4ull
+#define SEND_CTXT_ERR_STATUS_PIO_WRITE_OUT_OF_BOUNDS_ERR_SMASK 0x10ull
+#define SEND_CTXT_ERR_STATUS_PIO_WRITE_OVERFLOW_ERR_SMASK 0x8ull
+#define SEND_CTXT_STATUS (TXE + 0x000000100008)
+#define SEND_CTXT_STATUS_CTXT_HALTED_SMASK 0x1ull
+#define SEND_DMA_BASE_ADDR (TXE + 0x000000200010) /* TXE-relative offsets and field masks for the SEND_DMA_* registers */
+#define SEND_DMA_CHECK_ENABLE (TXE + 0x000000200080) /* each flag below has the same value as the like-named SEND_CTXT_CHECK_ENABLE_* mask */
+#define SEND_DMA_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_ENABLE_SMASK 0x1ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK 0x4ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_OPCODE_SMASK 0x20ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK 0x8ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_SLID_SMASK 0x10ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK 0x40ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_VL_SMASK 0x2ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK 0x20000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK 0x200000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK \
+               0x100000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK 0x200ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_SMASK 0x100ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK \
+               0x80000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK 0x40000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK \
+               0x8000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK 0x4000ull
+#define SEND_DMA_CHECK_JOB_KEY (TXE + 0x000000200090)
+#define SEND_DMA_CHECK_OPCODE (TXE + 0x0000002000A8)
+#define SEND_DMA_CHECK_PARTITION_KEY (TXE + 0x000000200098)
+#define SEND_DMA_CHECK_SLID (TXE + 0x0000002000A0)
+#define SEND_DMA_CHECK_SLID_MASK_MASK 0xFFFFull
+#define SEND_DMA_CHECK_SLID_MASK_SHIFT 16
+#define SEND_DMA_CHECK_SLID_VALUE_MASK 0xFFFFull
+#define SEND_DMA_CHECK_SLID_VALUE_SHIFT 0
+#define SEND_DMA_CHECK_VL (TXE + 0x000000200088)
+#define SEND_DMA_CTRL (TXE + 0x000000200000)
+#define SEND_DMA_CTRL_SDMA_CLEANUP_SMASK 0x4ull
+#define SEND_DMA_CTRL_SDMA_ENABLE_SMASK 0x1ull
+#define SEND_DMA_CTRL_SDMA_HALT_SMASK 0x2ull
+#define SEND_DMA_CTRL_SDMA_INT_ENABLE_SMASK 0x8ull
+#define SEND_DMA_DESC_CNT (TXE + 0x000000200050)
+#define SEND_DMA_DESC_CNT_CNT_MASK 0xFFFFull
+#define SEND_DMA_DESC_CNT_CNT_SHIFT 0
+#define SEND_DMA_ENG_ERR_CLEAR (TXE + 0x000000200070)
+#define SEND_DMA_ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_MASK 0x1ull
+#define SEND_DMA_ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SHIFT 18 /* bit 18: same bit as the 0x40000 ERR_STATUS mask of the same error */
+#define SEND_DMA_ENG_ERR_MASK (TXE + 0x000000200068)
+#define SEND_DMA_ENG_ERR_STATUS (TXE + 0x000000200060) /* single-bit flags, sorted by name rather than by bit position */
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_ASSEMBLY_UNC_ERR_SMASK 0x8000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_DESC_TABLE_UNC_ERR_SMASK 0x4000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_FIRST_DESC_ERR_SMASK 0x10ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_GEN_MISMATCH_ERR_SMASK 0x2ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK 0x40ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_ADDRESS_ERR_SMASK 0x800ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_LENGTH_ERR_SMASK 0x1000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK \
+               0x40000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_SELECT_ERR_SMASK 0x400ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_STORAGE_UNC_ERR_SMASK \
+               0x20000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_LENGTH_MISMATCH_ERR_SMASK 0x80ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_MEM_READ_ERR_SMASK 0x20ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_PACKET_DESC_OVERFLOW_ERR_SMASK \
+               0x100ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_PACKET_TRACKING_UNC_ERR_SMASK \
+               0x10000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_TAIL_OUT_OF_BOUNDS_ERR_SMASK 0x8ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_TIMEOUT_ERR_SMASK 0x2000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_TOO_LONG_ERR_SMASK 0x4ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_WRONG_DW_ERR_SMASK 0x1ull
+#define SEND_DMA_ENGINES (TXE + 0x000000000018)
+#define SEND_DMA_ERR_CLEAR (TXE + 0x000000000070)
+#define SEND_DMA_ERR_MASK (TXE + 0x000000000068)
+#define SEND_DMA_ERR_STATUS (TXE + 0x000000000060)
+#define SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK 0x2ull
+#define SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_COR_ERR_SMASK 0x8ull
+#define SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK 0x4ull
+#define SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK 0x1ull
+#define SEND_DMA_HEAD (TXE + 0x000000200028)
+#define SEND_DMA_HEAD_ADDR (TXE + 0x000000200030)
+#define SEND_DMA_LEN_GEN (TXE + 0x000000200018)
+#define SEND_DMA_LEN_GEN_GENERATION_SHIFT 16
+#define SEND_DMA_LEN_GEN_LENGTH_SHIFT 6
+#define SEND_DMA_MEMORY (TXE + 0x0000002000B0)
+#define SEND_DMA_MEMORY_SDMA_MEMORY_CNT_SHIFT 16
+#define SEND_DMA_MEMORY_SDMA_MEMORY_INDEX_SHIFT 0
+#define SEND_DMA_MEM_SIZE (TXE + 0x000000000028)
+#define SEND_DMA_PRIORITY_THLD (TXE + 0x000000200038)
+#define SEND_DMA_RELOAD_CNT (TXE + 0x000000200048)
+#define SEND_DMA_STATUS (TXE + 0x000000200008)
+#define SEND_DMA_STATUS_ENG_CLEANED_UP_SMASK 0x200000000000000ull /* bit 57 */
+#define SEND_DMA_STATUS_ENG_HALTED_SMASK 0x100000000000000ull /* bit 56 */
+#define SEND_DMA_TAIL (TXE + 0x000000200020)
+#define SEND_EGRESS_CTXT_STATUS (TXE + 0x000000000800) /* TXE-relative offsets and flag masks for the SEND_EGRESS_* registers */
+#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK 0x10000ull
+#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT 0
+#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK \
+               0x3FFFull
+#define SEND_EGRESS_ERR_CLEAR (TXE + 0x000000000090)
+#define SEND_EGRESS_ERR_INFO (TXE + 0x000000000F00) /* each *_ERR_SMASK below has the same value as the SEND_CTXT_CHECK_ENABLE_* mask for the matching check */
+#define SEND_EGRESS_ERR_INFO_BAD_PKT_LEN_ERR_SMASK 0x20000ull
+#define SEND_EGRESS_ERR_INFO_BYPASS_ERR_SMASK 0x800ull
+#define SEND_EGRESS_ERR_INFO_GRH_ERR_SMASK 0x400ull
+#define SEND_EGRESS_ERR_INFO_JOB_KEY_ERR_SMASK 0x4ull
+#define SEND_EGRESS_ERR_INFO_KDETH_PACKETS_ERR_SMASK 0x1000ull
+#define SEND_EGRESS_ERR_INFO_NON_KDETH_PACKETS_ERR_SMASK 0x2000ull
+#define SEND_EGRESS_ERR_INFO_OPCODE_ERR_SMASK 0x20ull
+#define SEND_EGRESS_ERR_INFO_PARTITION_KEY_ERR_SMASK 0x8ull
+#define SEND_EGRESS_ERR_INFO_PBC_STATIC_RATE_CONTROL_ERR_SMASK 0x100000ull
+#define SEND_EGRESS_ERR_INFO_PBC_TEST_ERR_SMASK 0x10000ull
+#define SEND_EGRESS_ERR_INFO_RAW_ERR_SMASK 0x100ull
+#define SEND_EGRESS_ERR_INFO_RAW_IPV6_ERR_SMASK 0x200ull
+#define SEND_EGRESS_ERR_INFO_SLID_ERR_SMASK 0x10ull
+#define SEND_EGRESS_ERR_INFO_TOO_LONG_BYPASS_PACKETS_ERR_SMASK 0x80000ull
+#define SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK 0x40000ull
+#define SEND_EGRESS_ERR_INFO_TOO_SMALL_BYPASS_PACKETS_ERR_SMASK 0x8000ull
+#define SEND_EGRESS_ERR_INFO_TOO_SMALL_IB_PACKETS_ERR_SMASK 0x4000ull
+#define SEND_EGRESS_ERR_INFO_VL_ERR_SMASK 0x2ull
+#define SEND_EGRESS_ERR_INFO_VL_MAPPING_ERR_SMASK 0x40ull
+#define SEND_EGRESS_ERR_MASK (TXE + 0x000000000088)
+#define SEND_EGRESS_ERR_SOURCE (TXE + 0x000000000F08)
+#define SEND_EGRESS_ERR_STATUS (TXE + 0x000000000080) /* below: SDMA0..15 disallowed-packet flags are bits 16..31; LAUNCH_FIFO0..8 UNC flags are bits 32..40 and COR flags bits 48..56 */
+#define SEND_EGRESS_ERR_STATUS_TX_CONFIG_PARITY_ERR_SMASK 0x8000ull
+#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_OVERRUN_ERR_SMASK \
+               0x200000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_PARITY_ERR_SMASK \
+               0x20000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK \
+               0x800000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_COR_ERR_SMASK \
+               0x2000000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_UNC_ERR_SMASK \
+               0x200000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_UNDERRUN_OR_PARITY_ERR_SMASK \
+               0x8ull
+#define SEND_EGRESS_ERR_STATUS_TX_HCRC_INSERTION_ERR_SMASK \
+               0x400000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_ILLEGAL_VL_ERR_SMASK 0x1000ull
+#define SEND_EGRESS_ERR_STATUS_TX_INCORRECT_LINK_STATE_ERR_SMASK 0x20ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_CSR_PARITY_ERR_SMASK 0x2000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO0_COR_ERR_SMASK \
+               0x1000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO0_UNC_OR_PARITY_ERR_SMASK \
+               0x100000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO1_COR_ERR_SMASK \
+               0x2000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO1_UNC_OR_PARITY_ERR_SMASK \
+               0x200000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO2_COR_ERR_SMASK \
+               0x4000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO2_UNC_OR_PARITY_ERR_SMASK \
+               0x400000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO3_COR_ERR_SMASK \
+               0x8000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO3_UNC_OR_PARITY_ERR_SMASK \
+               0x800000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO4_COR_ERR_SMASK \
+               0x10000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO4_UNC_OR_PARITY_ERR_SMASK \
+               0x1000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO5_COR_ERR_SMASK \
+               0x20000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO5_UNC_OR_PARITY_ERR_SMASK \
+               0x2000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO6_COR_ERR_SMASK \
+               0x40000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO6_UNC_OR_PARITY_ERR_SMASK \
+               0x4000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO7_COR_ERR_SMASK \
+               0x80000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO7_UNC_OR_PARITY_ERR_SMASK \
+               0x8000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO8_COR_ERR_SMASK \
+               0x100000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO8_UNC_OR_PARITY_ERR_SMASK \
+               0x10000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LINKDOWN_ERR_SMASK 0x10ull
+#define SEND_EGRESS_ERR_STATUS_TX_PIO_LAUNCH_INTF_PARITY_ERR_SMASK 0x80ull
+#define SEND_EGRESS_ERR_STATUS_TX_PKT_INTEGRITY_MEM_COR_ERR_SMASK 0x1ull
+#define SEND_EGRESS_ERR_STATUS_TX_PKT_INTEGRITY_MEM_UNC_ERR_SMASK 0x2ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_COR_ERR_SMASK \
+               0x1000000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_CSR_UNC_ERR_SMASK \
+               0x8000000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_UNC_ERR_SMASK \
+               0x100000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_COR_ERR_SMASK \
+               0x800000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_CSR_UNC_ERR_SMASK \
+               0x4000000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_UNC_ERR_SMASK \
+               0x80000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SB_HDR_COR_ERR_SMASK 0x400000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SB_HDR_UNC_ERR_SMASK 0x40000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SBRD_CTL_CSR_PARITY_ERR_SMASK 0x4000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SBRD_CTL_STATE_MACHINE_PARITY_ERR_SMASK \
+               0x800ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA0_DISALLOWED_PACKET_ERR_SMASK \
+               0x10000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA10_DISALLOWED_PACKET_ERR_SMASK \
+               0x4000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA11_DISALLOWED_PACKET_ERR_SMASK \
+               0x8000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA12_DISALLOWED_PACKET_ERR_SMASK \
+               0x10000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA13_DISALLOWED_PACKET_ERR_SMASK \
+               0x20000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA14_DISALLOWED_PACKET_ERR_SMASK \
+               0x40000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA15_DISALLOWED_PACKET_ERR_SMASK \
+               0x80000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA1_DISALLOWED_PACKET_ERR_SMASK \
+               0x20000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA2_DISALLOWED_PACKET_ERR_SMASK \
+               0x40000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA3_DISALLOWED_PACKET_ERR_SMASK \
+               0x80000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA4_DISALLOWED_PACKET_ERR_SMASK \
+               0x100000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA5_DISALLOWED_PACKET_ERR_SMASK \
+               0x200000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA6_DISALLOWED_PACKET_ERR_SMASK \
+               0x400000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA7_DISALLOWED_PACKET_ERR_SMASK \
+               0x800000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA8_DISALLOWED_PACKET_ERR_SMASK \
+               0x1000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA9_DISALLOWED_PACKET_ERR_SMASK \
+               0x2000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA_LAUNCH_INTF_PARITY_ERR_SMASK \
+               0x100ull
+#define SEND_EGRESS_SEND_DMA_STATUS (TXE + 0x000000000E00)
+#define SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT 0
+#define SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \
+               0x3FFFull
+#define SEND_ERR_CLEAR (TXE + 0x0000000000F0) /* TXE-relative offsets: send error, priority-list and length-check registers */
+#define SEND_ERR_MASK (TXE + 0x0000000000E8)
+#define SEND_ERR_STATUS (TXE + 0x0000000000E0)
+#define SEND_ERR_STATUS_SEND_CSR_PARITY_ERR_SMASK 0x1ull
+#define SEND_ERR_STATUS_SEND_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull
+#define SEND_ERR_STATUS_SEND_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull
+#define SEND_HIGH_PRIORITY_LIMIT (TXE + 0x000000000030)
+#define SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK 0x3FFFull
+#define SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT 0
+#define SEND_HIGH_PRIORITY_LIST (TXE + 0x000000000180)
+#define SEND_LEN_CHECK0 (TXE + 0x0000000000D0)
+#define SEND_LEN_CHECK0_LEN_VL0_MASK 0xFFFull
+#define SEND_LEN_CHECK0_LEN_VL1_SHIFT 12 /* directly above the 12-bit LEN_VL0 mask */
+#define SEND_LEN_CHECK1 (TXE + 0x0000000000D8)
+#define SEND_LEN_CHECK1_LEN_VL15_MASK 0xFFFull
+#define SEND_LEN_CHECK1_LEN_VL15_SHIFT 48 /* shift count >= 32: the shifted operand must be 64 bits wide */
+#define SEND_LEN_CHECK1_LEN_VL4_MASK 0xFFFull
+#define SEND_LEN_CHECK1_LEN_VL5_SHIFT 12 /* directly above the 12-bit LEN_VL4 mask */
+#define SEND_LOW_PRIORITY_LIST (TXE + 0x000000000100)
+#define SEND_LOW_PRIORITY_LIST_VL_MASK 0x7ull
+#define SEND_LOW_PRIORITY_LIST_VL_SHIFT 16
+#define SEND_LOW_PRIORITY_LIST_WEIGHT_MASK 0xFFull
+#define SEND_LOW_PRIORITY_LIST_WEIGHT_SHIFT 0
+#define SEND_PIO_ERR_CLEAR (TXE + 0x000000000050)
+#define SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK 0x20000ull
+#define SEND_PIO_ERR_MASK (TXE + 0x000000000048)
+#define SEND_PIO_ERR_STATUS (TXE + 0x000000000040)
+#define SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK \
+               0x1000000ull
+#define SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK 0x8000ull
+#define SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK 0x4ull
+#define SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK \
+               0x100000000ull
+#define SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_COR_ERR_SMASK 0x100000ull
+#define SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK 0x80000ull
+#define SEND_PIO_ERR_STATUS_PIO_INIT_SM_IN_ERR_SMASK 0x20000ull
+#define SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK \
+               0x200000000ull
+#define SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK 0x20ull
+#define SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK \
+               0x400000000ull
+#define SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK 0x40ull
+#define SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK \
+               0x800000000ull
+#define SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK 0x200ull
+#define SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK 0x40000ull
+#define SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK 0x10000000ull
+#define SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK 0x10000ull
+#define SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK 0x20000000ull
+#define SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK 0x8ull
+#define SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK 0x10ull
+#define SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK 0x80ull
+#define SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK \
+               0x100ull
+#define SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK 0x400ull
+#define SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK 0x400000ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK 0x8000000ull
+#define SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK 0x4000000ull
+#define SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK 0x2000000ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_COR_ERR_SMASK 0x2000ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK 0x800ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_COR_ERR_SMASK 0x4000ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK 0x1000ull
+#define SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK 0x2ull
+#define SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK 0x1ull
+#define SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK 0x200000ull
+#define SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK 0x800000ull
+#define SEND_PIO_INIT_CTXT (TXE + 0x000000000038)
+#define SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK 0x1ull
+#define SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_MASK 0xFFull
+#define SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_SHIFT 8
+#define SEND_PIO_INIT_CTXT_PIO_INIT_ERR_SMASK 0x8ull
+#define SEND_PIO_INIT_CTXT_PIO_INIT_IN_PROGRESS_SMASK 0x4ull
+#define SEND_PIO_INIT_CTXT_PIO_SINGLE_CTXT_INIT_SMASK 0x2ull
+#define SEND_PIO_MEM_SIZE (TXE + 0x000000000020)
+#define SEND_SC2VLT0 (TXE + 0x0000000000B0)
+#define SEND_SC2VLT0_SC0_SHIFT 0
+#define SEND_SC2VLT0_SC1_SHIFT 8
+#define SEND_SC2VLT0_SC2_SHIFT 16
+#define SEND_SC2VLT0_SC3_SHIFT 24
+#define SEND_SC2VLT0_SC4_SHIFT 32
+#define SEND_SC2VLT0_SC5_SHIFT 40
+#define SEND_SC2VLT0_SC6_SHIFT 48
+#define SEND_SC2VLT0_SC7_SHIFT 56
+#define SEND_SC2VLT1 (TXE + 0x0000000000B8)
+#define SEND_SC2VLT1_SC10_SHIFT 16
+#define SEND_SC2VLT1_SC11_SHIFT 24
+#define SEND_SC2VLT1_SC12_SHIFT 32
+#define SEND_SC2VLT1_SC13_SHIFT 40
+#define SEND_SC2VLT1_SC14_SHIFT 48
+#define SEND_SC2VLT1_SC15_SHIFT 56
+#define SEND_SC2VLT1_SC8_SHIFT 0
+#define SEND_SC2VLT1_SC9_SHIFT 8
+#define SEND_SC2VLT2 (TXE + 0x0000000000C0)
+#define SEND_SC2VLT2_SC16_SHIFT 0
+#define SEND_SC2VLT2_SC17_SHIFT 8
+#define SEND_SC2VLT2_SC18_SHIFT 16
+#define SEND_SC2VLT2_SC19_SHIFT 24
+#define SEND_SC2VLT2_SC20_SHIFT 32
+#define SEND_SC2VLT2_SC21_SHIFT 40
+#define SEND_SC2VLT2_SC22_SHIFT 48
+#define SEND_SC2VLT2_SC23_SHIFT 56
+#define SEND_SC2VLT3 (TXE + 0x0000000000C8)
+#define SEND_SC2VLT3_SC24_SHIFT 0
+#define SEND_SC2VLT3_SC25_SHIFT 8
+#define SEND_SC2VLT3_SC26_SHIFT 16
+#define SEND_SC2VLT3_SC27_SHIFT 24
+#define SEND_SC2VLT3_SC28_SHIFT 32
+#define SEND_SC2VLT3_SC29_SHIFT 40
+#define SEND_SC2VLT3_SC30_SHIFT 48
+#define SEND_SC2VLT3_SC31_SHIFT 56
+#define SEND_STATIC_RATE_CONTROL (TXE + 0x0000000000A8)
+#define SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT 0
+#define SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK 0xFFFFull
+#define PCIE_CFG_REG_PL2 (PCIE + 0x000000000708)
+#define PCIE_CFG_REG_PL3 (PCIE + 0x00000000070C)
+#define PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SHIFT 27
+#define PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SMASK 0x38000000
+#define PCIE_CFG_REG_PL102 (PCIE + 0x000000000898)
+#define PCIE_CFG_REG_PL102_GEN3_EQ_POST_CURSOR_PSET_SHIFT 12
+#define PCIE_CFG_REG_PL102_GEN3_EQ_CURSOR_PSET_SHIFT 6
+#define PCIE_CFG_REG_PL102_GEN3_EQ_PRE_CURSOR_PSET_SHIFT 0
+#define PCIE_CFG_REG_PL103 (PCIE + 0x00000000089C)
+#define PCIE_CFG_REG_PL105 (PCIE + 0x0000000008A4)
+#define PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK 0x1ull
+#define PCIE_CFG_REG_PL2_LOW_PWR_ENT_CNT_SHIFT 24
+#define PCIE_CFG_REG_PL100 (PCIE + 0x000000000890)
+#define PCIE_CFG_REG_PL100_EQ_EIEOS_CNT_SMASK 0x400ull
+#define PCIE_CFG_REG_PL101 (PCIE + 0x000000000894)
+#define PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT 6
+#define PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT 0
+#define PCIE_CFG_REG_PL106 (PCIE + 0x0000000008A8)
+#define PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT 8
+#define PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK 0x20ull
+#define PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK 0x10ull
+#define CCE_INT_BLOCKED (CCE + 0x000000110C00)
+#define SEND_DMA_IDLE_CNT (TXE + 0x000000200040)
+#define SEND_DMA_DESC_FETCHED_CNT (TXE + 0x000000200058)
+#define CCE_MSIX_PBA_OFFSET 0X0110000
+
+#endif          /* DEF_CHIP_REG */
diff --git a/drivers/infiniband/hw/hfi1/common.h b/drivers/infiniband/hw/hfi1/common.h
new file mode 100644 (file)
index 0000000..fcc9c21
--- /dev/null
@@ -0,0 +1,411 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef _COMMON_H
+#define _COMMON_H
+
+#include <rdma/hfi/hfi1_user.h>
+
+/*
+ * This file contains defines, structures, etc. that are used
+ * to communicate between kernel and user code.
+ */
+
+/* version of protocol header (known to chip also). In the long run,
+ * we should be able to generate and accept a range of version numbers;
+ * for now we only accept one, and it's compiled in.
+ */
+#define IPS_PROTO_VERSION 2
+
+/*
+ * These are compile time constants that you may want to enable or disable
+ * if you are trying to debug problems with code or performance.
+ * HFI1_VERBOSE_TRACING define as 1 if you want additional tracing in
+ * fast path code
+ * HFI1_TRACE_REGWRITES define as 1 if you want register writes to be
+ * traced in fast path code
+ * _HFI1_TRACING define as 0 if you want to remove all tracing in a
+ * compilation unit
+ */
+
+/*
+ * If a packet's QP[23:16] bits match this value, then it is
+ * a PSM packet and the hardware will expect a KDETH header
+ * following the BTH.
+ */
+#define DEFAULT_KDETH_QP 0x80
+
+/* driver/hw feature set bitmask */
+#define HFI1_CAP_USER_SHIFT      24
+#define HFI1_CAP_MASK            ((1UL << HFI1_CAP_USER_SHIFT) - 1)
+/* locked flag - if set, only HFI1_CAP_WRITABLE_MASK bits can be set */
+#define HFI1_CAP_LOCKED_SHIFT    63
+#define HFI1_CAP_LOCKED_MASK     0x1ULL
+#define HFI1_CAP_LOCKED_SMASK    (HFI1_CAP_LOCKED_MASK << HFI1_CAP_LOCKED_SHIFT)
+/* extra bits used between kernel and user processes */
+#define HFI1_CAP_MISC_SHIFT      (HFI1_CAP_USER_SHIFT * 2)
+#define HFI1_CAP_MISC_MASK       ((1ULL << (HFI1_CAP_LOCKED_SHIFT - \
+                                          HFI1_CAP_MISC_SHIFT)) - 1)
+
+#define HFI1_CAP_KSET(cap) ({ hfi1_cap_mask |= HFI1_CAP_##cap; hfi1_cap_mask; })
+#define HFI1_CAP_KCLEAR(cap)                                           \
+       ({                                                              \
+               hfi1_cap_mask &= ~HFI1_CAP_##cap;                       \
+               hfi1_cap_mask;                                          \
+       })
+#define HFI1_CAP_USET(cap)                                             \
+       ({                                                              \
+               hfi1_cap_mask |= (HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \
+               hfi1_cap_mask;                                          \
+               })
+#define HFI1_CAP_UCLEAR(cap)                                           \
+       ({                                                              \
+               hfi1_cap_mask &= ~(HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \
+               hfi1_cap_mask;                                          \
+       })
+#define HFI1_CAP_SET(cap)                                              \
+       ({                                                              \
+               hfi1_cap_mask |= (HFI1_CAP_##cap | (HFI1_CAP_##cap <<   \
+                                                 HFI1_CAP_USER_SHIFT)); \
+               hfi1_cap_mask;                                          \
+       })
+#define HFI1_CAP_CLEAR(cap)                                            \
+       ({                                                              \
+               hfi1_cap_mask &= ~(HFI1_CAP_##cap |                     \
+                                 (HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT)); \
+               hfi1_cap_mask;                                          \
+       })
+#define HFI1_CAP_LOCK()                                                        \
+       ({ hfi1_cap_mask |= HFI1_CAP_LOCKED_SMASK; hfi1_cap_mask; })
+#define HFI1_CAP_LOCKED() (!!(hfi1_cap_mask & HFI1_CAP_LOCKED_SMASK))
+/*
+ * The set of capability bits that can be changed after initial load
+ * This set is the same for kernel and user contexts. However, for
+ * user contexts, the set can be further filtered by using the
+ * HFI1_CAP_RESERVED_MASK bits.
+ */
+#define HFI1_CAP_WRITABLE_MASK   (HFI1_CAP_SDMA_AHG |                  \
+                                 HFI1_CAP_HDRSUPP |                    \
+                                 HFI1_CAP_MULTI_PKT_EGR |              \
+                                 HFI1_CAP_NODROP_RHQ_FULL |            \
+                                 HFI1_CAP_NODROP_EGR_FULL |            \
+                                 HFI1_CAP_ALLOW_PERM_JKEY |            \
+                                 HFI1_CAP_STATIC_RATE_CTRL |           \
+                                 HFI1_CAP_PRINT_UNIMPL |               \
+                                 HFI1_CAP_TID_UNMAP)
+/*
+ * A set of capability bits that are "global" and are not allowed to be
+ * set in the user bitmask.
+ */
+#define HFI1_CAP_RESERVED_MASK   ((HFI1_CAP_SDMA |                     \
+                                 HFI1_CAP_USE_SDMA_HEAD |              \
+                                 HFI1_CAP_EXTENDED_PSN |               \
+                                 HFI1_CAP_PRINT_UNIMPL |               \
+                                 HFI1_CAP_NO_INTEGRITY |               \
+                                 HFI1_CAP_PKEY_CHECK) <<               \
+                                HFI1_CAP_USER_SHIFT)
+/*
+ * Set of capabilities that need to be enabled for kernel context in
+ * order to be allowed for user contexts, as well.
+ */
+#define HFI1_CAP_MUST_HAVE_KERN (HFI1_CAP_STATIC_RATE_CTRL)
+/* Default enabled capabilities (both kernel and user) */
+#define HFI1_CAP_MASK_DEFAULT    (HFI1_CAP_HDRSUPP |                   \
+                                HFI1_CAP_NODROP_RHQ_FULL |             \
+                                HFI1_CAP_NODROP_EGR_FULL |             \
+                                HFI1_CAP_SDMA |                        \
+                                HFI1_CAP_PRINT_UNIMPL |                \
+                                HFI1_CAP_STATIC_RATE_CTRL |            \
+                                HFI1_CAP_PKEY_CHECK |                  \
+                                HFI1_CAP_MULTI_PKT_EGR |               \
+                                HFI1_CAP_EXTENDED_PSN |                \
+                                ((HFI1_CAP_HDRSUPP |                   \
+                                  HFI1_CAP_MULTI_PKT_EGR |             \
+                                  HFI1_CAP_STATIC_RATE_CTRL |          \
+                                  HFI1_CAP_PKEY_CHECK |                \
+                                  HFI1_CAP_EARLY_CREDIT_RETURN) <<     \
+                                 HFI1_CAP_USER_SHIFT))
+/*
+ * A bitmask of kernel/global capabilities that should be communicated
+ * to user level processes.
+ */
+#define HFI1_CAP_K2U (HFI1_CAP_SDMA |                  \
+                    HFI1_CAP_EXTENDED_PSN |            \
+                    HFI1_CAP_PKEY_CHECK |              \
+                    HFI1_CAP_NO_INTEGRITY)
+
+#define HFI1_USER_SWVERSION ((HFI1_USER_SWMAJOR << HFI1_SWMAJOR_SHIFT) | \
+                            HFI1_USER_SWMINOR)
+
+#ifndef HFI1_KERN_TYPE
+#define HFI1_KERN_TYPE 0
+#endif
+
+/*
+ * Similarly, this is the kernel version going back to the user.  It's
+ * slightly different, in that we want to tell if the driver was built as
+ * part of an Intel release, or from the driver from openfabrics.org,
+ * kernel.org, or a standard distribution, for support reasons.
+ * The high bit is 0 for non-Intel and 1 for Intel-built/supplied.
+ *
+ * It's returned by the driver to the user code during initialization in the
+ * spi_sw_version field of hfi1_base_info, so the user code can in turn
+ * check for compatibility with the kernel.
+*/
+#define HFI1_KERN_SWVERSION ((HFI1_KERN_TYPE << 31) | HFI1_USER_SWVERSION)
+
+/*
+ * Define the driver version number.  This is something that refers only
+ * to the driver itself, not the software interfaces it supports.
+ */
+#ifndef HFI1_DRIVER_VERSION_BASE
+#define HFI1_DRIVER_VERSION_BASE "0.9-294"
+#endif
+
+/* create the final driver version string */
+#ifdef HFI1_IDSTR
+#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE " " HFI1_IDSTR
+#else
+#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE
+#endif
+
+/*
+ * Diagnostics can send a packet by writing the following
+ * struct to the diag packet special file.
+ *
+ * This allows a custom PBC qword, so that special modes and deliberate
+ * changes to CRCs can be used.
+ */
+#define _DIAG_PKT_VERS 1
+struct diag_pkt {
+       __u16 version;          /* structure version */
+       __u16 unit;             /* which device */
+       __u16 sw_index;         /* send sw index to use */
+       __u16 len;              /* data length, in bytes */
+       __u16 port;             /* port number */
+       __u16 unused;           /* explicit pad: puts flags at byte offset 12 */
+       __u32 flags;            /* call flags (F_DIAGPKT_*) */
+       __u64 data;             /* user data pointer */
+       __u64 pbc;              /* PBC for the packet */
+};
+
+/* diag_pkt flags */
+#define F_DIAGPKT_WAIT 0x1     /* wait until packet is sent */
+
+/*
+ * The next set of defines are for packet headers, and chip register
+ * and memory bits that are visible to and/or used by user-mode software.
+ */
+
+/*
+ * Receive Header Flags
+ */
+#define RHF_PKT_LEN_SHIFT      0
+#define RHF_PKT_LEN_MASK       0xfffull
+#define RHF_PKT_LEN_SMASK (RHF_PKT_LEN_MASK << RHF_PKT_LEN_SHIFT)
+
+#define RHF_RCV_TYPE_SHIFT     12
+#define RHF_RCV_TYPE_MASK      0x7ull
+#define RHF_RCV_TYPE_SMASK (RHF_RCV_TYPE_MASK << RHF_RCV_TYPE_SHIFT)
+
+#define RHF_USE_EGR_BFR_SHIFT  15
+#define RHF_USE_EGR_BFR_MASK   0x1ull
+#define RHF_USE_EGR_BFR_SMASK (RHF_USE_EGR_BFR_MASK << RHF_USE_EGR_BFR_SHIFT)
+
+#define RHF_EGR_INDEX_SHIFT    16
+#define RHF_EGR_INDEX_MASK     0x7ffull
+#define RHF_EGR_INDEX_SMASK (RHF_EGR_INDEX_MASK << RHF_EGR_INDEX_SHIFT)
+
+#define RHF_DC_INFO_SHIFT      27
+#define RHF_DC_INFO_MASK       0x1ull
+#define RHF_DC_INFO_SMASK (RHF_DC_INFO_MASK << RHF_DC_INFO_SHIFT)
+
+#define RHF_RCV_SEQ_SHIFT      28
+#define RHF_RCV_SEQ_MASK       0xfull
+#define RHF_RCV_SEQ_SMASK (RHF_RCV_SEQ_MASK << RHF_RCV_SEQ_SHIFT)
+
+#define RHF_EGR_OFFSET_SHIFT   32
+#define RHF_EGR_OFFSET_MASK    0xfffull
+#define RHF_EGR_OFFSET_SMASK (RHF_EGR_OFFSET_MASK << RHF_EGR_OFFSET_SHIFT)
+#define RHF_HDRQ_OFFSET_SHIFT  44
+#define RHF_HDRQ_OFFSET_MASK   0x1ffull
+#define RHF_HDRQ_OFFSET_SMASK (RHF_HDRQ_OFFSET_MASK << RHF_HDRQ_OFFSET_SHIFT)
+#define RHF_K_HDR_LEN_ERR      (0x1ull << 53)
+#define RHF_DC_UNC_ERR         (0x1ull << 54)
+#define RHF_DC_ERR             (0x1ull << 55)
+#define RHF_RCV_TYPE_ERR_SHIFT 56
+#define RHF_RCV_TYPE_ERR_MASK  0x7ull /* 64-bit, so << 56 below is defined */
+#define RHF_RCV_TYPE_ERR_SMASK (RHF_RCV_TYPE_ERR_MASK << RHF_RCV_TYPE_ERR_SHIFT)
+#define RHF_TID_ERR            (0x1ull << 59)
+#define RHF_LEN_ERR            (0x1ull << 60)
+#define RHF_ECC_ERR            (0x1ull << 61)
+#define RHF_VCRC_ERR           (0x1ull << 62)
+#define RHF_ICRC_ERR           (0x1ull << 63)
+
+#define RHF_ERROR_SMASK 0xffe0000000000000ull          /* bits 63:53 */
+
+/* RHF receive types */
+#define RHF_RCV_TYPE_EXPECTED 0
+#define RHF_RCV_TYPE_EAGER    1
+#define RHF_RCV_TYPE_IB       2 /* normal IB, IB Raw, or IPv6 */
+#define RHF_RCV_TYPE_ERROR    3
+#define RHF_RCV_TYPE_BYPASS   4
+#define RHF_RCV_TYPE_INVALID5 5
+#define RHF_RCV_TYPE_INVALID6 6
+#define RHF_RCV_TYPE_INVALID7 7
+
+/* RHF receive type error - expected packet errors */
+#define RHF_RTE_EXPECTED_FLOW_SEQ_ERR  0x2
+#define RHF_RTE_EXPECTED_FLOW_GEN_ERR  0x4
+
+/* RHF receive type error - eager packet errors */
+#define RHF_RTE_EAGER_NO_ERR           0x0
+
+/* RHF receive type error - IB packet errors */
+#define RHF_RTE_IB_NO_ERR              0x0
+
+/* RHF receive type error - error packet errors */
+#define RHF_RTE_ERROR_NO_ERR           0x0
+#define RHF_RTE_ERROR_OP_CODE_ERR      0x1
+#define RHF_RTE_ERROR_KHDR_MIN_LEN_ERR 0x2
+#define RHF_RTE_ERROR_KHDR_HCRC_ERR    0x3
+#define RHF_RTE_ERROR_KHDR_KVER_ERR    0x4
+#define RHF_RTE_ERROR_CONTEXT_ERR      0x5
+#define RHF_RTE_ERROR_KHDR_TID_ERR     0x6
+
+/* RHF receive type error - bypass packet errors */
+#define RHF_RTE_BYPASS_NO_ERR          0x0
+
+/*
+ * This structure contains the first field common to all protocols
+ * that employ this chip.
+ */
+struct hfi1_message_header {
+       __be16 lrh[4];          /* LRH: four big-endian 16-bit words */
+};
+
+/* IB - LRH header constants */
+#define HFI1_LRH_GRH 0x0003      /* 1. word of IB LRH - next header: GRH */
+#define HFI1_LRH_BTH 0x0002      /* 1. word of IB LRH - next header: BTH */
+
+/* misc. */
+#define SIZE_OF_CRC 1
+
+#define LIM_MGMT_P_KEY       0x7FFF
+#define FULL_MGMT_P_KEY      0xFFFF
+
+#define DEFAULT_P_KEY LIM_MGMT_P_KEY
+#define HFI1_AETH_CREDIT_SHIFT 24
+#define HFI1_AETH_CREDIT_MASK 0x1F
+#define HFI1_AETH_CREDIT_INVAL 0x1F
+#define HFI1_MSN_MASK 0xFFFFFF
+#define HFI1_FECN_SHIFT 31
+#define HFI1_FECN_MASK 1
+#define HFI1_FECN_SMASK BIT(HFI1_FECN_SHIFT)
+#define HFI1_BECN_SHIFT 30
+#define HFI1_BECN_MASK 1
+#define HFI1_BECN_SMASK BIT(HFI1_BECN_SHIFT)
+
+#define HFI1_PSM_IOC_BASE_SEQ 0x0
+
+static inline __u64 rhf_to_cpu(const __le32 *rbuf) /* LE RHF -> CPU order */
+{
+       return __le64_to_cpu(*((const __le64 *)rbuf)); /* keep rbuf const */
+}
+
+static inline u64 rhf_err_flags(u64 rhf) /* RHF bits 63:53, unshifted */
+{
+       return rhf & RHF_ERROR_SMASK;
+}
+
+static inline u32 rhf_rcv_type(u64 rhf) /* RHF bits 14:12 */
+{
+       return (rhf & RHF_RCV_TYPE_SMASK) >> RHF_RCV_TYPE_SHIFT;
+}
+
+static inline u32 rhf_rcv_type_err(u64 rhf) /* RHF bits 58:56 */
+{
+       return (rhf >> RHF_RCV_TYPE_ERR_SHIFT) & RHF_RCV_TYPE_ERR_MASK;
+}
+
+/* return size is in bytes, not DWORDs */
+static inline u32 rhf_pkt_len(u64 rhf) /* RHF bits 11:0, scaled by 4 */
+{
+       return ((rhf >> RHF_PKT_LEN_SHIFT) & RHF_PKT_LEN_MASK) << 2;
+}
+
+static inline u32 rhf_egr_index(u64 rhf) /* RHF bits 26:16 */
+{
+       return (rhf & RHF_EGR_INDEX_SMASK) >> RHF_EGR_INDEX_SHIFT;
+}
+
+static inline u32 rhf_rcv_seq(u64 rhf) /* RHF bits 31:28 */
+{
+       return (rhf & RHF_RCV_SEQ_SMASK) >> RHF_RCV_SEQ_SHIFT;
+}
+
+/* returned offset is in DWORDS */
+static inline u32 rhf_hdrq_offset(u64 rhf) /* RHF bits 52:44 */
+{
+       return (rhf & RHF_HDRQ_OFFSET_SMASK) >> RHF_HDRQ_OFFSET_SHIFT;
+}
+
+static inline u64 rhf_use_egr_bfr(u64 rhf) /* RHF bit 15, left unshifted */
+{
+       return rhf & RHF_USE_EGR_BFR_SMASK;
+}
+
+static inline u64 rhf_dc_info(u64 rhf) /* RHF bit 27, left unshifted */
+{
+       return rhf & RHF_DC_INFO_SMASK;
+}
+
+static inline u32 rhf_egr_buf_offset(u64 rhf) /* RHF bits 43:32 */
+{
+       return (rhf & RHF_EGR_OFFSET_SMASK) >> RHF_EGR_OFFSET_SHIFT;
+}
+#endif /* _COMMON_H */
diff --git a/drivers/infiniband/hw/hfi1/debugfs.c b/drivers/infiniband/hw/hfi1/debugfs.c
new file mode 100644 (file)
index 0000000..dbab9d9
--- /dev/null
@@ -0,0 +1,1145 @@
+#ifdef CONFIG_DEBUG_FS
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/module.h>
+
+#include "hfi.h"
+#include "debugfs.h"
+#include "device.h"
+#include "qp.h"
+#include "sdma.h"
+
+static struct dentry *hfi1_dbg_root;
+
+#define private2dd(file) (file_inode(file)->i_private)
+#define private2ppd(file) (file_inode(file)->i_private)
+
+#define DEBUGFS_SEQ_FILE_OPS(name) \
+static const struct seq_operations _##name##_seq_ops = { \
+       .start = _##name##_seq_start, \
+       .next  = _##name##_seq_next, \
+       .stop  = _##name##_seq_stop, \
+       .show  = _##name##_seq_show \
+}
+
+#define DEBUGFS_SEQ_FILE_OPEN(name) \
+static int _##name##_open(struct inode *inode, struct file *s) \
+{ \
+       struct seq_file *seq; \
+       int ret; \
+       ret =  seq_open(s, &_##name##_seq_ops); \
+       if (ret) \
+               return ret; \
+       seq = s->private_data; \
+       seq->private = inode->i_private; \
+       return 0; \
+}
+
+#define DEBUGFS_FILE_OPS(name) \
+static const struct file_operations _##name##_file_ops = { \
+       .owner   = THIS_MODULE, \
+       .open    = _##name##_open, \
+       .read    = seq_read, \
+       .llseek  = seq_lseek, \
+       .release = seq_release \
+}
+
+#define DEBUGFS_FILE_CREATE(name, parent, data, ops, mode)     \
+do { \
+       struct dentry *ent; \
+       ent = debugfs_create_file(name, mode, parent, \
+               data, ops); \
+       if (!ent) \
+               pr_warn("create of %s failed\n", name); \
+} while (0)
+
+#define DEBUGFS_SEQ_FILE_CREATE(name, parent, data) \
+       DEBUGFS_FILE_CREATE(#name, parent, data, &_##name##_file_ops, S_IRUGO)
+
+static void *_opcode_stats_seq_start(struct seq_file *s, loff_t *pos)
+__acquires(RCU)
+{
+       struct hfi1_opcode_stats_perctx *opstats; /* for ARRAY_SIZE only */
+
+       rcu_read_lock(); /* held on both return paths below */
+       if (*pos >= ARRAY_SIZE(opstats->stats))
+               return NULL; /* *pos is past the last stats[] slot */
+       return pos; /* the position pointer doubles as the iterator */
+}
+
+static void *_opcode_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+       struct hfi1_opcode_stats_perctx *opstats; /* for ARRAY_SIZE only */
+
+       ++*pos; /* advance to the next stats[] index */
+       if (*pos >= ARRAY_SIZE(opstats->stats))
+               return NULL; /* no further stats[] slot to visit */
+       return pos;
+}
+
+static void _opcode_stats_seq_stop(struct seq_file *s, void *v)
+__releases(RCU)
+{
+       rcu_read_unlock(); /* pairs with _opcode_stats_seq_start() */
+}
+
+static int _opcode_stats_seq_show(struct seq_file *s, void *v)
+{
+       loff_t *spos = v;
+       loff_t i = *spos, j; /* i indexes stats[], j indexes dd->rcd[] */
+       u64 n_packets = 0, n_bytes = 0;
+       struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+       struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+       for (j = 0; j < dd->first_user_ctxt; j++) { /* sum across contexts */
+               if (!dd->rcd[j])
+                       continue; /* empty context slot */
+               n_packets += dd->rcd[j]->opstats->stats[i].n_packets;
+               n_bytes += dd->rcd[j]->opstats->stats[i].n_bytes;
+       }
+       if (!n_packets && !n_bytes)
+               return SEQ_SKIP; /* no output for an all-zero entry */
+       seq_printf(s, "%02llx %llu/%llu\n", i,
+                  (unsigned long long)n_packets,
+                  (unsigned long long)n_bytes);
+
+       return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(opcode_stats);
+DEBUGFS_SEQ_FILE_OPEN(opcode_stats)
+DEBUGFS_FILE_OPS(opcode_stats);
+
+static void *_ctx_stats_seq_start(struct seq_file *s, loff_t *pos)
+{
+       struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+       struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+       if (!*pos)
+               return SEQ_START_TOKEN; /* position 0 -> header line */
+       if (*pos >= dd->first_user_ctxt)
+               return NULL; /* past the last context that is reported */
+       return pos;
+}
+
+static void *_ctx_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+       struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+       struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+       if (v == SEQ_START_TOKEN)
+               return pos; /* *pos is left at 0, so index 0 comes next */
+
+       ++*pos;
+       if (*pos >= dd->first_user_ctxt)
+               return NULL; /* iteration ends at first_user_ctxt */
+       return pos;
+}
+
+static void _ctx_stats_seq_stop(struct seq_file *s, void *v)
+{
+       /* nothing allocated and no lock taken by start: nothing to undo */
+}
+
+static int _ctx_stats_seq_show(struct seq_file *s, void *v)
+{
+       loff_t *spos;
+       loff_t i, j; /* i indexes dd->rcd[], j indexes stats[] */
+       u64 n_packets = 0;
+       struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+       struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+       if (v == SEQ_START_TOKEN) {
+               seq_puts(s, "Ctx:npkts\n"); /* column header */
+               return 0;
+       }
+
+       spos = v;
+       i = *spos;
+
+       if (!dd->rcd[i])
+               return SEQ_SKIP; /* empty context slot */
+
+       for (j = 0; j < ARRAY_SIZE(dd->rcd[i]->opstats->stats); j++)
+               n_packets += dd->rcd[i]->opstats->stats[j].n_packets;
+
+       if (!n_packets)
+               return SEQ_SKIP; /* no line for a zero packet total */
+
+       seq_printf(s, "  %llu:%llu\n", i, n_packets);
+       return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(ctx_stats);
+DEBUGFS_SEQ_FILE_OPEN(ctx_stats)
+DEBUGFS_FILE_OPS(ctx_stats);
+
+static void *_qp_stats_seq_start(struct seq_file *s, loff_t *pos)
+__acquires(RCU)
+{
+       struct qp_iter *iter;
+       loff_t n = *pos; /* number of qp_iter_next() steps to take */
+
+       rcu_read_lock(); /* dropped in _qp_stats_seq_stop() */
+       iter = qp_iter_init(s->private);
+       if (!iter)
+               return NULL;
+
+       while (n--) { /* step the fresh iterator forward *pos times */
+               if (qp_iter_next(iter)) {
+                       kfree(iter); /* nonzero: release the iterator */
+                       return NULL;
+               }
+       }
+
+       return iter;
+}
+
+static void *_qp_stats_seq_next(struct seq_file *s, void *iter_ptr,
+                               loff_t *pos)
+{
+       struct qp_iter *iter = iter_ptr;
+
+       (*pos)++; /* position is bumped even when the step below fails */
+
+       if (qp_iter_next(iter)) {
+               kfree(iter); /* nonzero: release the iterator */
+               return NULL;
+       }
+
+       return iter;
+}
+
+static void _qp_stats_seq_stop(struct seq_file *s, void *iter_ptr)
+__releases(RCU)
+{
+       rcu_read_unlock(); /* NOTE(review): iter_ptr is not freed here */
+}
+
+static int _qp_stats_seq_show(struct seq_file *s, void *iter_ptr)
+{
+       struct qp_iter *iter = iter_ptr;
+
+       if (!iter)
+               return 0; /* NULL iterator: print nothing */
+
+       qp_iter_print(s, iter); /* formatting is delegated to qp_iter_print */
+
+       return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(qp_stats);
+DEBUGFS_SEQ_FILE_OPEN(qp_stats)
+DEBUGFS_FILE_OPS(qp_stats);
+
+static void *_sdes_seq_start(struct seq_file *s, loff_t *pos)
+__acquires(RCU)
+{
+       struct hfi1_ibdev *ibd;
+       struct hfi1_devdata *dd;
+
+       rcu_read_lock(); /* held on both return paths below */
+       ibd = (struct hfi1_ibdev *)s->private;
+       dd = dd_from_dev(ibd);
+       if (!dd->per_sdma || *pos >= dd->num_sdma)
+               return NULL; /* no per_sdma array, or *pos out of range */
+       return pos;
+}
+
+static void *_sdes_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+       struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+       struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+       ++*pos;
+       if (!dd->per_sdma || *pos >= dd->num_sdma)
+               return NULL;
+       return pos;
+}
+
+static void _sdes_seq_stop(struct seq_file *s, void *v)
+__releases(RCU)
+{
+       rcu_read_unlock();
+}
+
+static int _sdes_seq_show(struct seq_file *s, void *v)
+{
+       struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+       struct hfi1_devdata *dd = dd_from_dev(ibd);
+       loff_t *spos = v;
+       loff_t i = *spos;
+
+       sdma_seqfile_dump_sde(s, &dd->per_sdma[i]);
+       return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(sdes);
+DEBUGFS_SEQ_FILE_OPEN(sdes)
+DEBUGFS_FILE_OPS(sdes);
+
+/* read the per-device counters */
+static ssize_t dev_counters_read(struct file *file, char __user *buf,
+                                size_t count, loff_t *ppos)
+{
+       u64 *counters;
+       size_t avail;
+       struct hfi1_devdata *dd;
+       ssize_t rval;
+
+       rcu_read_lock();
+       dd = private2dd(file);
+       avail = hfi1_read_cntrs(dd, NULL, &counters);
+       rval =  simple_read_from_buffer(buf, count, ppos, counters, avail);
+       rcu_read_unlock();
+       return rval;
+}
+
+/* read the per-device counter names */
+static ssize_t dev_names_read(struct file *file, char __user *buf,
+                             size_t count, loff_t *ppos)
+{
+       char *names;
+       size_t avail;
+       struct hfi1_devdata *dd;
+       ssize_t rval;
+
+       rcu_read_lock();
+       dd = private2dd(file);
+       avail = hfi1_read_cntrs(dd, &names, NULL);
+       rval =  simple_read_from_buffer(buf, count, ppos, names, avail);
+       rcu_read_unlock();
+       return rval;
+}
+
+struct counter_info {
+       char *name;
+       const struct file_operations ops;
+};
+
+/*
+ * Could use file_inode(file)->i_ino to figure out which file,
+ * instead of separate routine for each, but for now, this works...
+ */
+
+/* read the per-port names (same for each port) */
+static ssize_t portnames_read(struct file *file, char __user *buf,
+                             size_t count, loff_t *ppos)
+{
+       char *names;
+       size_t avail;
+       struct hfi1_devdata *dd;
+       ssize_t rval;
+
+       rcu_read_lock();
+       dd = private2dd(file);
+       avail = hfi1_read_portcntrs(dd->pport, &names, NULL);
+       rval = simple_read_from_buffer(buf, count, ppos, names, avail);
+       rcu_read_unlock();
+       return rval;
+}
+
+/* read the per-port counters */
+static ssize_t portcntrs_debugfs_read(struct file *file, char __user *buf,
+                                     size_t count, loff_t *ppos)
+{
+       u64 *counters;
+       size_t avail;
+       struct hfi1_pportdata *ppd;
+       ssize_t rval;
+
+       rcu_read_lock();
+       ppd = private2ppd(file);
+       avail = hfi1_read_portcntrs(ppd, NULL, &counters);
+       rval = simple_read_from_buffer(buf, count, ppos, counters, avail);
+       rcu_read_unlock();
+       return rval;
+}
+
+static void check_dyn_flag(u64 scratch0, char *p, int size, int *used,
+                          int this_hfi, int hfi, u32 flag, const char *what)
+{
+       u32 mask;
+
+       mask = flag << (hfi ? CR_DYN_SHIFT : 0);
+       if (scratch0 & mask) {
+               *used += scnprintf(p + *used, size - *used,
+                                  "  0x%08x - HFI%d %s in use, %s device\n",
+                                  mask, hfi, what,
+                                  this_hfi == hfi ? "this" : "other");
+       }
+}
+
+/* Report ASIC_CFG_SCRATCH and a decoded list of the resource flags set. */
+static ssize_t asic_flags_read(struct file *file, char __user *buf,
+                              size_t count, loff_t *ppos)
+{
+       struct hfi1_pportdata *ppd;
+       struct hfi1_devdata *dd;
+       u64 scratch0;
+       char *tmp;
+       int ret = 0;
+       int size = PAGE_SIZE;
+       int used = 0;
+       int i;
+
+       /* GFP_KERNEL may sleep, so allocate before taking rcu_read_lock() */
+       tmp = kmalloc(size, GFP_KERNEL);
+       if (!tmp)
+               return -ENOMEM;
+
+       rcu_read_lock();
+       ppd = private2ppd(file);
+       dd = ppd->dd;
+
+       scratch0 = read_csr(dd, ASIC_CFG_SCRATCH);
+       used += scnprintf(tmp + used, size - used,
+                         "Resource flags: 0x%016llx\n", scratch0);
+
+       /* check permanent flag */
+       if (scratch0 & CR_THERM_INIT) {
+               used += scnprintf(tmp + used, size - used,
+                                 "  0x%08x - thermal monitoring initialized\n",
+                                 (u32)CR_THERM_INIT);
+       }
+
+       /* check each dynamic flag on each HFI */
+       for (i = 0; i < 2; i++) {
+               check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i,
+                              CR_SBUS, "SBus");
+               check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i,
+                              CR_EPROM, "EPROM");
+               check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i,
+                              CR_I2C1, "i2c chain 1");
+               check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i,
+                              CR_I2C2, "i2c chain 2");
+       }
+       used += scnprintf(tmp + used, size - used, "Write bits to clear\n");
+
+       /* tmp is private to this call: leave RCU before the copy-out */
+       rcu_read_unlock();
+       ret = simple_read_from_buffer(buf, count, ppos, tmp, used);
+       kfree(tmp);
+       return ret;
+}
+
+static ssize_t asic_flags_write(struct file *file, const char __user *buf,
+                               size_t count, loff_t *ppos)
+{
+       struct hfi1_pportdata *ppd;
+       struct hfi1_devdata *dd;
+       char *buff;
+       int ret;
+       unsigned long long value;
+       u64 scratch0;
+       u64 clear;
+
+       rcu_read_lock();
+       ppd = private2ppd(file);
+       dd = ppd->dd;
+
+       buff = kmalloc(count + 1, GFP_KERNEL);
+       if (!buff) {
+               ret = -ENOMEM;
+               goto do_return;
+       }
+
+       ret = copy_from_user(buff, buf, count);
+       if (ret > 0) {
+               ret = -EFAULT;
+               goto do_free;
+       }
+
+       /* zero terminate and read the expected integer */
+       buff[count] = 0;
+       ret = kstrtoull(buff, 0, &value);
+       if (ret)
+               goto do_free;
+       clear = value;
+
+       /* obtain exclusive access */
+       mutex_lock(&dd->asic_data->asic_resource_mutex);
+       acquire_hw_mutex(dd);
+
+       scratch0 = read_csr(dd, ASIC_CFG_SCRATCH);
+       scratch0 &= ~clear;
+       write_csr(dd, ASIC_CFG_SCRATCH, scratch0);
+       /* force write to be visible to other HFI on another OS */
+       (void)read_csr(dd, ASIC_CFG_SCRATCH);
+
+       release_hw_mutex(dd);
+       mutex_unlock(&dd->asic_data->asic_resource_mutex);
+
+       /* return the number of bytes written */
+       ret = count;
+
+ do_free:
+       kfree(buff);
+ do_return:
+       rcu_read_unlock();
+       return ret;
+}
+
+/*
+ * read the per-port QSFP data for ppd
+ */
+static ssize_t qsfp_debugfs_dump(struct file *file, char __user *buf,
+                                size_t count, loff_t *ppos)
+{
+       struct hfi1_pportdata *ppd;
+       char *tmp;
+       int ret;
+
+       /* GFP_KERNEL may sleep, so allocate before taking rcu_read_lock() */
+       tmp = kmalloc(PAGE_SIZE, GFP_KERNEL);
+       if (!tmp)
+               return -ENOMEM;
+
+       rcu_read_lock();
+       ppd = private2ppd(file);
+       ret = qsfp_dump(ppd, tmp, PAGE_SIZE);
+       rcu_read_unlock();
+       /* tmp is private to this call: copy out after leaving RCU */
+       if (ret > 0)
+               ret = simple_read_from_buffer(buf, count, ppos, tmp, ret);
+       kfree(tmp);
+       return ret;
+}
+
+/* Do an i2c write operation on the chain for the given HFI. */
+static ssize_t __i2c_debugfs_write(struct file *file, const char __user *buf,
+                                  size_t count, loff_t *ppos, u32 target)
+{
+       struct hfi1_pportdata *ppd;
+       char *buff;
+       int ret;
+       int i2c_addr;
+       int offset;
+       int total_written;
+
+       rcu_read_lock();
+       ppd = private2ppd(file);
+
+       /* byte offset format: [offsetSize][i2cAddr][offsetHigh][offsetLow] */
+       i2c_addr = (*ppos >> 16) & 0xffff;
+       offset = *ppos & 0xffff;
+
+       /* explicitly reject invalid address 0 to catch cp and cat */
+       if (i2c_addr == 0) {
+               ret = -EINVAL;
+               goto _return;
+       }
+
+       buff = kmalloc(count, GFP_KERNEL);
+       if (!buff) {
+               ret = -ENOMEM;
+               goto _return;
+       }
+
+       ret = copy_from_user(buff, buf, count);
+       if (ret > 0) {
+               ret = -EFAULT;
+               goto _free;
+       }
+
+       total_written = i2c_write(ppd, target, i2c_addr, offset, buff, count);
+       if (total_written < 0) {
+               ret = total_written;
+               goto _free;
+       }
+
+       *ppos += total_written;
+
+       ret = total_written;
+
+ _free:
+       kfree(buff);
+ _return:
+       rcu_read_unlock();
+       return ret;
+}
+
+/* Do an i2c write operation on chain for HFI 0. */
+static ssize_t i2c1_debugfs_write(struct file *file, const char __user *buf,
+                                 size_t count, loff_t *ppos)
+{
+       return __i2c_debugfs_write(file, buf, count, ppos, 0);
+}
+
+/* Do an i2c write operation on chain for HFI 1. */
+static ssize_t i2c2_debugfs_write(struct file *file, const char __user *buf,
+                                 size_t count, loff_t *ppos)
+{
+       return __i2c_debugfs_write(file, buf, count, ppos, 1);
+}
+
+/* Do an i2c read operation on the chain for the given HFI. */
+static ssize_t __i2c_debugfs_read(struct file *file, char __user *buf,
+                                 size_t count, loff_t *ppos, u32 target)
+{
+       struct hfi1_pportdata *ppd;
+       char *buff;
+       int ret;
+       int i2c_addr;
+       int offset;
+       int total_read;
+
+       rcu_read_lock();
+       ppd = private2ppd(file);
+
+       /* byte offset format: [offsetSize][i2cAddr][offsetHigh][offsetLow] */
+       i2c_addr = (*ppos >> 16) & 0xffff;
+       offset = *ppos & 0xffff;
+
+       /* explicitly reject invalid address 0 to catch cp and cat */
+       if (i2c_addr == 0) {
+               ret = -EINVAL;
+               goto _return;
+       }
+
+       buff = kmalloc(count, GFP_KERNEL);
+       if (!buff) {
+               ret = -ENOMEM;
+               goto _return;
+       }
+
+       total_read = i2c_read(ppd, target, i2c_addr, offset, buff, count);
+       if (total_read < 0) {
+               ret = total_read;
+               goto _free;
+       }
+
+       *ppos += total_read;
+
+       ret = copy_to_user(buf, buff, total_read);
+       if (ret > 0) {
+               ret = -EFAULT;
+               goto _free;
+       }
+
+       ret = total_read;
+
+ _free:
+       kfree(buff);
+ _return:
+       rcu_read_unlock();
+       return ret;
+}
+
+/* Do an i2c read operation on chain for HFI 0. */
+static ssize_t i2c1_debugfs_read(struct file *file, char __user *buf,
+                                size_t count, loff_t *ppos)
+{
+       return __i2c_debugfs_read(file, buf, count, ppos, 0);
+}
+
+/* Do an i2c read operation on chain for HFI 1. */
+static ssize_t i2c2_debugfs_read(struct file *file, char __user *buf,
+                                size_t count, loff_t *ppos)
+{
+       return __i2c_debugfs_read(file, buf, count, ppos, 1);
+}
+
+/* Do a QSFP write operation on the i2c chain for the given HFI. */
+static ssize_t __qsfp_debugfs_write(struct file *file, const char __user *buf,
+                                   size_t count, loff_t *ppos, u32 target)
+{
+       struct hfi1_pportdata *ppd;
+       char *buff;
+       int ret;
+       int total_written;
+
+       rcu_read_lock();
+       if (*ppos + count > QSFP_PAGESIZE * 4) { /* base page + page00-page03 */
+               ret = -EINVAL;
+               goto _return;
+       }
+
+       ppd = private2ppd(file);
+
+       buff = kmalloc(count, GFP_KERNEL);
+       if (!buff) {
+               ret = -ENOMEM;
+               goto _return;
+       }
+
+       ret = copy_from_user(buff, buf, count);
+       if (ret > 0) {
+               ret = -EFAULT;
+               goto _free;
+       }
+
+       total_written = qsfp_write(ppd, target, *ppos, buff, count);
+       if (total_written < 0) {
+               ret = total_written;
+               goto _free;
+       }
+
+       *ppos += total_written;
+
+       ret = total_written;
+
+ _free:
+       kfree(buff);
+ _return:
+       rcu_read_unlock();
+       return ret;
+}
+
+/* Do a QSFP write operation on i2c chain for HFI 0. */
+static ssize_t qsfp1_debugfs_write(struct file *file, const char __user *buf,
+                                  size_t count, loff_t *ppos)
+{
+       return __qsfp_debugfs_write(file, buf, count, ppos, 0);
+}
+
+/* Do a QSFP write operation on i2c chain for HFI 1. */
+static ssize_t qsfp2_debugfs_write(struct file *file, const char __user *buf,
+                                  size_t count, loff_t *ppos)
+{
+       return __qsfp_debugfs_write(file, buf, count, ppos, 1);
+}
+
+/* Do a QSFP read operation on the i2c chain for the given HFI. */
+static ssize_t __qsfp_debugfs_read(struct file *file, char __user *buf,
+                                  size_t count, loff_t *ppos, u32 target)
+{
+       struct hfi1_pportdata *ppd;
+       char *buff;
+       int ret;
+       int total_read;
+
+       rcu_read_lock();
+       if (*ppos + count > QSFP_PAGESIZE * 4) { /* base page + page00-page03 */
+               ret = -EINVAL;
+               goto _return;
+       }
+
+       ppd = private2ppd(file);
+
+       buff = kmalloc(count, GFP_KERNEL);
+       if (!buff) {
+               ret = -ENOMEM;
+               goto _return;
+       }
+
+       total_read = qsfp_read(ppd, target, *ppos, buff, count);
+       if (total_read < 0) {
+               ret = total_read;
+               goto _free;
+       }
+
+       *ppos += total_read;
+
+       ret = copy_to_user(buf, buff, total_read);
+       if (ret > 0) {
+               ret = -EFAULT;
+               goto _free;
+       }
+
+       ret = total_read;
+
+ _free:
+       kfree(buff);
+ _return:
+       rcu_read_unlock();
+       return ret;
+}
+
+/* Do a QSFP read operation on i2c chain for HFI 0. */
+static ssize_t qsfp1_debugfs_read(struct file *file, char __user *buf,
+                                 size_t count, loff_t *ppos)
+{
+       return __qsfp_debugfs_read(file, buf, count, ppos, 0);
+}
+
+/* Do a QSFP read operation on i2c chain for HFI 1. */
+static ssize_t qsfp2_debugfs_read(struct file *file, char __user *buf,
+                                 size_t count, loff_t *ppos)
+{
+       return __qsfp_debugfs_read(file, buf, count, ppos, 1);
+}
+
+static int __i2c_debugfs_open(struct inode *in, struct file *fp, u32 target)
+{
+       struct hfi1_pportdata *ppd;
+       int ret;
+
+       if (!try_module_get(THIS_MODULE))
+               return -ENODEV;
+
+       ppd = private2ppd(fp);
+
+       ret = acquire_chip_resource(ppd->dd, i2c_target(target), 0);
+       if (ret) /* failed - release the module */
+               module_put(THIS_MODULE);
+
+       return ret;
+}
+
+static int i2c1_debugfs_open(struct inode *in, struct file *fp)
+{
+       return __i2c_debugfs_open(in, fp, 0);
+}
+
+static int i2c2_debugfs_open(struct inode *in, struct file *fp)
+{
+       return __i2c_debugfs_open(in, fp, 1);
+}
+
+static int __i2c_debugfs_release(struct inode *in, struct file *fp, u32 target)
+{
+       struct hfi1_pportdata *ppd;
+
+       ppd = private2ppd(fp);
+
+       release_chip_resource(ppd->dd, i2c_target(target));
+       module_put(THIS_MODULE);
+
+       return 0;
+}
+
+static int i2c1_debugfs_release(struct inode *in, struct file *fp)
+{
+       return __i2c_debugfs_release(in, fp, 0);
+}
+
+static int i2c2_debugfs_release(struct inode *in, struct file *fp)
+{
+       return __i2c_debugfs_release(in, fp, 1);
+}
+
+static int __qsfp_debugfs_open(struct inode *in, struct file *fp, u32 target)
+{
+       struct hfi1_pportdata *ppd;
+       int ret;
+
+       if (!try_module_get(THIS_MODULE))
+               return -ENODEV;
+
+       ppd = private2ppd(fp);
+
+       ret = acquire_chip_resource(ppd->dd, i2c_target(target), 0);
+       if (ret) /* failed - release the module */
+               module_put(THIS_MODULE);
+
+       return ret;
+}
+
+static int qsfp1_debugfs_open(struct inode *in, struct file *fp)
+{
+       return __qsfp_debugfs_open(in, fp, 0);
+}
+
+static int qsfp2_debugfs_open(struct inode *in, struct file *fp)
+{
+       return __qsfp_debugfs_open(in, fp, 1);
+}
+
+static int __qsfp_debugfs_release(struct inode *in, struct file *fp, u32 target)
+{
+       struct hfi1_pportdata *ppd;
+
+       ppd = private2ppd(fp);
+
+       release_chip_resource(ppd->dd, i2c_target(target));
+       module_put(THIS_MODULE);
+
+       return 0;
+}
+
+static int qsfp1_debugfs_release(struct inode *in, struct file *fp)
+{
+       return __qsfp_debugfs_release(in, fp, 0);
+}
+
+static int qsfp2_debugfs_release(struct inode *in, struct file *fp)
+{
+       return __qsfp_debugfs_release(in, fp, 1);
+}
+
+#define DEBUGFS_OPS(nm, readroutine, writeroutine)     \
+{ \
+       .name = nm, \
+       .ops = { \
+               .read = readroutine, \
+               .write = writeroutine, \
+               .llseek = generic_file_llseek, \
+       }, \
+}
+
+#define DEBUGFS_XOPS(nm, readf, writef, openf, releasef) \
+{ \
+       .name = nm, \
+       .ops = { \
+               .read = readf, \
+               .write = writef, \
+               .llseek = generic_file_llseek, \
+               .open = openf, \
+               .release = releasef \
+       }, \
+}
+
+static const struct counter_info cntr_ops[] = {
+       DEBUGFS_OPS("counter_names", dev_names_read, NULL),
+       DEBUGFS_OPS("counters", dev_counters_read, NULL),
+       DEBUGFS_OPS("portcounter_names", portnames_read, NULL),
+};
+
+static const struct counter_info port_cntr_ops[] = {
+       DEBUGFS_OPS("port%dcounters", portcntrs_debugfs_read, NULL),
+       DEBUGFS_XOPS("i2c1", i2c1_debugfs_read, i2c1_debugfs_write,
+                    i2c1_debugfs_open, i2c1_debugfs_release),
+       DEBUGFS_XOPS("i2c2", i2c2_debugfs_read, i2c2_debugfs_write,
+                    i2c2_debugfs_open, i2c2_debugfs_release),
+       DEBUGFS_OPS("qsfp_dump%d", qsfp_debugfs_dump, NULL),
+       DEBUGFS_XOPS("qsfp1", qsfp1_debugfs_read, qsfp1_debugfs_write,
+                    qsfp1_debugfs_open, qsfp1_debugfs_release),
+       DEBUGFS_XOPS("qsfp2", qsfp2_debugfs_read, qsfp2_debugfs_write,
+                    qsfp2_debugfs_open, qsfp2_debugfs_release),
+       DEBUGFS_OPS("asic_flags", asic_flags_read, asic_flags_write),
+};
+
+void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd)
+{
+       char name[sizeof("port0counters") + 1];
+       char link[10];
+       struct hfi1_devdata *dd = dd_from_dev(ibd);
+       struct hfi1_pportdata *ppd;
+       int unit = dd->unit;
+       int i, j;
+
+       if (!hfi1_dbg_root)
+               return;
+       snprintf(name, sizeof(name), "%s_%d", class_name(), unit);
+       snprintf(link, sizeof(link), "%d", unit);
+       ibd->hfi1_ibdev_dbg = debugfs_create_dir(name, hfi1_dbg_root);
+       if (!ibd->hfi1_ibdev_dbg) {
+               pr_warn("create of %s failed\n", name);
+               return;
+       }
+       ibd->hfi1_ibdev_link =
+               debugfs_create_symlink(link, hfi1_dbg_root, name);
+       if (!ibd->hfi1_ibdev_link) {
+               pr_warn("create of %s symlink failed\n", name);
+               return;
+       }
+       DEBUGFS_SEQ_FILE_CREATE(opcode_stats, ibd->hfi1_ibdev_dbg, ibd);
+       DEBUGFS_SEQ_FILE_CREATE(ctx_stats, ibd->hfi1_ibdev_dbg, ibd);
+       DEBUGFS_SEQ_FILE_CREATE(qp_stats, ibd->hfi1_ibdev_dbg, ibd);
+       DEBUGFS_SEQ_FILE_CREATE(sdes, ibd->hfi1_ibdev_dbg, ibd);
+       /* dev counter files */
+       for (i = 0; i < ARRAY_SIZE(cntr_ops); i++)
+               DEBUGFS_FILE_CREATE(cntr_ops[i].name,
+                                   ibd->hfi1_ibdev_dbg,
+                                   dd,
+                                   &cntr_ops[i].ops, S_IRUGO);
+       /* per port files */
+       for (ppd = dd->pport, j = 0; j < dd->num_pports; j++, ppd++)
+               for (i = 0; i < ARRAY_SIZE(port_cntr_ops); i++) {
+                       snprintf(name,
+                                sizeof(name),
+                                port_cntr_ops[i].name,
+                                j + 1);
+                       DEBUGFS_FILE_CREATE(name,
+                                           ibd->hfi1_ibdev_dbg,
+                                           ppd,
+                                           &port_cntr_ops[i].ops,
+                                           !port_cntr_ops[i].ops.write ?
+                                           S_IRUGO : S_IRUGO | S_IWUSR);
+               }
+}
+
+void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd)
+{
+       if (!hfi1_dbg_root)
+               goto out;
+       debugfs_remove(ibd->hfi1_ibdev_link);
+       debugfs_remove_recursive(ibd->hfi1_ibdev_dbg);
+out:
+       ibd->hfi1_ibdev_dbg = NULL;
+       synchronize_rcu();
+}
+
+/*
+ * driver stats field names, one line per stat, single string.  Used by
+ * programs like hfistats to print the stats in a way which works for
+ * different versions of drivers, without changing program source.
+ * if hfi1_ib_stats changes, this needs to change.  Names need to be
+ * 12 chars or less (w/o newline), for proper display by hfistats utility.
+ */
+static const char * const hfi1_statnames[] = {
+       /* must be element 0: driver_stats reads it via hfi1_sps_ints() */
+       "KernIntr",
+       "ErrorIntr",
+       "Tx_Errs",
+       "Rcv_Errs",
+       "H/W_Errs",
+       "NoPIOBufs",
+       "CtxtsOpen",
+       "RcvLen_Errs",
+       "EgrBufFull",
+       "EgrHdrFull"
+};
+
+static void *_driver_stats_names_seq_start(struct seq_file *s, loff_t *pos)
+__acquires(RCU)
+{
+       rcu_read_lock();
+       if (*pos >= ARRAY_SIZE(hfi1_statnames))
+               return NULL;
+       return pos;
+}
+
+static void *_driver_stats_names_seq_next(
+       struct seq_file *s,
+       void *v,
+       loff_t *pos)
+{
+       ++*pos;
+       if (*pos >= ARRAY_SIZE(hfi1_statnames))
+               return NULL;
+       return pos;
+}
+
+static void _driver_stats_names_seq_stop(struct seq_file *s, void *v)
+__releases(RCU)
+{
+       rcu_read_unlock();
+}
+
+static int _driver_stats_names_seq_show(struct seq_file *s, void *v)
+{
+       loff_t *spos = v;
+
+       seq_printf(s, "%s\n", hfi1_statnames[*spos]);
+       return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(driver_stats_names);
+DEBUGFS_SEQ_FILE_OPEN(driver_stats_names)
+DEBUGFS_FILE_OPS(driver_stats_names);
+
+static void *_driver_stats_seq_start(struct seq_file *s, loff_t *pos)
+__acquires(RCU)
+{
+       rcu_read_lock();
+       if (*pos >= ARRAY_SIZE(hfi1_statnames))
+               return NULL;
+       return pos;
+}
+
+static void *_driver_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+       ++*pos;
+       if (*pos >= ARRAY_SIZE(hfi1_statnames))
+               return NULL;
+       return pos;
+}
+
+static void _driver_stats_seq_stop(struct seq_file *s, void *v)
+__releases(RCU)
+{
+       rcu_read_unlock();
+}
+
+static u64 hfi1_sps_ints(void)
+{
+       unsigned long flags;
+       struct hfi1_devdata *dd;
+       u64 sps_ints = 0;
+
+       spin_lock_irqsave(&hfi1_devs_lock, flags);
+       list_for_each_entry(dd, &hfi1_dev_list, list) {
+               sps_ints += get_all_cpu_total(dd->int_counter);
+       }
+       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+       return sps_ints;
+}
+
+static int _driver_stats_seq_show(struct seq_file *s, void *v)
+{
+       loff_t *spos = v;
+       char *buffer;
+       u64 *stats = (u64 *)&hfi1_stats;
+       size_t sz = seq_get_buf(s, &buffer);
+
+       if (sz < sizeof(u64))
+               return SEQ_SKIP;
+       /* special case for interrupts */
+       if (*spos == 0)
+               *(u64 *)buffer = hfi1_sps_ints();
+       else
+               *(u64 *)buffer = stats[*spos];
+       seq_commit(s,  sizeof(u64));
+       return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(driver_stats);
+DEBUGFS_SEQ_FILE_OPEN(driver_stats)
+DEBUGFS_FILE_OPS(driver_stats);
+
+void hfi1_dbg_init(void)
+{
+       hfi1_dbg_root  = debugfs_create_dir(DRIVER_NAME, NULL);
+       if (!hfi1_dbg_root)
+               pr_warn("init of debugfs failed\n");
+       DEBUGFS_SEQ_FILE_CREATE(driver_stats_names, hfi1_dbg_root, NULL);
+       DEBUGFS_SEQ_FILE_CREATE(driver_stats, hfi1_dbg_root, NULL);
+}
+
+void hfi1_dbg_exit(void)
+{
+       debugfs_remove_recursive(hfi1_dbg_root);
+       hfi1_dbg_root = NULL;
+}
+
+#endif
diff --git a/drivers/infiniband/hw/hfi1/debugfs.h b/drivers/infiniband/hw/hfi1/debugfs.h
new file mode 100644 (file)
index 0000000..b6fb681
--- /dev/null
@@ -0,0 +1,75 @@
+#ifndef _HFI1_DEBUGFS_H
+#define _HFI1_DEBUGFS_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+struct hfi1_ibdev;
+#ifdef CONFIG_DEBUG_FS
+void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd);
+void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd);
+void hfi1_dbg_init(void);
+void hfi1_dbg_exit(void);
+#else
+static inline void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd)
+{
+}
+
+static inline void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd)
+{
+}
+
+static inline void hfi1_dbg_init(void)
+{
+}
+
+static inline void hfi1_dbg_exit(void)
+{
+}
+
+#endif
+
+#endif                          /* _HFI1_DEBUGFS_H */
diff --git a/drivers/infiniband/hw/hfi1/device.c b/drivers/infiniband/hw/hfi1/device.c
new file mode 100644 (file)
index 0000000..bf64b5a
--- /dev/null
@@ -0,0 +1,183 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/cdev.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+
+#include "hfi.h"
+#include "device.h"
+
+static struct class *class;
+static struct class *user_class;
+static dev_t hfi1_dev;
+
+int hfi1_cdev_init(int minor, const char *name,
+                  const struct file_operations *fops,
+                  struct cdev *cdev, struct device **devp,
+                  bool user_accessible,
+                  struct kobject *parent)
+{
+       const dev_t dev = MKDEV(MAJOR(hfi1_dev), minor);
+       struct device *device = NULL;
+       int ret;
+
+       cdev_init(cdev, fops);
+       cdev->owner = THIS_MODULE;
+       cdev->kobj.parent = parent;
+       /* name is data, not a format: never pass it as the format string */
+       ret = kobject_set_name(&cdev->kobj, "%s", name);
+       if (ret < 0)
+               goto done;
+
+       ret = cdev_add(cdev, dev, 1);
+       if (ret < 0) {
+               pr_err("Could not add cdev for minor %d, %s (err %d)\n",
+                      minor, name, -ret);
+               goto done;
+       }
+
+       device = device_create(user_accessible ? user_class : class, NULL,
+                              dev, NULL, "%s", name);
+       if (IS_ERR(device)) {
+               ret = PTR_ERR(device);
+               device = NULL;
+               pr_err("Could not create device for minor %d, %s (err %d)\n",
+                       minor, name, -ret);
+               cdev_del(cdev);
+       }
+done:
+       *devp = device;
+       return ret;
+}
+
+void hfi1_cdev_cleanup(struct cdev *cdev, struct device **devp)
+{
+       struct device *device = *devp;
+
+       if (device) {
+               device_unregister(device);
+               *devp = NULL;
+
+               cdev_del(cdev);
+       }
+}
+
+static const char *hfi1_class_name = "hfi1";
+
+const char *class_name(void)
+{
+       return hfi1_class_name;
+}
+
+static char *hfi1_devnode(struct device *dev, umode_t *mode)
+{
+       if (mode)
+               *mode = 0600;
+       return kasprintf(GFP_KERNEL, "%s", dev_name(dev));
+}
+
+static const char *hfi1_class_name_user = "hfi1_user";
+static const char *class_name_user(void)
+{
+       return hfi1_class_name_user;
+}
+
+static char *hfi1_user_devnode(struct device *dev, umode_t *mode)
+{
+       if (mode)
+               *mode = 0666;
+       return kasprintf(GFP_KERNEL, "%s", dev_name(dev));
+}
+
+int __init dev_init(void)
+{
+       int ret;
+
+       ret = alloc_chrdev_region(&hfi1_dev, 0, HFI1_NMINORS, DRIVER_NAME);
+       if (ret < 0) {
+               pr_err("Could not allocate chrdev region (err %d)\n", -ret);
+               goto done;
+       }
+
+       class = class_create(THIS_MODULE, class_name());
+       if (IS_ERR(class)) {
+               ret = PTR_ERR(class);
+               pr_err("Could not create device class (err %d)\n", -ret);
+               unregister_chrdev_region(hfi1_dev, HFI1_NMINORS);
+               goto done;
+       }
+       class->devnode = hfi1_devnode;
+
+       user_class = class_create(THIS_MODULE, class_name_user());
+       if (IS_ERR(user_class)) {
+               ret = PTR_ERR(user_class);
+               pr_err("Could not create device class for user accessible files (err %d)\n",
+                      -ret);
+               class_destroy(class);
+               class = NULL;
+               user_class = NULL;
+               unregister_chrdev_region(hfi1_dev, HFI1_NMINORS);
+               goto done;
+       }
+       user_class->devnode = hfi1_user_devnode;
+
+done:
+       return ret;
+}
+
+void dev_cleanup(void)
+{
+       class_destroy(class);
+       class = NULL;
+
+       class_destroy(user_class);
+       user_class = NULL;
+
+       unregister_chrdev_region(hfi1_dev, HFI1_NMINORS);
+}
diff --git a/drivers/infiniband/hw/hfi1/device.h b/drivers/infiniband/hw/hfi1/device.h
new file mode 100644 (file)
index 0000000..c3ec19c
--- /dev/null
@@ -0,0 +1,60 @@
+#ifndef _HFI1_DEVICE_H
+#define _HFI1_DEVICE_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * Character device helpers.
+ * NOTE(review): hfi1_cdev_init()/hfi1_cdev_cleanup() are implemented
+ * outside this header; presumably they set up and tear down one
+ * cdev/device pair for the given minor -- confirm against device.c.
+ */
+int hfi1_cdev_init(int minor, const char *name,
+                  const struct file_operations *fops,
+                  struct cdev *cdev, struct device **devp,
+                  bool user_accessible,
+                  struct kobject *parent);
+void hfi1_cdev_cleanup(struct cdev *cdev, struct device **devp);
+/* Name that dev_init() passes to class_create() for the main device class */
+const char *class_name(void);
+/* Allocate the chrdev region and create the device classes */
+int __init dev_init(void);
+/* Destroy the device classes and release the chrdev region */
+void dev_cleanup(void);
+
+#endif                          /* _HFI1_DEVICE_H */
diff --git a/drivers/infiniband/hw/hfi1/dma.c b/drivers/infiniband/hw/hfi1/dma.c
new file mode 100644 (file)
index 0000000..7e8dab8
--- /dev/null
@@ -0,0 +1,183 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/types.h>
+#include <linux/scatterlist.h>
+
+#include "verbs.h"
+
+#define BAD_DMA_ADDRESS ((u64)0)
+
+/*
+ * The following functions implement driver specific replacements
+ * for the ib_dma_*() functions.
+ *
+ * These functions return kernel virtual addresses instead of
+ * device bus addresses since the driver uses the CPU to copy
+ * data instead of using hardware DMA.
+ */
+
+/* Report whether dma_addr is the BAD_DMA_ADDRESS failure sentinel */
+static int hfi1_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+       if (dma_addr == BAD_DMA_ADDRESS)
+               return 1;
+
+       return 0;
+}
+
+/*
+ * map_single replacement: the kernel virtual address itself is handed
+ * back as the "DMA address"; an invalid direction triggers a warning and
+ * yields BAD_DMA_ADDRESS.
+ */
+static u64 hfi1_dma_map_single(struct ib_device *dev, void *cpu_addr,
+                              size_t size, enum dma_data_direction direction)
+{
+       bool bad_direction = WARN_ON(!valid_dma_direction(direction));
+
+       return bad_direction ? BAD_DMA_ADDRESS : (u64)cpu_addr;
+}
+
+/*
+ * unmap_single replacement.  hfi1_dma_map_single() only returns the CPU
+ * address and sets up no mapping state, so there is nothing to undo.
+ */
+static void hfi1_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size,
+                                 enum dma_data_direction direction)
+{
+       /* This is a stub, nothing to be done here */
+}
+
+/*
+ * map_page replacement: return the kernel virtual address of
+ * page + offset.  BAD_DMA_ADDRESS is returned for an invalid direction or
+ * when offset + size does not fit in one page; a page without a kernel
+ * address yields 0 as well.
+ */
+static u64 hfi1_dma_map_page(struct ib_device *dev, struct page *page,
+                            unsigned long offset, size_t size,
+                            enum dma_data_direction direction)
+{
+       u64 base;
+
+       if (WARN_ON(!valid_dma_direction(direction)))
+               return BAD_DMA_ADDRESS;
+
+       if (offset + size > PAGE_SIZE)
+               return BAD_DMA_ADDRESS;
+
+       base = (u64)page_address(page);
+
+       /* only apply the offset when the page has a kernel address */
+       return base ? base + offset : base;
+}
+
+/*
+ * unmap_page replacement.  hfi1_dma_map_page() only computes a kernel
+ * virtual address, so there is nothing to release.
+ */
+static void hfi1_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size,
+                               enum dma_data_direction direction)
+{
+       /* This is a stub, nothing to be done here */
+}
+
+static int hfi1_map_sg(struct ib_device *dev, struct scatterlist *sgl,
+                      int nents, enum dma_data_direction direction)
+{
+       struct scatterlist *sg;
+       u64 addr;
+       int i;
+       int ret = nents;
+
+       if (WARN_ON(!valid_dma_direction(direction)))
+               return BAD_DMA_ADDRESS;
+
+       for_each_sg(sgl, sg, nents, i) {
+               addr = (u64)page_address(sg_page(sg));
+               if (!addr) {
+                       ret = 0;
+                       break;
+               }
+               sg->dma_address = addr + sg->offset;
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
+               sg->dma_length = sg->length;
+#endif
+       }
+       return ret;
+}
+
+/*
+ * unmap_sg replacement.  hfi1_map_sg() only stores kernel virtual
+ * addresses in the scatterlist, so there is nothing to tear down.
+ */
+static void hfi1_unmap_sg(struct ib_device *dev,
+                         struct scatterlist *sg, int nents,
+                        enum dma_data_direction direction)
+{
+       /* This is a stub, nothing to be done here */
+}
+
+/* sync_single_for_cpu replacement: intentionally empty */
+static void hfi1_sync_single_for_cpu(struct ib_device *dev, u64 addr,
+                                    size_t size, enum dma_data_direction dir)
+{
+}
+
+/* sync_single_for_device replacement: intentionally empty */
+static void hfi1_sync_single_for_device(struct ib_device *dev, u64 addr,
+                                       size_t size,
+                                       enum dma_data_direction dir)
+{
+}
+
+/*
+ * alloc_coherent replacement: allocate enough whole pages to cover size
+ * and return their kernel virtual address.  When dma_handle is supplied
+ * it receives the same address (0 if the allocation failed).
+ */
+static void *hfi1_dma_alloc_coherent(struct ib_device *dev, size_t size,
+                                    u64 *dma_handle, gfp_t flag)
+{
+       struct page *pages = alloc_pages(flag, get_order(size));
+       void *vaddr = pages ? page_address(pages) : NULL;
+
+       if (dma_handle)
+               *dma_handle = (u64)vaddr;
+
+       return vaddr;
+}
+
+/*
+ * free_coherent replacement: release the pages behind cpu_addr, using
+ * the page order derived from size.
+ */
+static void hfi1_dma_free_coherent(struct ib_device *dev, size_t size,
+                                  void *cpu_addr, u64 dma_handle)
+{
+       unsigned long vaddr = (unsigned long)cpu_addr;
+
+       free_pages(vaddr, get_order(size));
+}
+
+/* Table of the driver specific ib_dma_*() replacements defined above */
+struct ib_dma_mapping_ops hfi1_dma_mapping_ops = {
+       .mapping_error          = hfi1_mapping_error,
+       .map_single             = hfi1_dma_map_single,
+       .unmap_single           = hfi1_dma_unmap_single,
+       .map_page               = hfi1_dma_map_page,
+       .unmap_page             = hfi1_dma_unmap_page,
+       .map_sg                 = hfi1_map_sg,
+       .unmap_sg               = hfi1_unmap_sg,
+       .sync_single_for_cpu    = hfi1_sync_single_for_cpu,
+       .sync_single_for_device = hfi1_sync_single_for_device,
+       .alloc_coherent         = hfi1_dma_alloc_coherent,
+       .free_coherent          = hfi1_dma_free_coherent,
+};
diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c
new file mode 100644 (file)
index 0000000..c75b0ae
--- /dev/null
@@ -0,0 +1,1404 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/netdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/prefetch.h>
+#include <rdma/ib_verbs.h>
+
+#include "hfi.h"
+#include "trace.h"
+#include "qp.h"
+#include "sdma.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) DRIVER_NAME ": " fmt
+
+/*
+ * The size has to be longer than this string, so we can append
+ * board/chip information to it in the initialization code.
+ */
+const char ib_hfi1_version[] = HFI1_DRIVER_VERSION "\n";
+
+/* hfi1_dev_list is walked under hfi1_devs_lock by the unit counters below */
+DEFINE_SPINLOCK(hfi1_devs_lock);
+LIST_HEAD(hfi1_dev_list);
+DEFINE_MUTEX(hfi1_mutex);      /* general driver use */
+
+/* max_mtu: registered read-only (S_IRUGO) */
+unsigned int hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
+module_param_named(max_mtu, hfi1_max_mtu, uint, S_IRUGO);
+MODULE_PARM_DESC(max_mtu, "Set max MTU bytes, default is " __stringify(
+                HFI1_DEFAULT_MAX_MTU));
+
+/* cu: registered read-only (S_IRUGO) */
+unsigned int hfi1_cu = 1;
+module_param_named(cu, hfi1_cu, uint, S_IRUGO);
+MODULE_PARM_DESC(cu, "Credit return units");
+
+/*
+ * cap_mask: root-writable parameter routed through custom set/get
+ * handlers (hfi1_caps_set/hfi1_caps_get) instead of the generic ones.
+ */
+unsigned long hfi1_cap_mask = HFI1_CAP_MASK_DEFAULT;
+static int hfi1_caps_set(const char *, const struct kernel_param *);
+static int hfi1_caps_get(char *, const struct kernel_param *);
+static const struct kernel_param_ops cap_ops = {
+       .set = hfi1_caps_set,
+       .get = hfi1_caps_get
+};
+module_param_cb(cap_mask, &cap_ops, &hfi1_cap_mask, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(cap_mask, "Bit mask of enabled/disabled HW features");
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("Intel Omni-Path Architecture driver");
+MODULE_VERSION(HFI1_DRIVER_VERSION);
+
+/*
+ * MAX_PKT_RECV is the max # of packets processed per receive interrupt.
+ */
+#define MAX_PKT_RECV 64
+#define EGR_HEAD_UPDATE_THRESHOLD 16
+
+struct hfi1_ib_stats hfi1_stats;
+
+/*
+ * Setter for the 'cap_mask' module parameter.
+ *
+ * Parses val as an unsigned long and merges it into the mask stored at
+ * kp->arg: changes outside the writable mask are ignored (with a warning)
+ * once the capabilities are locked, reserved bits never change, and user
+ * bits that disagree with the kernel's must-have bits are cleared.
+ * Returns 0 on success or the kstrtoul() error.
+ */
+static int hfi1_caps_set(const char *val, const struct kernel_param *kp)
+{
+       unsigned long *cap_mask_ptr = (unsigned long *)kp->arg;
+       unsigned long cap_mask = *cap_mask_ptr;
+       unsigned long write_mask =
+               ((HFI1_CAP_WRITABLE_MASK << HFI1_CAP_USER_SHIFT) |
+                HFI1_CAP_WRITABLE_MASK);
+       unsigned long value, diff;
+       int ret;
+
+       ret = kstrtoul(val, 0, &value);
+       if (ret) {
+               pr_warn("Invalid module parameter value for 'cap_mask'\n");
+               return ret;
+       }
+
+       /* Get the changed bits (except the locked bit) */
+       diff = value ^ (cap_mask & ~HFI1_CAP_LOCKED_SMASK);
+
+       /* Remove any bits that are not allowed to change after driver load */
+       if (HFI1_CAP_LOCKED() && (diff & ~write_mask)) {
+               pr_warn("Ignoring non-writable capability bits %#lx\n",
+                       diff & ~write_mask);
+               diff &= write_mask;
+       }
+
+       /* Mask off any reserved bits */
+       diff &= ~HFI1_CAP_RESERVED_MASK;
+
+       /* Replace every changing bit with its new value */
+       cap_mask = (cap_mask & ~diff) | (value & diff);
+
+       /* Check for any kernel/user restrictions */
+       diff = (cap_mask & (HFI1_CAP_MUST_HAVE_KERN << HFI1_CAP_USER_SHIFT)) ^
+               ((cap_mask & HFI1_CAP_MUST_HAVE_KERN) << HFI1_CAP_USER_SHIFT);
+       cap_mask &= ~diff;
+
+       /* Set the bitmask to the final set */
+       *cap_mask_ptr = cap_mask;
+
+       return ret;
+}
+
+/*
+ * Getter for the 'cap_mask' module parameter: print the mask in hex with
+ * the locked bit hidden and the HFI1_CAP_K2U bits copied up into the
+ * user half of the mask.
+ */
+static int hfi1_caps_get(char *buffer, const struct kernel_param *kp)
+{
+       unsigned long shown = *(unsigned long *)kp->arg;
+
+       shown &= ~HFI1_CAP_LOCKED_SMASK;
+       shown |= ((shown & HFI1_CAP_K2U) << HFI1_CAP_USER_SHIFT);
+
+       return scnprintf(buffer, PAGE_SIZE, "0x%lx", shown);
+}
+
+/*
+ * Format "<DRIVER_NAME>_<unit>" into a function-local static buffer and
+ * return it.  The buffer is shared by all callers, so each call
+ * overwrites the previous result.
+ */
+const char *get_unit_name(int unit)
+{
+       static char unit_name[16];
+
+       snprintf(unit_name, sizeof(unit_name), DRIVER_NAME "_%u", unit);
+
+       return unit_name;
+}
+
+/*
+ * Walk from the rvt_dev_info embedded in the device data back to the
+ * owning hfi1_devdata and return that unit's name.
+ */
+const char *get_card_name(struct rvt_dev_info *rdi)
+{
+       struct hfi1_ibdev *verbs;
+       struct hfi1_devdata *devdata;
+
+       verbs = container_of(rdi, struct hfi1_ibdev, rdi);
+       devdata = container_of(verbs, struct hfi1_devdata, verbs_dev);
+
+       return get_unit_name(devdata->unit);
+}
+
+/*
+ * Walk from the rvt_dev_info embedded in the device data back to the
+ * owning hfi1_devdata and return its PCI device.
+ */
+struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi)
+{
+       struct hfi1_ibdev *verbs;
+       struct hfi1_devdata *devdata;
+
+       verbs = container_of(rdi, struct hfi1_ibdev, rdi);
+       devdata = container_of(verbs, struct hfi1_devdata, verbs_dev);
+
+       return devdata->pcidev;
+}
+
+/*
+ * Count the units that are present, have a register mapping, and have at
+ * least one port with a LID assigned and its link up.  The device list
+ * is walked under hfi1_devs_lock.
+ */
+int hfi1_count_active_units(void)
+{
+       struct hfi1_devdata *dd;
+       unsigned long flags;
+       int active = 0;
+       int pidx;
+
+       spin_lock_irqsave(&hfi1_devs_lock, flags);
+       list_for_each_entry(dd, &hfi1_dev_list, list) {
+               if ((dd->flags & HFI1_PRESENT) && dd->kregbase) {
+                       for (pidx = 0; pidx < dd->num_pports; pidx++) {
+                               struct hfi1_pportdata *ppd = dd->pport + pidx;
+
+                               if (!ppd->lid || !ppd->linkup)
+                                       continue;
+
+                               /* one usable port is enough for this unit */
+                               active++;
+                               break;
+                       }
+               }
+       }
+       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+
+       return active;
+}
+
+/*
+ * Return the number of units on the device list.  If npresentp is given
+ * it receives the number of present units with a register mapping; if
+ * nupp is given it receives the number of ports (across all units) that
+ * have a LID assigned and their link up.
+ */
+int hfi1_count_units(int *npresentp, int *nupp)
+{
+       struct hfi1_devdata *dd;
+       unsigned long flags;
+       int total = 0, present = 0, ports_up = 0;
+       int pidx;
+
+       spin_lock_irqsave(&hfi1_devs_lock, flags);
+       list_for_each_entry(dd, &hfi1_dev_list, list) {
+               total++;
+
+               if ((dd->flags & HFI1_PRESENT) && dd->kregbase)
+                       present++;
+
+               for (pidx = 0; pidx < dd->num_pports; pidx++) {
+                       struct hfi1_pportdata *ppd = dd->pport + pidx;
+
+                       if (ppd->lid && ppd->linkup)
+                               ports_up++;
+               }
+       }
+       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+
+       /* both output arguments are optional */
+       if (npresentp)
+               *npresentp = present;
+       if (nupp)
+               *nupp = ports_up;
+
+       return total;
+}
+
+/*
+ * Get the address of an eager buffer from its index (allocated in chunks,
+ * not contiguous).  The index and the offset within the buffer are taken
+ * from the RHF.  *update is set when the index is a multiple of the
+ * eager buffer threshold and the offset is zero.
+ */
+static inline void *get_egrbuf(const struct hfi1_ctxtdata *rcd, u64 rhf,
+                              u8 *update)
+{
+       u32 idx = rhf_egr_index(rhf);
+       u32 offset = rhf_egr_buf_offset(rhf);
+       u64 base = (u64)(rcd->egrbufs.rcvtids[idx].addr);
+
+       if (!(idx & (rcd->egrbufs.threshold - 1)) && !offset)
+               *update |= 1;
+
+       return (void *)(base + (offset * RCV_BUF_BLOCK_SIZE));
+}
+
+/*
+ * Validate and encode a given RcvArray Buffer size.
+ * The size must be page aligned, at least MIN_EAGER_BUFFER, and no larger
+ * than the maximum for the given buffer type.  Returns 1 when the size is
+ * valid and 0 otherwise; when valid and encoded is non-NULL, the encoded
+ * size (ilog2 of the page count, plus one) is stored there.
+ */
+inline int hfi1_rcvbuf_validate(u32 size, u8 type, u16 *encoded)
+{
+       if (unlikely(!PAGE_ALIGNED(size)) || unlikely(size < MIN_EAGER_BUFFER))
+               return 0;
+
+       if (size >
+           (type == PT_EAGER ? MAX_EAGER_BUFFER : MAX_EXPECTED_BUFFER))
+               return 0;
+
+       if (!encoded)
+               return 1;
+
+       *encoded = ilog2(size / PAGE_SIZE) + 1;
+       return 1;
+}
+
+/*
+ * rcv_hdrerr - handle a received packet whose RHF carries error flags
+ * @rcd: receive context the packet arrived on
+ * @ppd: port the packet arrived on
+ * @packet: the packet being processed (hdr, rhf and ebuf are used)
+ *
+ * Packets with a VCRC or ICRC error are ignored.  For a TID error on a
+ * unicast LID the destination QP is looked up and, for RC QPs only,
+ * hfi1_rc_hdrerr() is called under the QP's r_lock.  Independently, an
+ * "opcode error" receive type error is examined for a CNP opcode, in
+ * which case the BECN is processed; the receive type error bits are then
+ * cleared from packet->rhf.
+ *
+ * Every exit taken after rcu_read_lock() must drop the RCU read lock
+ * again before jumping to the drop label.
+ */
+static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd,
+                      struct hfi1_packet *packet)
+{
+       struct hfi1_message_header *rhdr = packet->hdr;
+       u32 rte = rhf_rcv_type_err(packet->rhf);
+       int lnh = be16_to_cpu(rhdr->lrh[0]) & 3;
+       struct hfi1_ibport *ibp = &ppd->ibport_data;
+       struct hfi1_devdata *dd = ppd->dd;
+       struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
+
+       if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR))
+               return;
+
+       if (packet->rhf & RHF_TID_ERR) {
+               /* For TIDERR and RC QPs preemptively schedule a NAK */
+               struct hfi1_ib_header *hdr = (struct hfi1_ib_header *)rhdr;
+               struct hfi1_other_headers *ohdr = NULL;
+               u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */
+               u16 lid  = be16_to_cpu(hdr->lrh[1]);
+               u32 qp_num;
+               u32 rcv_flags = 0;
+
+               /* Sanity check packet */
+               if (tlen < 24)
+                       goto drop;
+
+               /* Check for GRH */
+               if (lnh == HFI1_LRH_BTH) {
+                       ohdr = &hdr->u.oth;
+               } else if (lnh == HFI1_LRH_GRH) {
+                       u32 vtf;
+
+                       ohdr = &hdr->u.l.oth;
+                       if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR)
+                               goto drop;
+                       vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow);
+                       if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
+                               goto drop;
+                       rcv_flags |= HFI1_HAS_GRH;
+               } else {
+                       goto drop;
+               }
+               /* Get the destination QP number. */
+               qp_num = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
+               if (lid < be16_to_cpu(IB_MULTICAST_LID_BASE)) {
+                       struct rvt_qp *qp;
+                       unsigned long flags;
+
+                       rcu_read_lock();
+                       qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
+                       if (!qp) {
+                               rcu_read_unlock();
+                               goto drop;
+                       }
+
+                       /*
+                        * Handle only RC QPs - for other QP types drop error
+                        * packet.
+                        */
+                       spin_lock_irqsave(&qp->r_lock, flags);
+
+                       /* Check for valid receive state. */
+                       if (!(ib_rvt_state_ops[qp->state] &
+                             RVT_PROCESS_RECV_OK)) {
+                               ibp->rvp.n_pkt_drops++;
+                       }
+
+                       switch (qp->ibqp.qp_type) {
+                       case IB_QPT_RC:
+                               hfi1_rc_hdrerr(
+                                       rcd,
+                                       hdr,
+                                       rcv_flags,
+                                       qp);
+                               break;
+                       default:
+                               /* For now don't handle any other QP types */
+                               break;
+                       }
+
+                       spin_unlock_irqrestore(&qp->r_lock, flags);
+                       rcu_read_unlock();
+               } /* Unicast QP */
+       } /* Valid packet with TIDErr */
+
+       /* handle "RcvTypeErr" flags */
+       switch (rte) {
+       case RHF_RTE_ERROR_OP_CODE_ERR:
+       {
+               u32 opcode;
+               void *ebuf = NULL;
+               __be32 *bth = NULL;
+
+               if (rhf_use_egr_bfr(packet->rhf))
+                       ebuf = packet->ebuf;
+
+               if (!ebuf)
+                       goto drop; /* this should never happen */
+
+               /* locate the BTH, skipping a GRH when one is present */
+               if (lnh == HFI1_LRH_BTH)
+                       bth = (__be32 *)ebuf;
+               else if (lnh == HFI1_LRH_GRH)
+                       bth = (__be32 *)((char *)ebuf + sizeof(struct ib_grh));
+               else
+                       goto drop;
+
+               opcode = be32_to_cpu(bth[0]) >> 24;
+               opcode &= 0xff;
+
+               if (opcode == IB_OPCODE_CNP) {
+                       /*
+                        * Only in pre-B0 h/w is the CNP_OPCODE handled
+                        * via this code path.
+                        */
+                       struct rvt_qp *qp = NULL;
+                       u32 lqpn, rqpn;
+                       u16 rlid;
+                       u8 svc_type, sl, sc5;
+
+                       sc5  = (be16_to_cpu(rhdr->lrh[0]) >> 12) & 0xf;
+                       if (rhf_dc_info(packet->rhf))
+                               sc5 |= 0x10;
+                       sl = ibp->sc_to_sl[sc5];
+
+                       lqpn = be32_to_cpu(bth[1]) & RVT_QPN_MASK;
+                       rcu_read_lock();
+                       qp = rvt_lookup_qpn(rdi, &ibp->rvp, lqpn);
+                       if (!qp) {
+                               rcu_read_unlock();
+                               goto drop;
+                       }
+
+                       switch (qp->ibqp.qp_type) {
+                       case IB_QPT_UD:
+                               rlid = 0;
+                               rqpn = 0;
+                               svc_type = IB_CC_SVCTYPE_UD;
+                               break;
+                       case IB_QPT_UC:
+                               rlid = be16_to_cpu(rhdr->lrh[3]);
+                               rqpn = qp->remote_qpn;
+                               svc_type = IB_CC_SVCTYPE_UC;
+                               break;
+                       default:
+                               /* leave the RCU read side before bailing */
+                               rcu_read_unlock();
+                               goto drop;
+                       }
+
+                       process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
+                       rcu_read_unlock();
+               }
+
+               packet->rhf &= ~RHF_RCV_TYPE_ERR_SMASK;
+               break;
+       }
+       default:
+               break;
+       }
+
+drop:
+       return;
+}
+
+/*
+ * Prime a packet descriptor for a pass over rcd's receive header queue:
+ * record the queue geometry, reset the per-pass counters and flags, and
+ * load the RHF found at the context's current head.
+ */
+static inline void init_packet(struct hfi1_ctxtdata *rcd,
+                              struct hfi1_packet *packet)
+{
+       packet->rcd = rcd;
+
+       /* queue geometry, both in words */
+       packet->rsize = rcd->rcvhdrqentsize;
+       packet->maxcnt = rcd->rcvhdrq_cnt * packet->rsize;
+       packet->rhqoff = rcd->head;
+
+       /* per-pass state */
+       packet->numpkt = 0;
+       packet->rcv_flags = 0;
+       packet->updegr = 0;
+       packet->etail = -1;
+
+       packet->rhf_addr = get_rhf_addr(rcd);
+       packet->rhf = rhf_to_cpu(packet->rhf_addr);
+}
+
+/*
+ * Act on the FECN/BECN bits in bth1 for a packet addressed to qp.
+ *
+ * The remote LID/QPN and the service type are derived from the QP type
+ * (from the packet headers for SMI/GSI/UD, from the QP itself for UC and
+ * RC); other QP types are ignored.  A FECN is answered via return_cnp(),
+ * a BECN is handed to process_becn().
+ */
+static void process_ecn(struct rvt_qp *qp, struct hfi1_ib_header *hdr,
+                       struct hfi1_other_headers *ohdr,
+                       u64 rhf, u32 bth1, struct ib_grh *grh)
+{
+       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+       u16 lrh0 = be16_to_cpu(hdr->lrh[0]);
+       u32 rqpn = 0;
+       u16 rlid;
+       u8 sc5, svc_type;
+
+       switch (qp->ibqp.qp_type) {
+       case IB_QPT_SMI:
+       case IB_QPT_GSI:
+       case IB_QPT_UD:
+               svc_type = IB_CC_SVCTYPE_UD;
+               rlid = be16_to_cpu(hdr->lrh[3]);
+               rqpn = be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK;
+               break;
+       case IB_QPT_UC:
+               svc_type = IB_CC_SVCTYPE_UC;
+               rlid = qp->remote_ah_attr.dlid;
+               rqpn = qp->remote_qpn;
+               break;
+       case IB_QPT_RC:
+               svc_type = IB_CC_SVCTYPE_RC;
+               rlid = qp->remote_ah_attr.dlid;
+               rqpn = qp->remote_qpn;
+               break;
+       default:
+               return;
+       }
+
+       /* low four SC bits come from the LRH, the fifth from the RHF */
+       sc5 = (lrh0 >> 12) & 0xf;
+       if (rhf_dc_info(rhf))
+               sc5 |= 0x10;
+
+       if (bth1 & HFI1_FECN_SMASK) {
+               u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]);
+               u16 dlid = be16_to_cpu(hdr->lrh[1]);
+
+               return_cnp(ibp, qp, rqpn, pkey, dlid, rlid, sc5, grh);
+       }
+
+       if (bth1 & HFI1_BECN_SMASK) {
+               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+               u8 sl = ibp->sc_to_sl[sc5];
+               u32 lqpn = bth1 & RVT_QPN_MASK;
+
+               process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
+       }
+}
+
+/* Cursor state used while prescanning a receive header queue */
+struct ps_mdata {
+       struct hfi1_ctxtdata *rcd;      /* context being scanned */
+       u32 rsize;      /* copied from packet->rsize */
+       u32 maxcnt;     /* copied from packet->maxcnt; head wraps to 0 here */
+       u32 ps_head;    /* current scan offset, starts at packet->rhqoff */
+       u32 ps_tail;    /* queue tail, only meaningful with DMA_RTAIL */
+       u32 ps_seq;     /* expected RHF sequence number (1..13) or 0 if unused */
+};
+
+/*
+ * Initialise the prescan cursor from the packet being processed.
+ * With DMA_RTAIL the tail is read from the context and the sequence
+ * number is only tracked for the control context; without it the tail is
+ * unused and the context's sequence count is tracked.
+ */
+static inline void init_ps_mdata(struct ps_mdata *mdata,
+                                struct hfi1_packet *packet)
+{
+       struct hfi1_ctxtdata *rcd = packet->rcd;
+       bool dma_rtail = !!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL);
+
+       mdata->rcd = rcd;
+       mdata->rsize = packet->rsize;
+       mdata->maxcnt = packet->maxcnt;
+       mdata->ps_head = packet->rhqoff;
+
+       /* the tail is used only with DMA_RTAIL */
+       if (dma_rtail)
+               mdata->ps_tail = get_rcvhdrtail(rcd);
+       else
+               mdata->ps_tail = 0;
+
+       /* seq is not used with DMA_RTAIL, except on the control context */
+       if (dma_rtail && rcd->ctxt != HFI1_CTRL_CTXT)
+               mdata->ps_seq = 0;
+       else
+               mdata->ps_seq = rcd->seq_cnt;
+}
+
+/*
+ * Has the prescan reached the end of the queue?  With DMA_RTAIL that is
+ * when head meets tail; otherwise when the RHF sequence number no longer
+ * matches the expected one.
+ */
+static inline int ps_done(struct ps_mdata *mdata, u64 rhf,
+                         struct hfi1_ctxtdata *rcd)
+{
+       return HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ?
+               mdata->ps_head == mdata->ps_tail :
+               mdata->ps_seq != rhf_rcv_seq(rhf);
+}
+
+/*
+ * Should the current entry be skipped?  The control context can
+ * potentially receive an invalid rhf; such an entry (sequence mismatch
+ * while head has not reached tail) is dropped.  Other contexts never
+ * skip.
+ */
+static inline int ps_skip(struct ps_mdata *mdata, u64 rhf,
+                         struct hfi1_ctxtdata *rcd)
+{
+       return (rcd->ctxt == HFI1_CTRL_CTXT) &&
+              (mdata->ps_head != mdata->ps_tail) &&
+              (mdata->ps_seq != rhf_rcv_seq(rhf));
+}
+
+/*
+ * Advance the prescan cursor by one entry, wrapping the head to 0 at
+ * maxcnt.  The expected sequence number is advanced as well (wrapping
+ * from 13 back to 1) unless DMA_RTAIL is in use on a non-control context.
+ */
+static inline void update_ps_mdata(struct ps_mdata *mdata,
+                                  struct hfi1_ctxtdata *rcd)
+{
+       u32 next_head = mdata->ps_head + mdata->rsize;
+
+       mdata->ps_head = (next_head >= mdata->maxcnt) ? 0 : next_head;
+
+       /* Control context must do seq counting */
+       if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) &&
+           (rcd->ctxt != HFI1_CTRL_CTXT))
+               return;
+
+       mdata->ps_seq++;
+       if (mdata->ps_seq > 13)
+               mdata->ps_seq = 1;
+}
+
+/*
+ * prescan_rxq - search through the receive queue looking for packets
+ * containing Explicit Congestion Notifications (FECNs, or BECNs).
+ * When an ECN is found, process the Congestion Notification, and toggle
+ * it off.
+ * This is declared as a macro to allow quick checking of the port to avoid
+ * the overhead of a function call if not enabled.
+ * The rcd argument is parenthesized so that any pointer expression can be
+ * passed safely.
+ */
+#define prescan_rxq(rcd, packet) \
+       do { \
+               if ((rcd)->ppd->cc_prescan) \
+                       __prescan_rxq(packet); \
+       } while (0)
+/*
+ * Walk the receive header queue from the packet's current offset until
+ * ps_done() reports the end.  For every IB packet that carries a FECN or
+ * BECN and whose destination QP can be found, process_ecn() is called and
+ * the FECN/BECN bits are then cleared in the packet's BTH in place.
+ */
+static void __prescan_rxq(struct hfi1_packet *packet)
+{
+       struct hfi1_ctxtdata *rcd = packet->rcd;
+       struct ps_mdata mdata;
+
+       init_ps_mdata(&mdata, packet);
+
+       while (1) {
+               struct hfi1_devdata *dd = rcd->dd;
+               struct hfi1_ibport *ibp = &rcd->ppd->ibport_data;
+               /* RHF of the entry at the current scan offset */
+               __le32 *rhf_addr = (__le32 *)rcd->rcvhdrq + mdata.ps_head +
+                                        dd->rhf_offset;
+               struct rvt_qp *qp;
+               struct hfi1_ib_header *hdr;
+               struct hfi1_other_headers *ohdr;
+               struct ib_grh *grh = NULL;
+               struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
+               u64 rhf = rhf_to_cpu(rhf_addr);
+               u32 etype = rhf_rcv_type(rhf), qpn, bth1;
+               int is_ecn = 0;
+               u8 lnh;
+
+               if (ps_done(&mdata, rhf, rcd))
+                       break;
+
+               if (ps_skip(&mdata, rhf, rcd))
+                       goto next;
+
+               /* only IB packets are examined */
+               if (etype != RHF_RCV_TYPE_IB)
+                       goto next;
+
+               hdr = (struct hfi1_ib_header *)
+                       hfi1_get_msgheader(dd, rhf_addr);
+               lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+
+               /* find the BTH, which follows the GRH when one is present */
+               if (lnh == HFI1_LRH_BTH) {
+                       ohdr = &hdr->u.oth;
+               } else if (lnh == HFI1_LRH_GRH) {
+                       ohdr = &hdr->u.l.oth;
+                       grh = &hdr->u.l.grh;
+               } else {
+                       goto next; /* just in case */
+               }
+               bth1 = be32_to_cpu(ohdr->bth[1]);
+               is_ecn = !!(bth1 & (HFI1_FECN_SMASK | HFI1_BECN_SMASK));
+
+               if (!is_ecn)
+                       goto next;
+
+               /* the QP lookup and its use stay inside the RCU read side */
+               qpn = bth1 & RVT_QPN_MASK;
+               rcu_read_lock();
+               qp = rvt_lookup_qpn(rdi, &ibp->rvp, qpn);
+
+               if (!qp) {
+                       rcu_read_unlock();
+                       goto next;
+               }
+
+               process_ecn(qp, hdr, ohdr, rhf, bth1, grh);
+               rcu_read_unlock();
+
+               /* turn off BECN, FECN */
+               bth1 &= ~(HFI1_FECN_SMASK | HFI1_BECN_SMASK);
+               ohdr->bth[1] = cpu_to_be32(bth1);
+next:
+               update_ps_mdata(&mdata, rcd);
+       }
+}
+
+static inline int skip_rcv_packet(struct hfi1_packet *packet, int thread)
+{
+       struct hfi1_ctxtdata *rcd = packet->rcd;
+       int status = RCV_PKT_OK;
+
+       /* Step past the current entry; go back to 0 on reaching maxcnt */
+       packet->rhqoff += packet->rsize;
+       if (packet->rhqoff >= packet->maxcnt)
+               packet->rhqoff = 0;
+
+       /*
+        * When the packet count masked with (MAX_PKT_RECV - 1) hits zero,
+        * either yield (thread) or report the limit to the caller.
+        */
+       if (unlikely(!(++packet->numpkt & (MAX_PKT_RECV - 1)))) {
+               if (!thread) {
+                       status = RCV_PKT_LIMIT;
+                       this_cpu_inc(*rcd->dd->rcv_limit);
+               } else {
+                       cond_resched();
+               }
+       }
+
+       /* Load the RHF of the entry that is now current */
+       packet->rhf_addr = (__le32 *)rcd->rcvhdrq + packet->rhqoff +
+                          rcd->dd->rhf_offset;
+       packet->rhf = rhf_to_cpu(packet->rhf_addr);
+
+       return status;
+}
+
+static inline int process_rcv_packet(struct hfi1_packet *packet, int thread)
+{
+       int status = RCV_PKT_OK;
+
+       /* Decode header location, header length, type and size from the RHF */
+       packet->hdr = hfi1_get_msgheader(packet->rcd->dd, packet->rhf_addr);
+       packet->hlen = (u8 *)packet->rhf_addr - (u8 *)packet->hdr;
+       packet->etype = rhf_rcv_type(packet->rhf);
+       packet->tlen = rhf_pkt_len(packet->rhf); /* total length, in bytes */
+
+       /* Look up the eager buffer when the RHF says one is in use */
+       if (!rhf_use_egr_bfr(packet->rhf)) {
+               packet->ebuf = NULL;
+       } else {
+               packet->etail = rhf_egr_index(packet->rhf);
+               packet->ebuf = get_egrbuf(packet->rcd, packet->rhf,
+                                         &packet->updegr);
+               /*
+                * Warm the cache with the eager buffer contents.  A
+                * negative length is acceptable to prefetch_range().
+                * The +2 is the size of the RHF.
+                */
+               prefetch_range(packet->ebuf,
+                              packet->tlen - ((packet->rcd->rcvhdrqentsize -
+                                              (rhf_hdrq_offset(packet->rhf)
+                                               + 2)) * 4));
+       }
+
+       /*
+        * Dispatch to the handler registered for this receive type.
+        * etype is deliberately not range checked: an out-of-range value
+        * means something is badly broken already, and this path is too
+        * hot to pay for another comparison.
+        */
+       packet->rcd->dd->rhf_rcv_function_map[packet->etype](packet);
+       packet->numpkt++;
+
+       /* Step to the next entry; go back to 0 on reaching maxcnt */
+       packet->rhqoff += packet->rsize;
+       if (packet->rhqoff >= packet->maxcnt)
+               packet->rhqoff = 0;
+
+       /* At the packet-count boundary: yield in a thread, else stop */
+       if (unlikely(!(packet->numpkt & (MAX_PKT_RECV - 1)))) {
+               if (!thread) {
+                       status = RCV_PKT_LIMIT;
+                       this_cpu_inc(*packet->rcd->dd->rcv_limit);
+               } else {
+                       cond_resched();
+               }
+       }
+
+       /* Load the RHF of the entry that is now current */
+       packet->rhf_addr = (__le32 *)packet->rcd->rcvhdrq + packet->rhqoff +
+                          packet->rcd->dd->rhf_offset;
+       packet->rhf = rhf_to_cpu(packet->rhf_addr);
+
+       return status;
+}
+
+static inline void process_rcv_update(int last, struct hfi1_packet *packet)
+{
+       /*
+        * Intermediate head update: while more packets remain, write the
+        * head registers whenever the low four bits of the packet count
+        * are clear, so a nearly full rcvhdrq is less likely to overflow
+        * during a long run.  No interrupt is requested for these.
+        */
+       if (!(last || (packet->numpkt & 0xf))) {
+               update_usrhead(packet->rcd, packet->rhqoff, packet->updegr,
+                              packet->etail, 0, 0);
+               packet->updegr = 0;
+       }
+
+       packet->rcv_flags = 0;
+}
+
+static inline void finish_packet(struct hfi1_packet *packet)
+{
+       struct hfi1_ctxtdata *rcd = packet->rcd;
+
+       /*
+        * There is nothing to free for a packet.  All that remains is the
+        * final head update, which also asks for an interrupt.
+        */
+       update_usrhead(rcd, rcd->head, packet->updegr, packet->etail,
+                      rcv_intr_dynamic, packet->numpkt);
+}
+
+static inline void process_rcv_qp_work(struct hfi1_packet *packet)
+{
+       struct hfi1_ctxtdata *rcd = packet->rcd;
+       struct rvt_qp *qp, *tmp;
+       unsigned long flags;
+
+       /* Record how far the header queue has been consumed */
+       rcd->head = packet->rhqoff;
+
+       /*
+        * Drain the list of QPs waiting to respond.  The IRQ only runs on
+        * one CPU, so the list does not change underneath us.
+        */
+       list_for_each_entry_safe(qp, tmp, &rcd->qp_wait_list, rspwait) {
+               list_del_init(&qp->rspwait);
+
+               if (qp->r_flags & RVT_R_RSP_NAK) {
+                       qp->r_flags &= ~RVT_R_RSP_NAK;
+                       hfi1_send_rc_ack(rcd, qp, 0);
+               }
+
+               if (qp->r_flags & RVT_R_RSP_SEND) {
+                       qp->r_flags &= ~RVT_R_RSP_SEND;
+                       spin_lock_irqsave(&qp->s_lock, flags);
+                       if (ib_rvt_state_ops[qp->state] &
+                           RVT_PROCESS_OR_FLUSH_SEND)
+                               hfi1_schedule_send(qp);
+                       spin_unlock_irqrestore(&qp->s_lock, flags);
+               }
+
+               /* Drop one reference; wake waiters if the count hit zero */
+               if (atomic_dec_and_test(&qp->refcount))
+                       wake_up(&qp->wait);
+       }
+}
+
+/*
+ * Receive interrupt handler for the no dma rtail option: the end of the
+ * queue is detected from the RHF sequence number instead of a tail value.
+ */
+int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd, int thread)
+{
+       struct hfi1_packet packet;
+       int last = RCV_PKT_DONE;
+       u32 seq;
+
+       init_packet(rcd, &packet);
+       seq = rhf_rcv_seq(packet.rhf);
+
+       /* Only process when the first RHF carries the expected sequence */
+       if (seq == rcd->seq_cnt) {
+               prescan_rxq(rcd, &packet);
+
+               do {
+                       last = process_rcv_packet(&packet, thread);
+                       seq = rhf_rcv_seq(packet.rhf);
+                       /* expected sequence counts 1..13 and wraps to 1 */
+                       if (++rcd->seq_cnt > 13)
+                               rcd->seq_cnt = 1;
+                       if (seq != rcd->seq_cnt)
+                               last = RCV_PKT_DONE;
+                       process_rcv_update(last, &packet);
+               } while (last == RCV_PKT_OK);
+
+               process_rcv_qp_work(&packet);
+       }
+
+       finish_packet(&packet);
+       return last;
+}
+
+int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd, int thread)
+{
+       struct hfi1_packet packet;
+       int last = RCV_PKT_DONE;
+       u32 hdrqtail;
+
+       init_packet(rcd, &packet);
+       hdrqtail = get_rcvhdrtail(rcd);
+
+       /* Head equal to tail means there is nothing to receive */
+       if (packet.rhqoff != hdrqtail) {
+               smp_rmb();  /* prevent speculative reads of dma'ed hdrq */
+
+               prescan_rxq(rcd, &packet);
+
+               do {
+                       last = process_rcv_packet(&packet, thread);
+                       if (packet.rhqoff == hdrqtail)
+                               last = RCV_PKT_DONE;
+                       process_rcv_update(last, &packet);
+               } while (last == RCV_PKT_OK);
+
+               process_rcv_qp_work(&packet);
+       }
+
+       finish_packet(&packet);
+       return last;
+}
+
+static inline void set_all_nodma_rtail(struct hfi1_devdata *dd)
+{
+       int ctxt;
+
+       /* Contexts after the control context, up to the first user one */
+       for (ctxt = HFI1_CTRL_CTXT + 1; ctxt < dd->first_user_ctxt; ++ctxt) {
+               struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
+
+               rcd->do_interrupt = &handle_receive_interrupt_nodma_rtail;
+       }
+}
+
+static inline void set_all_dma_rtail(struct hfi1_devdata *dd)
+{
+       int ctxt;
+
+       /* Contexts after the control context, up to the first user one */
+       for (ctxt = HFI1_CTRL_CTXT + 1; ctxt < dd->first_user_ctxt; ++ctxt) {
+               struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
+
+               rcd->do_interrupt = &handle_receive_interrupt_dma_rtail;
+       }
+}
+
+void set_all_slowpath(struct hfi1_devdata *dd)
+{
+       int ctxt;
+
+       /*
+        * HFI1_CTRL_CTXT is skipped: it must always use the slow path
+        * interrupt handler.
+        */
+       for (ctxt = HFI1_CTRL_CTXT + 1; ctxt < dd->first_user_ctxt; ++ctxt) {
+               struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
+
+               rcd->do_interrupt = &handle_receive_interrupt;
+       }
+}
+
+static inline int set_armed_to_active(struct hfi1_ctxtdata *rcd,
+                                     struct hfi1_packet packet,
+                                     struct hfi1_devdata *dd)
+{
+       struct hfi1_message_header *hdr = hfi1_get_msgheader(packet.rcd->dd,
+                                                            packet.rhf_addr);
+       int hwstate;
+
+       /* Packets whose SC is 0xf never trigger the transition */
+       if (hdr2sc(hdr, packet.rhf) == 0xf)
+               return 0;
+
+       /* Only proceed when the hardware already reports ACTIVE */
+       hwstate = read_logical_state(dd);
+       if (hwstate != LSTATE_ACTIVE) {
+               dd_dev_info(dd, "Unexpected link state %d\n", hwstate);
+               return 0;
+       }
+
+       /* Returns 1 to tell the caller the state change has been queued */
+       queue_work(rcd->ppd->hfi1_wq, &rcd->ppd->linkstate_active_work);
+       return 1;
+}
+
+/*
+ * handle_receive_interrupt - receive a packet
+ * @rcd: the context
+ * @thread: passed through to process_rcv_packet()/skip_rcv_packet(); when
+ *          non-zero they call cond_resched() at the packet limit instead of
+ *          returning RCV_PKT_LIMIT
+ *
+ * Called from interrupt handler for errors or receive interrupt.
+ * This is the slow path interrupt handler.
+ *
+ * Returns the last RCV_PKT_* status: RCV_PKT_DONE once no further packets
+ * are pending, RCV_PKT_LIMIT when processing stopped at the packet limit,
+ * or RCV_PKT_OK when bailing out early because the link state change was
+ * queued.
+ */
+int handle_receive_interrupt(struct hfi1_ctxtdata *rcd, int thread)
+{
+       struct hfi1_devdata *dd = rcd->dd;
+       u32 hdrqtail;
+       int needset, last = RCV_PKT_OK;
+       struct hfi1_packet packet;
+       int skip_pkt = 0;
+
+       /*
+        * Control context will always use the slow path interrupt handler,
+        * so only other contexts (needset == 1) switch handlers below.
+        */
+       needset = (rcd->ctxt == HFI1_CTRL_CTXT) ? 0 : 1;
+
+       init_packet(rcd, &packet);
+
+       if (!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
+               u32 seq = rhf_rcv_seq(packet.rhf);
+
+               if (seq != rcd->seq_cnt) {
+                       last = RCV_PKT_DONE;
+                       goto bail;
+               }
+               hdrqtail = 0;
+       } else {
+               hdrqtail = get_rcvhdrtail(rcd);
+               if (packet.rhqoff == hdrqtail) {
+                       last = RCV_PKT_DONE;
+                       goto bail;
+               }
+               smp_rmb();  /* prevent speculative reads of dma'ed hdrq */
+
+               /*
+                * Control context can potentially receive an invalid
+                * rhf. Drop such packets.
+                */
+               if (rcd->ctxt == HFI1_CTRL_CTXT) {
+                       u32 seq = rhf_rcv_seq(packet.rhf);
+
+                       if (seq != rcd->seq_cnt)
+                               skip_pkt = 1;
+               }
+       }
+
+       prescan_rxq(rcd, &packet);
+
+       while (last == RCV_PKT_OK) {
+               if (unlikely(dd->do_drop &&
+                            atomic_xchg(&dd->drop_packet, DROP_PACKET_OFF) ==
+                            DROP_PACKET_ON)) {
+                       dd->do_drop = 0;
+
+                       /*
+                        * On to the next packet
+                        *
+                        * NOTE(review): unlike skip_rcv_packet() and
+                        * process_rcv_packet(), rhqoff is not reset to 0
+                        * when it reaches maxcnt here, and numpkt is not
+                        * advanced - confirm this is intentional.
+                        */
+                       packet.rhqoff += packet.rsize;
+                       packet.rhf_addr = (__le32 *)rcd->rcvhdrq +
+                                         packet.rhqoff +
+                                         dd->rhf_offset;
+                       packet.rhf = rhf_to_cpu(packet.rhf_addr);
+
+               } else if (skip_pkt) {
+                       last = skip_rcv_packet(&packet, thread);
+                       skip_pkt = 0;
+               } else {
+                       /*
+                        * Auto activate link on non-SC15 packet receive.
+                        * When the state change was queued, stop here
+                        * without consuming the packet.
+                        */
+                       if (unlikely(rcd->ppd->host_link_state ==
+                                    HLS_UP_ARMED) &&
+                           set_armed_to_active(rcd, packet, dd))
+                               goto bail;
+                       last = process_rcv_packet(&packet, thread);
+               }
+
+               if (!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
+                       u32 seq = rhf_rcv_seq(packet.rhf);
+
+                       if (++rcd->seq_cnt > 13)
+                               rcd->seq_cnt = 1;
+                       if (seq != rcd->seq_cnt)
+                               last = RCV_PKT_DONE;
+                       if (needset) {
+                               dd_dev_info(dd, "Switching to NO_DMA_RTAIL\n");
+                               set_all_nodma_rtail(dd);
+                               needset = 0;
+                       }
+               } else {
+                       if (packet.rhqoff == hdrqtail)
+                               last = RCV_PKT_DONE;
+                       /*
+                        * Control context can potentially receive an invalid
+                        * rhf. Drop such packets.
+                        */
+                       if (rcd->ctxt == HFI1_CTRL_CTXT) {
+                               u32 seq = rhf_rcv_seq(packet.rhf);
+
+                               if (++rcd->seq_cnt > 13)
+                                       rcd->seq_cnt = 1;
+                               if (!last && (seq != rcd->seq_cnt))
+                                       skip_pkt = 1;
+                       }
+
+                       if (needset) {
+                               dd_dev_info(dd,
+                                           "Switching to DMA_RTAIL\n");
+                               set_all_dma_rtail(dd);
+                               needset = 0;
+                       }
+               }
+
+               process_rcv_update(last, &packet);
+       }
+
+       process_rcv_qp_work(&packet);
+
+bail:
+       /*
+        * Always write head at end, and setup rcv interrupt, even
+        * if no packets were processed.
+        */
+       finish_packet(&packet);
+       return last;
+}
+
+/*
+ * We may discover in the interrupt that the hardware link state has
+ * changed from ARMED to ACTIVE (due to the arrival of a non-SC15 packet),
+ * and we need to update the driver's notion of the link state.  We cannot
+ * run set_link_state from interrupt context, so we queue this function on
+ * a workqueue.
+ *
+ * We delay the regular interrupt processing until after the state changes
+ * so that the link will be in the correct state by the time any application
+ * we wake up attempts to send a reply to any message it received.
+ * (Subsequent receive interrupts may possibly force the wakeup before we
+ * update the link state.)
+ *
+ * The rcd is freed in hfi1_free_ctxtdata after hfi1_postinit_cleanup invokes
+ * dd->f_cleanup(dd) to disable the interrupt handler and flush workqueues,
+ * so we're safe from use-after-free of the rcd.
+ */
+void receive_interrupt_work(struct work_struct *work)
+{
+       struct hfi1_pportdata *ppd;
+       struct hfi1_devdata *dd;
+       int ctxt;
+
+       ppd = container_of(work, struct hfi1_pportdata, linkstate_active_work);
+       dd = ppd->dd;
+
+       /* Received non-SC15 packet implies neighbor_normal */
+       ppd->neighbor_normal = 1;
+       set_link_state(ppd, HLS_UP_ACTIVE);
+
+       /*
+        * Re-raise the receive interrupt on every kernel context, since
+        * any of them may have had one during auto activation.
+        */
+       for (ctxt = HFI1_CTRL_CTXT; ctxt < dd->first_user_ctxt; ++ctxt)
+               force_recv_intr(dd->rcd[ctxt]);
+}
+
+/*
+ * Convert a given MTU size to the on-wire MAD packet enumeration.
+ * Return default_if_bad if the size has no enumeration.
+ */
+int mtu_to_enum(u32 mtu, int default_if_bad)
+{
+       int opa_mtu;
+
+       switch (mtu) {
+       case 0:
+               opa_mtu = OPA_MTU_0;
+               break;
+       case 256:
+               opa_mtu = OPA_MTU_256;
+               break;
+       case 512:
+               opa_mtu = OPA_MTU_512;
+               break;
+       case 1024:
+               opa_mtu = OPA_MTU_1024;
+               break;
+       case 2048:
+               opa_mtu = OPA_MTU_2048;
+               break;
+       case 4096:
+               opa_mtu = OPA_MTU_4096;
+               break;
+       case 8192:
+               opa_mtu = OPA_MTU_8192;
+               break;
+       case 10240:
+               opa_mtu = OPA_MTU_10240;
+               break;
+       default:
+               opa_mtu = default_if_bad;
+               break;
+       }
+
+       return opa_mtu;
+}
+
+/*
+ * Convert an OPA_MTU_* enumeration to the MTU size it stands for.
+ * Return 0xffff if the enumeration is not recognized.
+ */
+u16 enum_to_mtu(int mtu)
+{
+       u16 size;
+
+       switch (mtu) {
+       case OPA_MTU_0:
+               size = 0;
+               break;
+       case OPA_MTU_256:
+               size = 256;
+               break;
+       case OPA_MTU_512:
+               size = 512;
+               break;
+       case OPA_MTU_1024:
+               size = 1024;
+               break;
+       case OPA_MTU_2048:
+               size = 2048;
+               break;
+       case OPA_MTU_4096:
+               size = 4096;
+               break;
+       case OPA_MTU_8192:
+               size = 8192;
+               break;
+       case OPA_MTU_10240:
+               size = 10240;
+               break;
+       default:
+               size = 0xffff;
+               break;
+       }
+
+       return size;
+}
+
+/*
+ * set_mtu - set the MTU
+ * @ppd: the per port data
+ *
+ * Any incoming size can be handled; what matters here is whether the
+ * outgoing size has to be restricted.  Programs that are already running
+ * when the size changes are not dealt with.
+ *
+ * Returns 0 on success, or the error from stop_drain_data_vls() when the
+ * VLs could not be stopped and drained (the MTU is then left unprogrammed).
+ */
+int set_mtu(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       int vl, link_up, drain;
+       int ret = 0;
+
+       /* The port MTU is the largest per-VL MTU among the supported VLs */
+       ppd->ibmtu = 0;
+       for (vl = 0; vl < ppd->vls_supported; vl++) {
+               if (ppd->ibmtu < dd->vld[vl].mtu)
+                       ppd->ibmtu = dd->vld[vl].mtu;
+       }
+       ppd->ibmaxlen = ppd->ibmtu + lrh_max_header_bytes(ppd->dd);
+
+       mutex_lock(&ppd->hls_lock);
+       link_up = (ppd->host_link_state == HLS_UP_INIT ||
+                  ppd->host_link_state == HLS_UP_ARMED ||
+                  ppd->host_link_state == HLS_UP_ACTIVE) ? 1 : 0;
+
+       drain = !is_ax(dd) && link_up;
+
+       /*
+        * MTU is specified per-VL.  So that no packet gets stuck (e.g.
+        * because the MTU of its VL was reduced), the per-VL FIFOs are
+        * emptied before the MTU is adjusted.
+        */
+       if (drain)
+               ret = stop_drain_data_vls(dd);
+
+       if (!ret) {
+               hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_MTU, 0);
+
+               if (drain)
+                       open_fill_data_vls(dd); /* reopen all VLs */
+       } else {
+               dd_dev_err(dd, "%s: cannot stop/drain VLs - refusing to change per-VL MTUs\n",
+                          __func__);
+       }
+
+       mutex_unlock(&ppd->hls_lock);
+
+       return ret;
+}
+
+/*
+ * Record the LID and LMC on the port, apply them through
+ * hfi1_set_ib_cfg(HFI1_IB_CFG_LIDLMC) and log the new LID.
+ * Always returns 0.
+ */
+int hfi1_set_lid(struct hfi1_pportdata *ppd, u32 lid, u8 lmc)
+{
+       ppd->lid = lid;
+       ppd->lmc = lmc;
+       hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LIDLMC, 0);
+
+       dd_dev_info(ppd->dd, "port %u: got a lid: 0x%x\n", ppd->port, lid);
+
+       return 0;
+}
+
+void shutdown_led_override(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+
+       /*
+        * Stop the LED override timer for this port, if it is marked
+        * active, and then write 0 to DCC_CFG_LED_CNTRL.
+        *
+        * This pairs with the memory barrier in hfi1_start_led_override to
+        * ensure that we read the correct state of LED beaconing represented
+        * by led_override_timer_active
+        */
+       smp_rmb();
+       if (atomic_read(&ppd->led_override_timer_active)) {
+               del_timer_sync(&ppd->led_override_timer);
+               atomic_set(&ppd->led_override_timer_active, 0);
+               /* Ensure the atomic_set is visible to all CPUs */
+               smp_wmb();
+       }
+
+       /*
+        * Hand control of the LED to the DC for normal operation.  This is
+        * done whether or not a timer was active.
+        */
+       write_csr(dd, DCC_CFG_LED_CNTRL, 0);
+}
+
+/*
+ * Timer callback for the LED override: apply the current phase to the LED,
+ * flip the phase, and re-arm the timer for the applied phase's duration.
+ * @opaque is the struct hfi1_pportdata pointer given to setup_timer().
+ */
+static void run_led_override(unsigned long opaque)
+{
+       struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)opaque;
+       struct hfi1_devdata *dd = ppd->dd;
+       int phase;
+
+       /* Without HFI1_INITTED the timer is simply not re-armed */
+       if (!(dd->flags & HFI1_INITTED))
+               return;
+
+       phase = ppd->led_override_phase & 1;
+       setextled(dd, phase);
+
+       /* Set up for next phase */
+       ppd->led_override_phase = !ppd->led_override_phase;
+
+       mod_timer(&ppd->led_override_timer,
+                 jiffies + ppd->led_override_vals[phase]);
+}
+
+/*
+ * To have the LED blink in a particular pattern, provide timeon and timeoff
+ * in milliseconds.
+ * To turn off custom blinking and return to normal operation, use
+ * shutdown_led_override()
+ *
+ * @ppd: port whose LED is to be overridden
+ * @timeon: duration of the LED on phase, in milliseconds
+ * @timeoff: duration of the LED off phase, in milliseconds
+ *
+ * Does nothing unless HFI1_INITTED is set in the device flags.  When the
+ * timer is already pending, only the durations and the phase are updated.
+ */
+void hfi1_start_led_override(struct hfi1_pportdata *ppd, unsigned int timeon,
+                            unsigned int timeoff)
+{
+       if (!(ppd->dd->flags & HFI1_INITTED))
+               return;
+
+       /*
+        * Convert to jiffies for direct use in timer.  Index 0 holds the
+        * off duration and index 1 the on duration, matching the phase
+        * value run_led_override() uses as the index.
+        */
+       ppd->led_override_vals[0] = msecs_to_jiffies(timeoff);
+       ppd->led_override_vals[1] = msecs_to_jiffies(timeon);
+
+       /* Arbitrarily start from LED on phase */
+       ppd->led_override_phase = 1;
+
+       /*
+        * If the timer has not already been started, do so. Use a "quick"
+        * timeout so the handler will be called soon to look at our request.
+        */
+       if (!timer_pending(&ppd->led_override_timer)) {
+               setup_timer(&ppd->led_override_timer, run_led_override,
+                           (unsigned long)ppd);
+               ppd->led_override_timer.expires = jiffies + 1;
+               add_timer(&ppd->led_override_timer);
+               atomic_set(&ppd->led_override_timer_active, 1);
+               /* Ensure the atomic_set is visible to all CPUs */
+               smp_wmb();
+       }
+}
+
+/**
+ * hfi1_reset_device - reset the chip if possible
+ * @unit: the device to reset
+ *
+ * Shuts down the LED override on every port, tears down send DMA when the
+ * device has it, resets the CPU counters and then re-initializes the chip
+ * through hfi1_init() (much like a driver unload/reload).  For now this is
+ * only allowed when no user context is in use.
+ *
+ * Returns -ENODEV if @unit is unknown, -ENXIO if the device is not mapped
+ * or not present, -EBUSY if a user context is in use, otherwise the result
+ * of hfi1_init().
+ */
+int hfi1_reset_device(int unit)
+{
+       struct hfi1_devdata *dd = hfi1_lookup(unit);
+       unsigned long flags;
+       int busy = 0;
+       int ret, i, pidx;
+
+       if (!dd)
+               return -ENODEV;
+
+       dd_dev_info(dd, "Reset on unit %u requested\n", unit);
+
+       if (!dd->kregbase || !(dd->flags & HFI1_PRESENT)) {
+               dd_dev_info(dd,
+                           "Invalid unit number %u or not initialized or not present\n",
+                           unit);
+               return -ENXIO;
+       }
+
+       /* Refuse while any user context has a non-zero count */
+       spin_lock_irqsave(&dd->uctxt_lock, flags);
+       if (dd->rcd) {
+               for (i = dd->first_user_ctxt; i < dd->num_rcv_contexts; i++) {
+                       if (dd->rcd[i] && dd->rcd[i]->cnt) {
+                               busy = 1;
+                               break;
+                       }
+               }
+       }
+       spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+
+       if (busy)
+               return -EBUSY;
+
+       for (pidx = 0; pidx < dd->num_pports; ++pidx)
+               shutdown_led_override(dd->pport + pidx);
+
+       if (dd->flags & HFI1_HAS_SEND_DMA)
+               sdma_exit(dd);
+
+       hfi1_reset_cpu_counters(dd);
+
+       ret = hfi1_init(dd, 1);
+       if (!ret)
+               dd_dev_info(dd, "Reinitialized unit %u after resetting\n",
+                           unit);
+       else
+               dd_dev_err(dd,
+                          "Reinitialize unit %u after reset failed with %d\n",
+                          unit, ret);
+
+       return ret;
+}
+
+/*
+ * Pass the packet to rcv_hdrerr() and, when the RHF has error flags set,
+ * log the context, the raw RHF, the name of each error bit that is set,
+ * and the receive type error.
+ */
+void handle_eflags(struct hfi1_packet *packet)
+{
+       struct hfi1_ctxtdata *rcd = packet->rcd;
+       u32 rte = rhf_rcv_type_err(packet->rhf);
+
+       rcv_hdrerr(rcd, rcd->ppd, packet);
+
+       if (!rhf_err_flags(packet->rhf))
+               return;
+
+       dd_dev_err(rcd->dd,
+                  "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n",
+                  rcd->ctxt, packet->rhf,
+                  packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "",
+                  packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "",
+                  packet->rhf & RHF_DC_ERR ? "dc " : "",
+                  packet->rhf & RHF_TID_ERR ? "tid " : "",
+                  packet->rhf & RHF_LEN_ERR ? "len " : "",
+                  packet->rhf & RHF_ECC_ERR ? "ecc " : "",
+                  packet->rhf & RHF_VCRC_ERR ? "vcrc " : "",
+                  packet->rhf & RHF_ICRC_ERR ? "icrc " : "",
+                  rte);
+}
+
+/*
+ * The following functions are called by the interrupt handler. They are type
+ * specific handlers for each packet type.
+ */
+int process_receive_ib(struct hfi1_packet *packet)
+{
+       trace_hfi1_rcvhdr(packet->rcd->ppd->dd, packet->rcd->ctxt,
+                         rhf_err_flags(packet->rhf), RHF_RCV_TYPE_IB,
+                         packet->hlen, packet->tlen, packet->updegr,
+                         rhf_egr_index(packet->rhf));
+
+       /* Packets with RHF errors go to handle_eflags(), others to verbs */
+       if (likely(!rhf_err_flags(packet->rhf)))
+               hfi1_ib_rcv(packet);
+       else
+               handle_eflags(packet);
+
+       return RHF_RCV_CONTINUE;
+}
+
+int process_receive_bypass(struct hfi1_packet *packet)
+{
+       struct hfi1_devdata *dd = packet->rcd->dd;
+
+       /* Report RHF errors first; the packet is dropped either way */
+       if (unlikely(rhf_err_flags(packet->rhf) != 0))
+               handle_eflags(packet);
+
+       dd_dev_err(dd,
+                  "Bypass packets are not supported in normal operation. Dropping\n");
+
+       return RHF_RCV_CONTINUE;
+}
+
+int process_receive_error(struct hfi1_packet *packet)
+{
+       struct hfi1_devdata *dd = packet->rcd->dd;
+
+       /* handle_eflags() runs unconditionally for this receive type */
+       handle_eflags(packet);
+
+       if (unlikely(rhf_err_flags(packet->rhf) != 0))
+               dd_dev_err(dd,
+                          "Unhandled error packet received. Dropping.\n");
+
+       return RHF_RCV_CONTINUE;
+}
+
+int kdeth_process_expected(struct hfi1_packet *packet)
+{
+       struct hfi1_devdata *dd = packet->rcd->dd;
+
+       /* Report RHF errors first; the packet is dropped either way */
+       if (unlikely(rhf_err_flags(packet->rhf) != 0))
+               handle_eflags(packet);
+
+       dd_dev_err(dd, "Unhandled expected packet received. Dropping.\n");
+
+       return RHF_RCV_CONTINUE;
+}
+
+int kdeth_process_eager(struct hfi1_packet *packet)
+{
+       struct hfi1_devdata *dd = packet->rcd->dd;
+
+       /* Report RHF errors first; the packet is dropped either way */
+       if (unlikely(rhf_err_flags(packet->rhf) != 0))
+               handle_eflags(packet);
+
+       dd_dev_err(dd, "Unhandled eager packet received. Dropping.\n");
+
+       return RHF_RCV_CONTINUE;
+}
+
+int process_receive_invalid(struct hfi1_packet *packet)
+{
+       struct hfi1_devdata *dd = packet->rcd->dd;
+
+       /* Log the receive type taken from the RHF, then drop the packet */
+       dd_dev_err(dd, "Invalid packet type %d. Dropping\n",
+                  rhf_rcv_type(packet->rhf));
+
+       return RHF_RCV_CONTINUE;
+}
diff --git a/drivers/infiniband/hw/hfi1/efivar.c b/drivers/infiniband/hw/hfi1/efivar.c
new file mode 100644 (file)
index 0000000..106349f
--- /dev/null
@@ -0,0 +1,164 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "efivar.h"
+
+/* GUID for HFI1 variables in EFI */
+#define HFI1_EFIVAR_GUID EFI_GUID(0xc50a953e, 0xa8b2, 0x42a6, \
+               0xbf, 0x89, 0xd3, 0x33, 0xa6, 0xe9, 0xe6, 0xd4)
+/* largest EFI data size we expect */
+#define EFI_DATA_SIZE 4096
+
+/*
+ * Read the EFI variable called name under the HFI1 GUID.
+ *
+ * On success *return_data points at a kmalloc'ed copy of the variable and
+ * *size holds the size of the actual data; the caller must free the
+ * buffer.  On any failure *return_data is NULL and *size is 0.  At most
+ * EFI_DATA_SIZE bytes can be read.
+ *
+ * Return 0 on success, -errno on failure.
+ */
+static int read_efi_var(const char *name, unsigned long *size,
+                       void **return_data)
+{
+       unsigned long len = EFI_DATA_SIZE;
+       efi_guid_t guid = HFI1_EFIVAR_GUID;
+       efi_char16_t *name16;
+       efi_status_t status;
+       void *scratch;
+       void *copy;
+       int ret;
+       int i;
+
+       /* failure defaults for the out parameters */
+       *return_data = NULL;
+       *size = 0;
+
+       if (!efi_enabled(EFI_RUNTIME_SERVICES))
+               return -EOPNOTSUPP;
+
+       name16 = kcalloc(strlen(name) + 1, sizeof(efi_char16_t), GFP_KERNEL);
+       scratch = kzalloc(EFI_DATA_SIZE, GFP_KERNEL);
+       if (!name16 || !scratch) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       /*
+        * Convert ASCII to unicode - it is a 1:1 mapping.  The terminator
+        * is already in place because kcalloc() zeroed the array.
+        */
+       for (i = 0; name[i]; i++)
+               name16[i] = name[i];
+
+       /* len goes in as the capacity of the scratch buffer */
+       status = efi.get_variable(name16, &guid, NULL, &len, scratch);
+
+       /*
+        * efi_status_to_err() would be the natural choice here, but it
+        * lives in the EFIVAR_FS code and may not be compiled in.  It also
+        * does not cover EFI_BUFFER_TOO_SMALL, which could be an important
+        * return.  For now only success and not found are told apart.
+        */
+       if (status == EFI_SUCCESS)
+               ret = 0;
+       else if (status == EFI_NOT_FOUND)
+               ret = -ENOENT;
+       else
+               ret = -EINVAL;
+       if (ret)
+               goto out;
+
+       /* Hand back a copy that is exactly as large as the data read */
+       copy = kmemdup(scratch, len, GFP_KERNEL);
+       if (!copy) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       *return_data = copy;
+       *size = len;
+
+out:
+       kfree(name16);
+       kfree(scratch);
+
+       return ret;
+}
+
+/*
+ * Read an HFI1 EFI variable whose name has the form:
+ *     <PCIe address>-<kind>
+ * The data and its size are handed back as described for read_efi_var():
+ * a kmalloc'ed buffer in *return_data and its size in *size.
+ *
+ * Returns 0 on success, -errno on failure.
+ */
+int read_hfi1_efi_var(struct hfi1_devdata *dd, const char *kind,
+                     unsigned long *size, void **return_data)
+{
+       struct pci_dev *pdev = dd->pcidev;
+       char name[64];
+
+       /* <domain>:<bus>:<slot>.<function> of this device, then the kind */
+       snprintf(name, sizeof(name), "%04x:%02x:%02x.%x-%s",
+                pci_domain_nr(pdev->bus), pdev->bus->number,
+                PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn), kind);
+
+       return read_efi_var(name, size, return_data);
+}
diff --git a/drivers/infiniband/hw/hfi1/efivar.h b/drivers/infiniband/hw/hfi1/efivar.h
new file mode 100644 (file)
index 0000000..94e9e70
--- /dev/null
@@ -0,0 +1,57 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef _HFI1_EFIVAR_H
+#define _HFI1_EFIVAR_H
+
+#include <linux/efi.h>
+
+#include "hfi.h"
+
+int read_hfi1_efi_var(struct hfi1_devdata *dd, const char *kind,
+                     unsigned long *size, void **return_data);
+
+#endif /* _HFI1_EFIVAR_H */
diff --git a/drivers/infiniband/hw/hfi1/eprom.c b/drivers/infiniband/hw/hfi1/eprom.c
new file mode 100644 (file)
index 0000000..36b7794
--- /dev/null
@@ -0,0 +1,102 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/delay.h>
+#include "hfi.h"
+#include "common.h"
+#include "eprom.h"
+
+#define CMD_SHIFT 24 /* shift applied to a command code */
+#define CMD_RELEASE_POWERDOWN_NOID  ((0xab << CMD_SHIFT)) /* NOTE(review): exceeds INT_MAX; should this be 0xabU? */
+
+/* controller interface speeds */
+#define EP_SPEED_FULL 0x2      /* full speed */
+
+/*
+ * How long to wait for the EPROM to become available, in ms.
+ * The spec 32 Mb EPROM takes around 40s to erase then write.
+ * Double it for safety.
+ */
+#define EPROM_TIMEOUT 80000 /* ms */
+/*
+ * Bring the EPROM into a known state and mark it usable.
+ *
+ * Devices other than PCI_DEVICE_ID_INTEL0 are skipped and report success.
+ * Otherwise the CR_EPROM chip resource is held while the EPROM is reset,
+ * set to full speed and woken up; dd->eprom_available is set afterwards.
+ *
+ * Returns 0, or the acquire_chip_resource() error if that call fails.
+ */
+int eprom_init(struct hfi1_devdata *dd)
+{
+       int rc;
+
+       /* only the discrete chip has an EPROM */
+       if (dd->pcidev->device != PCI_DEVICE_ID_INTEL0)
+               return 0;
+
+       /* both HFIs may reset the EPROM, just never at the same time */
+       rc = acquire_chip_resource(dd, CR_EPROM, EPROM_TIMEOUT);
+       if (rc) {
+               dd_dev_err(dd,
+                          "%s: unable to acquire EPROM resource, no EPROM support\n",
+                          __func__);
+               return rc;
+       }
+
+       /* assert reset so the EPROM starts from a good state ... */
+       write_csr(dd, ASIC_EEP_CTL_STAT, ASIC_EEP_CTL_STAT_EP_RESET_SMASK);
+       /* ... then clear reset and select the interface speed */
+       write_csr(dd, ASIC_EEP_CTL_STAT,
+                 EP_SPEED_FULL << ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT);
+
+       /* wake the device with command "release powerdown NoID" */
+       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_RELEASE_POWERDOWN_NOID);
+
+       dd->eprom_available = true;
+       release_chip_resource(dd, CR_EPROM);
+       return 0;
+}
diff --git a/drivers/infiniband/hw/hfi1/eprom.h b/drivers/infiniband/hw/hfi1/eprom.h
new file mode 100644 (file)
index 0000000..d41f0b1
--- /dev/null
@@ -0,0 +1,52 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+struct hfi1_cmd;
+struct hfi1_devdata;
+
+int eprom_init(struct hfi1_devdata *dd);
+int handle_eprom_command(struct file *fp, const struct hfi1_cmd *cmd);
diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c
new file mode 100644 (file)
index 0000000..7a5b0e6
--- /dev/null
@@ -0,0 +1,1498 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/poll.h>
+#include <linux/cdev.h>
+#include <linux/vmalloc.h>
+#include <linux/io.h>
+
+#include <rdma/ib.h>
+
+#include "hfi.h"
+#include "pio.h"
+#include "device.h"
+#include "common.h"
+#include "trace.h"
+#include "user_sdma.h"
+#include "user_exp_rcv.h"
+#include "eprom.h"
+#include "aspm.h"
+#include "mmu_rb.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) DRIVER_NAME ": " fmt
+
+#define SEND_CTXT_HALT_TIMEOUT 1000 /* msecs */
+
+/*
+ * File operation functions
+ */
+static int hfi1_file_open(struct inode *, struct file *);
+static int hfi1_file_close(struct inode *, struct file *);
+static ssize_t hfi1_write_iter(struct kiocb *, struct iov_iter *);
+static unsigned int hfi1_poll(struct file *, struct poll_table_struct *);
+static int hfi1_file_mmap(struct file *, struct vm_area_struct *);
+
+static u64 kvirt_to_phys(void *);
+static int assign_ctxt(struct file *, struct hfi1_user_info *);
+static int init_subctxts(struct hfi1_ctxtdata *, const struct hfi1_user_info *);
+static int user_init(struct file *);
+static int get_ctxt_info(struct file *, void __user *, __u32);
+static int get_base_info(struct file *, void __user *, __u32);
+static int setup_ctxt(struct file *);
+static int setup_subctxt(struct hfi1_ctxtdata *);
+static int get_user_context(struct file *, struct hfi1_user_info *, int);
+static int find_shared_ctxt(struct file *, const struct hfi1_user_info *);
+static int allocate_ctxt(struct file *, struct hfi1_devdata *,
+                        struct hfi1_user_info *);
+static unsigned int poll_urgent(struct file *, struct poll_table_struct *);
+static unsigned int poll_next(struct file *, struct poll_table_struct *);
+static int user_event_ack(struct hfi1_ctxtdata *, int, unsigned long);
+static int set_ctxt_pkey(struct hfi1_ctxtdata *, unsigned, u16);
+static int manage_rcvq(struct hfi1_ctxtdata *, unsigned, int);
+static int vma_fault(struct vm_area_struct *, struct vm_fault *);
+static long hfi1_file_ioctl(struct file *fp, unsigned int cmd,
+                           unsigned long arg);
+
+static const struct file_operations hfi1_file_ops = {
+       .owner = THIS_MODULE,
+       .write_iter = hfi1_write_iter, /* user SDMA request submission */
+       .open = hfi1_file_open,
+       .release = hfi1_file_close,
+       .unlocked_ioctl = hfi1_file_ioctl, /* HFI1_IOCTL_* commands */
+       .poll = hfi1_poll,
+       .mmap = hfi1_file_mmap, /* token-selected mappings, see mmap_types */
+       .llseek = noop_llseek,
+};
+
+static const struct vm_operations_struct vm_ops = { /* never modified */
+       .fault = vma_fault, /* faults are resolved by vma_fault() */
+};
+
+/*
+ * Types of memories mapped into user processes' space
+ */
+enum mmap_types {
+       PIO_BUFS = 1,           /* context's PIO send buffer space */
+       PIO_BUFS_SOP,           /* same space, offset by TXE_PIO_SIZE / 2 */
+       PIO_CRED,               /* page holding the credit return location */
+       RCV_HDRQ,               /* uctxt->rcvhdrq_phys */
+       RCV_EGRBUF,             /* the uctxt->egrbufs eager buffers */
+       UREGS,                  /* page with the context's user registers */
+       EVENTS,                 /* page of dd->events with the context's flags */
+       STATUS,                 /* dd->status */
+       RTAIL,                  /* uctxt->rcvhdrqtailaddr_phys */
+       SUBCTXT_UREGS,          /* uctxt->subctxt_uregbase */
+       SUBCTXT_RCV_HDRQ,       /* uctxt->subctxt_rcvhdr_base */
+       SUBCTXT_EGRBUF,         /* uctxt->subctxt_rcvegrbuf */
+       SDMA_COMP               /* fd->cq->comps */
+};
+
+/*
+ * Masks and offsets defining the mmap tokens
+ */
+#define HFI1_MMAP_OFFSET_MASK   0xfffULL       /* token bits 0-11 */
+#define HFI1_MMAP_OFFSET_SHIFT  0
+#define HFI1_MMAP_SUBCTXT_MASK  0xfULL         /* token bits 12-15 */
+#define HFI1_MMAP_SUBCTXT_SHIFT 12
+#define HFI1_MMAP_CTXT_MASK     0xffULL        /* token bits 16-23 */
+#define HFI1_MMAP_CTXT_SHIFT    16
+#define HFI1_MMAP_TYPE_MASK     0xfULL         /* token bits 24-27 */
+#define HFI1_MMAP_TYPE_SHIFT    24
+#define HFI1_MMAP_MAGIC_MASK    0xffffffffULL  /* token bits 32-63 */
+#define HFI1_MMAP_MAGIC_SHIFT   32
+
+#define HFI1_MMAP_MAGIC         0xdabbad00     /* expected magic field value */
+
+#define HFI1_MMAP_TOKEN_SET(field, val)        \
+       (((val) & HFI1_MMAP_##field##_MASK) << HFI1_MMAP_##field##_SHIFT)
+#define HFI1_MMAP_TOKEN_GET(field, token) \
+       (((token) >> HFI1_MMAP_##field##_SHIFT) & HFI1_MMAP_##field##_MASK)
+#define HFI1_MMAP_TOKEN(type, ctxt, subctxt, addr)   \
+       (HFI1_MMAP_TOKEN_SET(MAGIC, HFI1_MMAP_MAGIC) | \
+       HFI1_MMAP_TOKEN_SET(TYPE, type) | \
+       HFI1_MMAP_TOKEN_SET(CTXT, ctxt) | \
+       HFI1_MMAP_TOKEN_SET(SUBCTXT, subctxt) | \
+       HFI1_MMAP_TOKEN_SET(OFFSET, (offset_in_page(addr))))
+
+#define dbg(fmt, ...)                          \
+       pr_info(fmt, ##__VA_ARGS__)
+
+static inline int is_valid_mmap(u64 token)
+{
+       return HFI1_MMAP_MAGIC == HFI1_MMAP_TOKEN_GET(MAGIC, token); /* magic ok? */
+}
+
+/* Allocate the per-open hfi1_filedata and take a reference on the device. */
+static int hfi1_file_open(struct inode *inode, struct file *fp)
+{
+       struct hfi1_devdata *dd =
+               container_of(inode->i_cdev, struct hfi1_devdata, user_cdev);
+       struct hfi1_filedata *fd = kzalloc(sizeof(*fd), GFP_KERNEL);
+
+       /* The real work is performed later in assign_ctxt() */
+       fp->private_data = fd;
+       if (!fd) /* fail before taking the ref: no ->release after a failed open */
+               return -ENOMEM;
+       fd->rec_cpu_num = -1; /* no cpu affinity by default */
+       kobject_get(&dd->kobj); /* not all opens result in a context assign */
+       return 0;
+}
+
+/*
+ * Dispatch one HFI1_IOCTL_* command.  Until a context has been assigned
+ * only ASSIGN_CTXT and GET_VERS are accepted; the rest fail with -EINVAL.
+ */
+static long hfi1_file_ioctl(struct file *fp, unsigned int cmd,
+                           unsigned long arg)
+{
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct hfi1_user_info uinfo;
+       struct hfi1_tid_info tinfo;
+       int ret = 0;
+       unsigned long addr;
+       int uval = 0;
+       unsigned long ul_uval = 0;
+       u16 uval16 = 0;
+
+       hfi1_cdbg(IOCTL, "IOCTL recv: 0x%x", cmd);
+       if (cmd != HFI1_IOCTL_ASSIGN_CTXT && cmd != HFI1_IOCTL_GET_VERS &&
+           !uctxt)
+               return -EINVAL;
+
+       switch (cmd) {
+       case HFI1_IOCTL_ASSIGN_CTXT:
+               if (copy_from_user(&uinfo,
+                                  (struct hfi1_user_info __user *)arg,
+                                  sizeof(uinfo)))
+                       return -EFAULT;
+
+               ret = assign_ctxt(fp, &uinfo);
+               if (ret < 0)
+                       return ret;
+               ret = setup_ctxt(fp);
+               if (ret) /* context setup failed */
+                       return ret;
+               ret = user_init(fp);
+               break;
+       case HFI1_IOCTL_CTXT_INFO:
+               ret = get_ctxt_info(fp, (void __user *)(unsigned long)arg,
+                                   sizeof(struct hfi1_ctxt_info));
+               break;
+       case HFI1_IOCTL_USER_INFO:
+               ret = get_base_info(fp, (void __user *)(unsigned long)arg,
+                                   sizeof(struct hfi1_base_info));
+               break;
+       case HFI1_IOCTL_CREDIT_UPD:
+               if (uctxt && uctxt->sc)
+                       sc_return_credits(uctxt->sc);
+               break;
+
+       case HFI1_IOCTL_TID_UPDATE:
+               if (copy_from_user(&tinfo,
+                                  (struct hfi1_tid_info __user *)arg,
+                                  sizeof(tinfo)))
+                       return -EFAULT;
+
+               ret = hfi1_user_exp_rcv_setup(fp, &tinfo);
+               if (!ret) {
+                       /*
+                        * Copy the number of tidlist entries we used
+                        * and the length of the buffer we registered.
+                        * These fields are adjacent in the structure so
+                        * we can copy them at the same time.
+                        */
+                       addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
+                       if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
+                                        sizeof(tinfo.tidcnt) +
+                                        sizeof(tinfo.length)))
+                               ret = -EFAULT;
+               }
+               break;
+
+       case HFI1_IOCTL_TID_FREE:
+               if (copy_from_user(&tinfo,
+                                  (struct hfi1_tid_info __user *)arg,
+                                  sizeof(tinfo)))
+                       return -EFAULT;
+
+               ret = hfi1_user_exp_rcv_clear(fp, &tinfo);
+               if (ret)
+                       break;
+               addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
+               if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
+                                sizeof(tinfo.tidcnt)))
+                       ret = -EFAULT;
+               break;
+
+       case HFI1_IOCTL_TID_INVAL_READ:
+               if (copy_from_user(&tinfo,
+                                  (struct hfi1_tid_info __user *)arg,
+                                  sizeof(tinfo)))
+                       return -EFAULT;
+
+               ret = hfi1_user_exp_rcv_invalid(fp, &tinfo);
+               if (ret)
+                       break;
+               addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
+               if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
+                                sizeof(tinfo.tidcnt)))
+                       ret = -EFAULT;
+               break;
+
+       case HFI1_IOCTL_RECV_CTRL:
+               ret = get_user(uval, (int __user *)arg);
+               if (ret != 0)
+                       return -EFAULT;
+               ret = manage_rcvq(uctxt, fd->subctxt, uval);
+               break;
+
+       case HFI1_IOCTL_POLL_TYPE:
+               ret = get_user(uval, (int __user *)arg);
+               if (ret != 0)
+                       return -EFAULT;
+               uctxt->poll_type = (typeof(uctxt->poll_type))uval;
+               break;
+
+       case HFI1_IOCTL_ACK_EVENT:
+               ret = get_user(ul_uval, (unsigned long __user *)arg);
+               if (ret != 0)
+                       return -EFAULT;
+               ret = user_event_ack(uctxt, fd->subctxt, ul_uval);
+               break;
+
+       case HFI1_IOCTL_SET_PKEY:
+               ret = get_user(uval16, (u16 __user *)arg);
+               if (ret != 0)
+                       return -EFAULT;
+               if (HFI1_CAP_IS_USET(PKEY_CHECK))
+                       ret = set_ctxt_pkey(uctxt, fd->subctxt, uval16);
+               else
+                       return -EPERM;
+               break;
+
+       case HFI1_IOCTL_CTXT_RESET: {
+               struct send_context *sc;
+               struct hfi1_devdata *dd;
+
+               if (!uctxt || !uctxt->dd || !uctxt->sc)
+                       return -EINVAL;
+
+               /*
+                * There is no protection here. User level has to
+                * guarantee that no one will be writing to the send
+                * context while it is being re-initialized.
+                * If user level breaks that guarantee, it will break
+                * its own context and no one else's.
+                */
+               dd = uctxt->dd;
+               sc = uctxt->sc;
+               /*
+                * Wait until the interrupt handler has marked the
+                * context as halted or frozen. Report error if we time
+                * out.
+                */
+               wait_event_interruptible_timeout(
+                       sc->halt_wait, (sc->flags & SCF_HALTED),
+                       msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
+               if (!(sc->flags & SCF_HALTED))
+                       return -ENOLCK;
+
+               /*
+                * If the send context was halted due to a Freeze,
+                * wait until the device has been "unfrozen" before
+                * resetting the context.
+                */
+               if (sc->flags & SCF_FROZEN) {
+                       wait_event_interruptible_timeout(
+                               dd->event_queue,
+                               !(ACCESS_ONCE(dd->flags) & HFI1_FROZEN),
+                               msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
+                       if (dd->flags & HFI1_FROZEN)
+                               return -ENOLCK;
+
+                       /* No context reset while in forced freeze */
+                       if (dd->flags & HFI1_FORCED_FREEZE)
+                               return -ENODEV;
+
+                       sc_disable(sc);
+                       ret = sc_enable(sc);
+                       hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB,
+                                    uctxt->ctxt);
+               } else {
+                       ret = sc_restart(sc);
+               }
+               if (!ret)
+                       sc_return_credits(sc);
+               break;
+       }
+
+       case HFI1_IOCTL_GET_VERS:
+               uval = HFI1_USER_SWVERSION;
+               if (put_user(uval, (int __user *)arg))
+                       return -EFAULT;
+               break;
+
+       default:
+               return -EINVAL;
+       }
+
+       return ret;
+}
+
+/*
+ * Feed the caller's iovec array to hfi1_user_sdma_process_request()
+ * until every segment has been consumed.
+ *
+ * Returns the number of successful calls (requests), -EIO if the file
+ * has no SDMA queues, -EINVAL for a non-iovec or empty iterator, -ENOSPC
+ * if the packet queue is already full, or the first nonzero helper result.
+ */
+static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from)
+{
+       struct file *fp = kiocb->ki_filp;
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_user_sdma_pkt_q *pq = fd->pq;
+       unsigned long left = from->nr_segs;
+       int rc, used = 0, nreqs = 0;
+
+       if (!fd->cq || !pq)
+               return -EIO;
+       if (!iter_is_iovec(from) || !left)
+               return -EINVAL;
+
+       hfi1_cdbg(SDMA, "SDMA request from %u:%u (%lu)",
+                 fd->uctxt->ctxt, fd->subctxt, left);
+
+       if (atomic_read(&pq->n_reqs) == pq->n_max_reqs)
+               return -ENOSPC;
+
+       do {
+               unsigned long taken = 0;
+
+               rc = hfi1_user_sdma_process_request(
+                       fp, (struct iovec *)(from->iov + used), left, &taken);
+               if (rc)
+                       return rc;
+               left -= taken;
+               used += taken;
+               nreqs++;
+       } while (left);
+
+       return nreqs;
+}
+
+static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
+{
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct hfi1_devdata *dd;
+       unsigned long flags, pfn;
+       u64 token = vma->vm_pgoff << PAGE_SHIFT,
+               memaddr = 0;
+       u8 subctxt, mapio = 0, vmf = 0, type;
+       ssize_t memlen = 0;
+       int ret = 0;
+       u16 ctxt;
+
+       if (!is_valid_mmap(token) || !uctxt ||
+           !(vma->vm_flags & VM_SHARED)) {
+               ret = -EINVAL;
+               goto done;
+       }
+       dd = uctxt->dd;
+       ctxt = HFI1_MMAP_TOKEN_GET(CTXT, token);
+       subctxt = HFI1_MMAP_TOKEN_GET(SUBCTXT, token);
+       type = HFI1_MMAP_TOKEN_GET(TYPE, token);
+       if (ctxt != uctxt->ctxt || subctxt != fd->subctxt) {
+               ret = -EINVAL;
+               goto done;
+       }
+
+       flags = vma->vm_flags;
+
+       switch (type) {
+       case PIO_BUFS:
+       case PIO_BUFS_SOP:
+               memaddr = ((dd->physaddr + TXE_PIO_SEND) +
+                               /* chip pio base */
+                          (uctxt->sc->hw_context * BIT(16))) +
+                               /* 64K PIO space / ctxt */
+                       (type == PIO_BUFS_SOP ?
+                               (TXE_PIO_SIZE / 2) : 0); /* sop? */
+               /*
+                * Map only the amount allocated to the context, not the
+                * entire available context's PIO space.
+                */
+               memlen = PAGE_ALIGN(uctxt->sc->credits * PIO_BLOCK_SIZE);
+               flags &= ~VM_MAYREAD;
+               flags |= VM_DONTCOPY | VM_DONTEXPAND;
+               vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+               mapio = 1;
+               break;
+       case PIO_CRED:
+               if (flags & VM_WRITE) {
+                       ret = -EPERM;
+                       goto done;
+               }
+               /*
+                * The credit return location for this context could be on the
+                * second or third page allocated for credit returns (if number
+                * of enabled contexts > 64 and 128 respectively).
+                */
+               memaddr = dd->cr_base[uctxt->numa_id].pa +
+                       (((u64)uctxt->sc->hw_free -
+                         (u64)dd->cr_base[uctxt->numa_id].va) & PAGE_MASK);
+               memlen = PAGE_SIZE;
+               flags &= ~VM_MAYWRITE;
+               flags |= VM_DONTCOPY | VM_DONTEXPAND;
+               /*
+                * The driver has already allocated memory for credit
+                * returns and programmed it into the chip. Has that
+                * memory been flagged as non-cached?
+                */
+               /* vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); */
+               mapio = 1;
+               break;
+       case RCV_HDRQ:
+               memaddr = uctxt->rcvhdrq_phys;
+               memlen = uctxt->rcvhdrq_size;
+               break;
+       case RCV_EGRBUF: {
+               unsigned long addr;
+               int i;
+               /*
+                * The RcvEgr buffer need to be handled differently
+                * as multiple non-contiguous pages need to be mapped
+                * into the user process.
+                */
+               memlen = uctxt->egrbufs.size;
+               if ((vma->vm_end - vma->vm_start) != memlen) {
+                       dd_dev_err(dd, "Eager buffer map size invalid (%lu != %lu)\n",
+                                  (vma->vm_end - vma->vm_start), memlen);
+                       ret = -EINVAL;
+                       goto done;
+               }
+               if (vma->vm_flags & VM_WRITE) {
+                       ret = -EPERM;
+                       goto done;
+               }
+               vma->vm_flags &= ~VM_MAYWRITE;
+               addr = vma->vm_start;
+               for (i = 0 ; i < uctxt->egrbufs.numbufs; i++) {
+                       ret = remap_pfn_range(
+                               vma, addr,
+                               uctxt->egrbufs.buffers[i].phys >> PAGE_SHIFT,
+                               uctxt->egrbufs.buffers[i].len,
+                               vma->vm_page_prot);
+                       if (ret < 0)
+                               goto done;
+                       addr += uctxt->egrbufs.buffers[i].len;
+               }
+               ret = 0;
+               goto done;
+       }
+       case UREGS:
+               /*
+                * Map only the page that contains this context's user
+                * registers.
+                */
+               memaddr = (unsigned long)
+                       (dd->physaddr + RXE_PER_CONTEXT_USER)
+                       + (uctxt->ctxt * RXE_PER_CONTEXT_SIZE);
+               /*
+                * TidFlow table is on the same page as the rest of the
+                * user registers.
+                */
+               memlen = PAGE_SIZE;
+               flags |= VM_DONTCOPY | VM_DONTEXPAND;
+               vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+               mapio = 1;
+               break;
+       case EVENTS:
+               /*
+                * Use the page where this context's flags are. User level
+                * knows where its own bitmap is within the page.
+                */
+               memaddr = (unsigned long)(dd->events +
+                                         ((uctxt->ctxt - dd->first_user_ctxt) *
+                                          HFI1_MAX_SHARED_CTXTS)) & PAGE_MASK;
+               memlen = PAGE_SIZE;
+               /*
+                * v3.7 removes VM_RESERVED but the effect is kept by
+                * using VM_IO.
+                */
+               flags |= VM_IO | VM_DONTEXPAND;
+               vmf = 1;
+               break;
+       case STATUS:
+               memaddr = kvirt_to_phys((void *)dd->status);
+               memlen = PAGE_SIZE;
+               flags |= VM_IO | VM_DONTEXPAND;
+               break;
+       case RTAIL:
+               if (!HFI1_CAP_IS_USET(DMA_RTAIL)) {
+                       /*
+                        * If the memory allocation failed, the context alloc
+                        * also would have failed, so we would never get here
+                        */
+                       ret = -EINVAL;
+                       goto done;
+               }
+               if (flags & VM_WRITE) {
+                       ret = -EPERM;
+                       goto done;
+               }
+               memaddr = uctxt->rcvhdrqtailaddr_phys;
+               memlen = PAGE_SIZE;
+               flags &= ~VM_MAYWRITE;
+               break;
+       case SUBCTXT_UREGS:
+               memaddr = (u64)uctxt->subctxt_uregbase;
+               memlen = PAGE_SIZE;
+               flags |= VM_IO | VM_DONTEXPAND;
+               vmf = 1;
+               break;
+       case SUBCTXT_RCV_HDRQ:
+               memaddr = (u64)uctxt->subctxt_rcvhdr_base;
+               memlen = uctxt->rcvhdrq_size * uctxt->subctxt_cnt;
+               flags |= VM_IO | VM_DONTEXPAND;
+               vmf = 1;
+               break;
+       case SUBCTXT_EGRBUF:
+               memaddr = (u64)uctxt->subctxt_rcvegrbuf;
+               memlen = uctxt->egrbufs.size * uctxt->subctxt_cnt;
+               flags |= VM_IO | VM_DONTEXPAND;
+               flags &= ~VM_MAYWRITE;
+               vmf = 1;
+               break;
+       case SDMA_COMP: {
+               struct hfi1_user_sdma_comp_q *cq = fd->cq;
+
+               if (!cq) {
+                       ret = -EFAULT;
+                       goto done;
+               }
+               memaddr = (u64)cq->comps;
+               memlen = PAGE_ALIGN(sizeof(*cq->comps) * cq->nentries);
+               flags |= VM_IO | VM_DONTEXPAND;
+               vmf = 1;
+               break;
+       }
+       default:
+               ret = -EINVAL;
+               break;
+       }
+
+       if ((vma->vm_end - vma->vm_start) != memlen) {
+               hfi1_cdbg(PROC, "%u:%u Memory size mismatch %lu:%lu",
+                         uctxt->ctxt, fd->subctxt,
+                         (vma->vm_end - vma->vm_start), memlen);
+               ret = -EINVAL;
+               goto done;
+       }
+
+       vma->vm_flags = flags;
+       hfi1_cdbg(PROC,
+                 "%u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n",
+                   ctxt, subctxt, type, mapio, vmf, memaddr, memlen,
+                   vma->vm_end - vma->vm_start, vma->vm_flags);
+       pfn = (unsigned long)(memaddr >> PAGE_SHIFT);
+       if (vmf) {
+               vma->vm_pgoff = pfn;
+               vma->vm_ops = &vm_ops;
+               ret = 0;
+       } else if (mapio) {
+               ret = io_remap_pfn_range(vma, vma->vm_start, pfn, memlen,
+                                        vma->vm_page_prot);
+       } else {
+               ret = remap_pfn_range(vma, vma->vm_start, pfn, memlen,
+                                     vma->vm_page_prot);
+       }
+done:
+       return ret;
+}
+
+/*
+ * Fault handler for the vm_ops installed on kernel-memory mappings.
+ * The page offset of the fault is turned back into a kernel virtual
+ * address and resolved through vmalloc_to_page(), so pages are only
+ * handed to user space when they are actually touched.
+ *
+ * Returns 0 with vmf->page set (and a reference taken on the page), or
+ * VM_FAULT_SIGBUS if no page backs the address.
+ */
+static int vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       void *kvaddr = (void *)(vmf->pgoff << PAGE_SHIFT);
+       struct page *pg = vmalloc_to_page(kvaddr);
+
+       if (pg) {
+               get_page(pg);
+               vmf->page = pg;
+               return 0;
+       }
+
+       return VM_FAULT_SIGBUS;
+}
+
+/*
+ * poll() entry point: dispatch on the context's configured poll type.
+ * Returns POLLERR when no context is attached to the file or when the
+ * poll type is not one of the two known values.
+ */
+static unsigned int hfi1_poll(struct file *fp, struct poll_table_struct *pt)
+{
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+
+       /* no context has been assigned to this file */
+       if (!uctxt)
+               return POLLERR;
+
+       if (uctxt->poll_type == HFI1_POLL_TYPE_URGENT)
+               return poll_urgent(fp, pt);
+       if (uctxt->poll_type == HFI1_POLL_TYPE_ANYRCV)
+               return poll_next(fp, pt);
+
+       /* invalid poll type */
+       return POLLERR;
+}
+
+/*
+ * release() handler for the per-unit user device.
+ *
+ * Detaches the file from its context.  If other sub-contexts still hold
+ * the context (uctxt->cnt stays non-zero after the decrement) only this
+ * sub-context's bookkeeping is cleared.  Otherwise the receive context is
+ * disabled, its keys and integrity checks are reset, and the context data
+ * is freed.  The per-file data is always freed and the device kobject
+ * reference is always dropped.
+ *
+ * Always returns 0.
+ */
+static int hfi1_file_close(struct inode *inode, struct file *fp)
+{
+       struct hfi1_filedata *fdata = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fdata->uctxt;
+       struct hfi1_devdata *dd = container_of(inode->i_cdev,
+                                              struct hfi1_devdata,
+                                              user_cdev);
+       unsigned long flags, *ev;
+
+       fp->private_data = NULL;
+
+       /* file was opened but never got a context assigned */
+       if (!uctxt)
+               goto done;
+
+       hfi1_cdbg(PROC, "freeing ctxt %u:%u", uctxt->ctxt, fdata->subctxt);
+       mutex_lock(&hfi1_mutex);
+
+       flush_wc();
+       /* drain user sdma queue */
+       hfi1_user_sdma_free_queues(fdata);
+
+       /* release the cpu */
+       hfi1_put_proc_affinity(dd, fdata->rec_cpu_num);
+
+       /*
+        * Clear any left over, unhandled events so the next process that
+        * gets this context doesn't get confused.  The events area is
+        * checked for NULL here just as hfi1_set_uevent_bits() and
+        * user_event_ack() do before touching it.
+        */
+       if (dd->events) {
+               ev = dd->events + ((uctxt->ctxt - dd->first_user_ctxt) *
+                                  HFI1_MAX_SHARED_CTXTS) + fdata->subctxt;
+               *ev = 0;
+       }
+
+       /* other sub-contexts still use this context: only detach ourselves */
+       if (--uctxt->cnt) {
+               uctxt->active_slaves &= ~(1 << fdata->subctxt);
+               uctxt->subpid[fdata->subctxt] = 0;
+               mutex_unlock(&hfi1_mutex);
+               goto done;
+       }
+
+       spin_lock_irqsave(&dd->uctxt_lock, flags);
+       /*
+        * Disable receive context and interrupt available, reset all
+        * RcvCtxtCtrl bits to default values.
+        */
+       hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
+                    HFI1_RCVCTRL_TIDFLOW_DIS |
+                    HFI1_RCVCTRL_INTRAVAIL_DIS |
+                    HFI1_RCVCTRL_TAILUPD_DIS |
+                    HFI1_RCVCTRL_ONE_PKT_EGR_DIS |
+                    HFI1_RCVCTRL_NO_RHQ_DROP_DIS |
+                    HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt->ctxt);
+       /* Clear the context's J_KEY */
+       hfi1_clear_ctxt_jkey(dd, uctxt->ctxt);
+       /*
+        * Reset context integrity checks to default.
+        * (writes to CSRs probably belong in chip.c)
+        */
+       write_kctxt_csr(dd, uctxt->sc->hw_context, SEND_CTXT_CHECK_ENABLE,
+                       hfi1_pkt_default_send_ctxt_mask(dd, uctxt->sc->type));
+       sc_disable(uctxt->sc);
+       uctxt->pid = 0;
+       spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+
+       /* mark the receive context slot as free again */
+       dd->rcd[uctxt->ctxt] = NULL;
+
+       hfi1_user_exp_rcv_free(fdata);
+       hfi1_clear_ctxt_pkey(dd, uctxt->ctxt);
+
+       uctxt->rcvwait_to = 0;
+       uctxt->piowait_to = 0;
+       uctxt->rcvnowait = 0;
+       uctxt->pionowait = 0;
+       uctxt->event_flags = 0;
+
+       hfi1_stats.sps_ctxts--;
+       /* last user context gone: ASPM may be turned back on */
+       if (++dd->freectxts == dd->num_user_contexts)
+               aspm_enable_all(dd);
+       mutex_unlock(&hfi1_mutex);
+       hfi1_free_ctxtdata(dd, uctxt);
+done:
+       kobject_put(&dd->kobj);
+       kfree(fdata);
+       return 0;
+}
+
+/*
+ * Convert a kernel *virtual* address to a physical address.
+ * This is used for vmalloc'ed addresses.
+ *
+ * Returns the physical address of the start of the page backing @addr,
+ * or 0 if vmalloc_to_page() finds no page for it.
+ */
+static u64 kvirt_to_phys(void *addr)
+{
+       struct page *page;
+       u64 paddr = 0;
+
+       page = vmalloc_to_page(addr);
+       if (page) {
+               /* widen before shifting so high address bits are not lost */
+               paddr = (u64)page_to_pfn(page) << PAGE_SHIFT;
+       }
+
+       return paddr;
+}
+
+/*
+ * Attach a context to the open file described by @fp.
+ *
+ * The major part of uinfo->userversion must equal HFI1_USER_SWMAJOR,
+ * otherwise -ENODEV is returned.  When sub-contexts are requested an
+ * already open matching context is looked for first; if none is found
+ * (or sharing was not requested) a new context is allocated on the unit
+ * selected by the device minor number.
+ *
+ * Returns a negative errno on failure; otherwise the non-negative result
+ * of find_shared_ctxt() or get_user_context().
+ */
+static int assign_ctxt(struct file *fp, struct hfi1_user_info *uinfo)
+{
+       int i_minor, ret = 0;
+       unsigned int swmajor;
+
+       swmajor = uinfo->userversion >> 16;
+       if (swmajor != HFI1_USER_SWMAJOR) {
+               ret = -ENODEV;
+               goto done;
+       }
+
+       mutex_lock(&hfi1_mutex);
+       /* First, lets check if we need to setup a shared context? */
+       if (uinfo->subctxt_cnt) {
+               struct hfi1_filedata *fd = fp->private_data;
+
+               ret = find_shared_ctxt(fp, uinfo);
+               if (ret < 0)
+                       goto done_unlock;
+               /* joined an existing context: pick a CPU near it */
+               if (ret)
+                       fd->rec_cpu_num = hfi1_get_proc_affinity(
+                               fd->uctxt->dd, fd->uctxt->numa_id);
+       }
+
+       /*
+        * We execute the following block if we couldn't find a
+        * shared context or if context sharing is not required.
+        */
+       if (!ret) {
+               i_minor = iminor(file_inode(fp)) - HFI1_USER_MINOR_BASE;
+               ret = get_user_context(fp, uinfo, i_minor);
+       }
+done_unlock:
+       mutex_unlock(&hfi1_mutex);
+done:
+       return ret;
+}
+
+/*
+ * Allocate a new user context on unit @devno.
+ *
+ * Returns -ENXIO when no unit is present, -ENETDOWN when no unit is up,
+ * -ENODEV when @devno does not resolve to a device, -EBUSY when that
+ * device has no free contexts, otherwise the result of allocate_ctxt().
+ */
+static int get_user_context(struct file *fp, struct hfi1_user_info *uinfo,
+                           int devno)
+{
+       struct hfi1_devdata *dd = NULL;
+       int npresent, nup;
+
+       /* only the present/up counts are of interest here */
+       hfi1_count_units(&npresent, &nup);
+       if (!npresent)
+               return -ENXIO;
+
+       if (!nup)
+               return -ENETDOWN;
+
+       dd = hfi1_lookup(devno);
+       if (!dd)
+               return -ENODEV;
+       else if (!dd->freectxts)
+               return -EBUSY;
+
+       return allocate_ctxt(fp, dd, uinfo);
+}
+
+/*
+ * Search every present unit for an open context that matches the sharing
+ * parameters in @uinfo (uuid, jkey of the current uid, subctxt_id and
+ * subctxt_cnt) and, if found, join it as the next sub-context.
+ *
+ * Returns 1 when the file was attached to an existing context, 0 when no
+ * matching context exists, or -EINVAL when a match was found but its
+ * user version differs or all of its sub-context slots are taken.
+ */
+static int find_shared_ctxt(struct file *fp,
+                           const struct hfi1_user_info *uinfo)
+{
+       struct hfi1_filedata *fd = fp->private_data;
+       int nunits = hfi1_count_units(NULL, NULL);
+       int unit, idx;
+
+       for (unit = 0; unit < nunits; unit++) {
+               struct hfi1_devdata *dd = hfi1_lookup(unit);
+
+               if (!dd || !(dd->flags & HFI1_PRESENT) || !dd->kregbase)
+                       continue;
+
+               for (idx = dd->first_user_ctxt; idx < dd->num_rcv_contexts;
+                    idx++) {
+                       struct hfi1_ctxtdata *uctxt = dd->rcd[idx];
+
+                       /* Skip ctxts which are not yet open */
+                       if (!uctxt || !uctxt->cnt)
+                               continue;
+
+                       /* Skip ctxt if it doesn't match the requested one */
+                       if (memcmp(uctxt->uuid, uinfo->uuid,
+                                  sizeof(uctxt->uuid)))
+                               continue;
+                       if (uctxt->jkey != generate_jkey(current_uid()))
+                               continue;
+                       if (uctxt->subctxt_id != uinfo->subctxt_id)
+                               continue;
+                       if (uctxt->subctxt_cnt != uinfo->subctxt_cnt)
+                               continue;
+
+                       /* Verify the sharing process matches the master */
+                       if (uctxt->userversion != uinfo->userversion ||
+                           uctxt->cnt >= uctxt->subctxt_cnt)
+                               return -EINVAL;
+
+                       /* claim the next sub-context slot for this file */
+                       fd->uctxt = uctxt;
+                       fd->subctxt  = uctxt->cnt++;
+                       uctxt->subpid[fd->subctxt] = current->pid;
+                       uctxt->active_slaves |= 1 << fd->subctxt;
+                       return 1;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Allocate and initialise a new user context on @dd for the file @fp.
+ *
+ * Picks the first free receive context slot, reserves a CPU for the
+ * process, creates the context data, allocates and enables a PIO send
+ * context and, for the master of a shared context, records the sharing
+ * parameters.  fd->uctxt is only set once everything succeeded.
+ *
+ * Because fd->uctxt stays NULL on failure, hfi1_file_close() cannot
+ * release anything acquired here, so every error path undoes its own
+ * work before returning.
+ *
+ * Returns 0 on success, -EIO while the device is frozen, -EBUSY when no
+ * slot is free, -ENOMEM on allocation failure, or the error reported by
+ * sc_enable()/init_subctxts().
+ */
+static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd,
+                        struct hfi1_user_info *uinfo)
+{
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt;
+       unsigned ctxt;
+       int ret, numa;
+
+       if (dd->flags & HFI1_FROZEN) {
+               /*
+                * Pick an error that is unique from all other errors
+                * that are returned so the user process knows that
+                * it tried to allocate while the SPC was frozen.  It
+                * should be able to retry with success in a short
+                * while.
+                */
+               return -EIO;
+       }
+
+       for (ctxt = dd->first_user_ctxt; ctxt < dd->num_rcv_contexts; ctxt++)
+               if (!dd->rcd[ctxt])
+                       break;
+
+       if (ctxt == dd->num_rcv_contexts)
+               return -EBUSY;
+
+       fd->rec_cpu_num = hfi1_get_proc_affinity(dd, -1);
+       if (fd->rec_cpu_num != -1)
+               numa = cpu_to_node(fd->rec_cpu_num);
+       else
+               numa = numa_node_id();
+       uctxt = hfi1_create_ctxtdata(dd->pport, ctxt, numa);
+       if (!uctxt) {
+               dd_dev_err(dd,
+                          "Unable to allocate ctxtdata memory, failing open\n");
+               ret = -ENOMEM;
+               goto bail_affinity;
+       }
+       hfi1_cdbg(PROC, "[%u:%u] pid %u assigned to CPU %d (NUMA %u)",
+                 uctxt->ctxt, fd->subctxt, current->pid, fd->rec_cpu_num,
+                 uctxt->numa_id);
+
+       /*
+        * Allocate and enable a PIO send context.
+        */
+       uctxt->sc = sc_alloc(dd, SC_USER, uctxt->rcvhdrqentsize,
+                            uctxt->dd->node);
+       if (!uctxt->sc) {
+               ret = -ENOMEM;
+               goto bail_ctxt;
+       }
+
+       hfi1_cdbg(PROC, "allocated send context %u(%u)\n", uctxt->sc->sw_index,
+                 uctxt->sc->hw_context);
+       ret = sc_enable(uctxt->sc);
+       if (ret)
+               goto bail_ctxt;
+       /*
+        * Setup shared context resources if the user-level has requested
+        * shared contexts and this is the 'master' process.
+        * This has to be done here so the rest of the sub-contexts find the
+        * proper master.
+        */
+       if (uinfo->subctxt_cnt && !fd->subctxt) {
+               ret = init_subctxts(uctxt, uinfo);
+               if (ret)
+                       goto bail_sc;
+       }
+       uctxt->userversion = uinfo->userversion;
+       uctxt->pid = current->pid;
+       uctxt->flags = HFI1_CAP_UGET(MASK);
+       init_waitqueue_head(&uctxt->wait);
+       strlcpy(uctxt->comm, current->comm, sizeof(uctxt->comm));
+       memcpy(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid));
+       uctxt->jkey = generate_jkey(current_uid());
+       INIT_LIST_HEAD(&uctxt->sdma_queues);
+       spin_lock_init(&uctxt->sdma_qlock);
+       hfi1_stats.sps_ctxts++;
+       /*
+        * Disable ASPM when there are open user/PSM contexts to avoid
+        * issues with ASPM L1 exit latency
+        */
+       if (dd->freectxts-- == dd->num_user_contexts)
+               aspm_disable_all(dd);
+       fd->uctxt = uctxt;
+
+       return 0;
+
+bail_sc:
+       /* the send context was enabled above; quiesce it as close does */
+       sc_disable(uctxt->sc);
+bail_ctxt:
+       /*
+        * Give the receive context slot back and free the context data,
+        * mirroring the teardown done in hfi1_file_close().
+        * NOTE(review): relies on hfi1_free_ctxtdata() also releasing
+        * uctxt->sc (and tolerating a NULL one) - confirm in init.c.
+        */
+       dd->rcd[ctxt] = NULL;
+       hfi1_free_ctxtdata(dd, uctxt);
+bail_affinity:
+       /* drop the CPU reservation taken for this process */
+       hfi1_put_proc_affinity(dd, fd->rec_cpu_num);
+       fd->rec_cpu_num = -1;
+       return ret;
+}
+
+/*
+ * Record the context sharing parameters requested by the master process
+ * and flag the context as not yet initialised by its master.
+ *
+ * Returns 0 on success or -EINVAL when more than HFI1_MAX_SHARED_CTXTS
+ * sub-contexts were requested.
+ */
+static int init_subctxts(struct hfi1_ctxtdata *uctxt,
+                        const struct hfi1_user_info *uinfo)
+{
+       unsigned requested = uinfo->subctxt_cnt;
+
+       if (requested > HFI1_MAX_SHARED_CTXTS)
+               return -EINVAL;
+
+       uctxt->subctxt_cnt = uinfo->subctxt_cnt;
+       uctxt->subctxt_id = uinfo->subctxt_id;
+       /* bit 0 of the mask is the master itself */
+       uctxt->active_slaves = 1;
+       uctxt->redirect_seq_cnt = 1;
+       set_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags);
+
+       return 0;
+}
+
+/*
+ * Allocate the three user-mappable areas shared by the sub-contexts of
+ * @uctxt: one page of user registers, a receive header queue area and an
+ * eager buffer area, the latter two sized per sub-context.
+ *
+ * On failure everything allocated here is freed and the corresponding
+ * pointers are reset to NULL, so that a later teardown of the context
+ * cannot free them a second time.
+ *
+ * Returns 0 on success or -ENOMEM.
+ */
+static int setup_subctxt(struct hfi1_ctxtdata *uctxt)
+{
+       int ret = 0;
+       unsigned num_subctxts = uctxt->subctxt_cnt;
+
+       uctxt->subctxt_uregbase = vmalloc_user(PAGE_SIZE);
+       if (!uctxt->subctxt_uregbase) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+       /* We can take the size of the RcvHdr Queue from the master */
+       uctxt->subctxt_rcvhdr_base = vmalloc_user(uctxt->rcvhdrq_size *
+                                                 num_subctxts);
+       if (!uctxt->subctxt_rcvhdr_base) {
+               ret = -ENOMEM;
+               goto bail_ureg;
+       }
+
+       uctxt->subctxt_rcvegrbuf = vmalloc_user(uctxt->egrbufs.size *
+                                               num_subctxts);
+       if (!uctxt->subctxt_rcvegrbuf) {
+               ret = -ENOMEM;
+               goto bail_rhdr;
+       }
+       goto bail;
+bail_rhdr:
+       vfree(uctxt->subctxt_rcvhdr_base);
+       /* do not leave a dangling pointer behind */
+       uctxt->subctxt_rcvhdr_base = NULL;
+bail_ureg:
+       vfree(uctxt->subctxt_uregbase);
+       uctxt->subctxt_uregbase = NULL;
+bail:
+       return ret;
+}
+
+/*
+ * Enable the receive side of an already set up context.
+ *
+ * Resets the poll counters, clears the in-memory receive header tail if
+ * there is one, programs the J_KEY, and then issues a single
+ * hfi1_rcvctrl() call whose operation mask is built from the context's
+ * capability flags.  For shared contexts the HFI1_CTXT_MASTER_UNINIT
+ * bit is cleared and waiters on uctxt->wait are woken.
+ *
+ * Returns 0 on success or -EFAULT if HFI1_CTXT_SETUP_DONE is not set.
+ */
+static int user_init(struct file *fp)
+{
+       unsigned int rcvctrl_ops = 0;
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+
+       /* make sure that the context has already been setup */
+       if (!test_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags))
+               return -EFAULT;
+
+       /* initialize poll variables... */
+       uctxt->urgent = 0;
+       uctxt->urgent_poll = 0;
+
+       /*
+        * Now enable the ctxt for receive.
+        * Some chips are set to DMA the tail register to memory
+        * when it changes (and when the update bit transitions from
+        * 0 to 1).  So for those chips, we turn it off and then back on.
+        * This will (very briefly) affect any other open ctxts, but the
+        * duration is very short, and therefore isn't an issue.  We
+        * explicitly set the in-memory tail copy to 0 beforehand, so we
+        * don't have to wait to be sure the DMA update has happened
+        * (chip resets head/tail to 0 on transition to enable).
+        */
+       if (uctxt->rcvhdrtail_kvaddr)
+               clear_rcvhdrtail(uctxt);
+
+       /* Setup J_KEY before enabling the context */
+       hfi1_set_ctxt_jkey(uctxt->dd, uctxt->ctxt, uctxt->jkey);
+
+       rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB;
+       if (HFI1_CAP_KGET_MASK(uctxt->flags, HDRSUPP))
+               rcvctrl_ops |= HFI1_RCVCTRL_TIDFLOW_ENB;
+       /*
+        * Ignore the bit in the flags for now until proper
+        * support for multiple packet per rcv array entry is
+        * added.
+        */
+       if (!HFI1_CAP_KGET_MASK(uctxt->flags, MULTI_PKT_EGR))
+               rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
+       if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_EGR_FULL))
+               rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
+       if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_RHQ_FULL))
+               rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
+       /*
+        * The RcvCtxtCtrl.TailUpd bit has to be explicitly written.
+        * We can't rely on the correct value to be set from prior
+        * uses of the chip or ctxt. Therefore, add the rcvctrl op
+        * for both cases.
+        */
+       if (HFI1_CAP_KGET_MASK(uctxt->flags, DMA_RTAIL))
+               rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB;
+       else
+               rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_DIS;
+       /* apply all collected operations in one call */
+       hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt->ctxt);
+
+       /* Notify any waiting slaves */
+       if (uctxt->subctxt_cnt) {
+               clear_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags);
+               wake_up(&uctxt->wait);
+       }
+
+       return 0;
+}
+
+/*
+ * Fill a struct hfi1_ctxt_info for the context attached to @fp and copy
+ * it to the user buffer @ubase.
+ *
+ * Returns the (non-negative) result of hfi1_get_base_kinfo() on success,
+ * its negative error code on failure, or -EFAULT when the copy to user
+ * space fails.
+ *
+ * NOTE(review): @len is not consulted; the full structure is always
+ * copied, unlike get_base_info() which clamps the copy to @len - confirm
+ * this is intended.
+ */
+static int get_ctxt_info(struct file *fp, void __user *ubase, __u32 len)
+{
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct hfi1_ctxt_info cinfo;
+       int ret;
+
+       memset(&cinfo, 0, sizeof(cinfo));
+       ret = hfi1_get_base_kinfo(uctxt, &cinfo);
+       if (ret < 0)
+               return ret;
+
+       /* identification of the unit and (sub)context */
+       cinfo.num_active = hfi1_count_active_units();
+       cinfo.unit = uctxt->dd->unit;
+       cinfo.ctxt = uctxt->ctxt;
+       cinfo.subctxt = fd->subctxt;
+
+       /* receive side resources */
+       cinfo.rcvtids = roundup(uctxt->egrbufs.alloced,
+                               uctxt->dd->rcv_entries.group_size) +
+               uctxt->expected_count;
+       cinfo.egrtids = uctxt->egrbufs.alloced;
+       cinfo.rcvhdrq_cnt = uctxt->rcvhdrq_cnt;
+       cinfo.rcvhdrq_entsize = uctxt->rcvhdrqentsize << 2;
+       cinfo.rcvegr_size = uctxt->egrbufs.rcvtid_size;
+
+       /* send side resources */
+       cinfo.credits = uctxt->sc->credits;
+       cinfo.send_ctxt = uctxt->sc->hw_context;
+       cinfo.sdma_ring_size = fd->cq->nentries;
+
+       /* placement */
+       cinfo.numa_node = uctxt->numa_id;
+       cinfo.rec_cpu = fd->rec_cpu_num;
+
+       trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, fd->subctxt, cinfo);
+       if (copy_to_user(ubase, &cinfo, sizeof(cinfo)))
+               return -EFAULT;
+
+       return ret;
+}
+
+/*
+ * Set up the resources of the context attached to @fp.
+ *
+ * A non-shared context, or the master (sub-context 0) of a shared one,
+ * initialises the send context and allocates the receive header queue,
+ * the eager buffers and, when sharing, the sub-context areas.  Any other
+ * sub-context instead sleeps until HFI1_CTXT_MASTER_UNINIT is cleared.
+ * Every caller then gets its own SDMA queues and expected receive state,
+ * after which HFI1_CTXT_SETUP_DONE is set.
+ *
+ * Returns 0 on success or the first error reported by a step; the wait
+ * is interruptible, so its error is returned as well.
+ */
+static int setup_ctxt(struct file *fp)
+{
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct hfi1_devdata *dd = uctxt->dd;
+       int ret = 0;
+
+       /*
+        * Context should be set up only once, including allocation and
+        * programming of eager buffers. This is done if context sharing
+        * is not requested or by the master process.
+        */
+       if (!uctxt->subctxt_cnt || !fd->subctxt) {
+               ret = hfi1_init_ctxt(uctxt->sc);
+               if (ret)
+                       goto done;
+
+               /* Now allocate the RcvHdr queue and eager buffers. */
+               ret = hfi1_create_rcvhdrq(dd, uctxt);
+               if (ret)
+                       goto done;
+               ret = hfi1_setup_eagerbufs(uctxt);
+               if (ret)
+                       goto done;
+               /* only the master of a shared context needs these areas */
+               if (uctxt->subctxt_cnt && !fd->subctxt) {
+                       ret = setup_subctxt(uctxt);
+                       if (ret)
+                               goto done;
+               }
+       } else {
+               /* slave: wait for the master to finish its initialisation */
+               ret = wait_event_interruptible(uctxt->wait, !test_bit(
+                                              HFI1_CTXT_MASTER_UNINIT,
+                                              &uctxt->event_flags));
+               if (ret)
+                       goto done;
+       }
+
+       ret = hfi1_user_sdma_alloc_queues(uctxt, fp);
+       if (ret)
+               goto done;
+       /*
+        * Expected receive has to be setup for all processes (including
+        * shared contexts). However, it has to be done after the master
+        * context has been fully configured as it depends on the
+        * eager/expected split of the RcvArray entries.
+        * Setting it up here ensures that the subcontexts will be waiting
+        * (due to the above wait_event_interruptible()) until the master
+        * is setup.
+        */
+       ret = hfi1_user_exp_rcv_init(fp);
+       if (ret)
+               goto done;
+
+       set_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags);
+done:
+       return ret;
+}
+
+/*
+ * Build a struct hfi1_base_info for the context attached to @fp and copy
+ * at most @len bytes of it to the user buffer @ubase.
+ *
+ * Besides a few version/key fields, every member is a token produced by
+ * HFI1_MMAP_TOKEN() from a mapping type, the context and sub-context
+ * numbers and an address or offset.
+ *
+ * Returns 0 on success or -EFAULT when the copy to user space fails.
+ */
+static int get_base_info(struct file *fp, void __user *ubase, __u32 len)
+{
+       struct hfi1_base_info binfo;
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct hfi1_devdata *dd = uctxt->dd;
+       ssize_t sz;
+       unsigned offset;
+       int ret = 0;
+
+       trace_hfi1_uctxtdata(uctxt->dd, uctxt);
+
+       /* zero first so that unset members never leak stack contents */
+       memset(&binfo, 0, sizeof(binfo));
+       binfo.hw_version = dd->revision;
+       binfo.sw_version = HFI1_KERN_SWVERSION;
+       binfo.bthqp = kdeth_qp;
+       binfo.jkey = uctxt->jkey;
+       /*
+        * If more than 64 contexts are enabled the allocated credit
+        * return will span two or three contiguous pages. Since we only
+        * map the page containing the context's credit return address,
+        * we need to calculate the offset in the proper page.
+        */
+       offset = ((u64)uctxt->sc->hw_free -
+                 (u64)dd->cr_base[uctxt->numa_id].va) % PAGE_SIZE;
+       binfo.sc_credits_addr = HFI1_MMAP_TOKEN(PIO_CRED, uctxt->ctxt,
+                                               fd->subctxt, offset);
+       binfo.pio_bufbase = HFI1_MMAP_TOKEN(PIO_BUFS, uctxt->ctxt,
+                                           fd->subctxt,
+                                           uctxt->sc->base_addr);
+       binfo.pio_bufbase_sop = HFI1_MMAP_TOKEN(PIO_BUFS_SOP,
+                                               uctxt->ctxt,
+                                               fd->subctxt,
+                                               uctxt->sc->base_addr);
+       binfo.rcvhdr_bufbase = HFI1_MMAP_TOKEN(RCV_HDRQ, uctxt->ctxt,
+                                              fd->subctxt,
+                                              uctxt->rcvhdrq);
+       binfo.rcvegr_bufbase = HFI1_MMAP_TOKEN(RCV_EGRBUF, uctxt->ctxt,
+                                              fd->subctxt,
+                                              uctxt->egrbufs.rcvtids[0].phys);
+       binfo.sdma_comp_bufbase = HFI1_MMAP_TOKEN(SDMA_COMP, uctxt->ctxt,
+                                                fd->subctxt, 0);
+       /*
+        * user regs are at
+        * (RXE_PER_CONTEXT_USER + (ctxt * RXE_PER_CONTEXT_SIZE))
+        */
+       binfo.user_regbase = HFI1_MMAP_TOKEN(UREGS, uctxt->ctxt,
+                                           fd->subctxt, 0);
+       /* offset of this sub-context's event word within its page */
+       offset = offset_in_page((((uctxt->ctxt - dd->first_user_ctxt) *
+                   HFI1_MAX_SHARED_CTXTS) + fd->subctxt) *
+                 sizeof(*dd->events));
+       binfo.events_bufbase = HFI1_MMAP_TOKEN(EVENTS, uctxt->ctxt,
+                                             fd->subctxt,
+                                             offset);
+       binfo.status_bufbase = HFI1_MMAP_TOKEN(STATUS, uctxt->ctxt,
+                                             fd->subctxt,
+                                             dd->status);
+       /* the tail token is only provided when DMA_RTAIL is enabled */
+       if (HFI1_CAP_IS_USET(DMA_RTAIL))
+               binfo.rcvhdrtail_base = HFI1_MMAP_TOKEN(RTAIL, uctxt->ctxt,
+                                                      fd->subctxt, 0);
+       /* tokens for the areas shared between sub-contexts */
+       if (uctxt->subctxt_cnt) {
+               binfo.subctxt_uregbase = HFI1_MMAP_TOKEN(SUBCTXT_UREGS,
+                                                       uctxt->ctxt,
+                                                       fd->subctxt, 0);
+               binfo.subctxt_rcvhdrbuf = HFI1_MMAP_TOKEN(SUBCTXT_RCV_HDRQ,
+                                                        uctxt->ctxt,
+                                                        fd->subctxt, 0);
+               binfo.subctxt_rcvegrbuf = HFI1_MMAP_TOKEN(SUBCTXT_EGRBUF,
+                                                        uctxt->ctxt,
+                                                        fd->subctxt, 0);
+       }
+       /* never copy more than the caller said its buffer can hold */
+       sz = (len < sizeof(binfo)) ? len : sizeof(binfo);
+       if (copy_to_user(ubase, &binfo, sz))
+               ret = -EFAULT;
+       return ret;
+}
+
+/*
+ * poll() helper for HFI1_POLL_TYPE_URGENT.
+ *
+ * Reports POLLIN | POLLRDNORM when the urgent counter has moved since the
+ * last poll and records the new value; otherwise reports nothing and sets
+ * HFI1_CTXT_WAITING_URG in the context's event flags.
+ */
+static unsigned int poll_urgent(struct file *fp,
+                               struct poll_table_struct *pt)
+{
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct hfi1_devdata *dd = uctxt->dd;
+       unsigned pollflag = 0;
+
+       poll_wait(fp, &uctxt->wait, pt);
+
+       spin_lock_irq(&dd->uctxt_lock);
+       if (uctxt->urgent == uctxt->urgent_poll) {
+               /* nothing new: note that someone is waiting */
+               set_bit(HFI1_CTXT_WAITING_URG, &uctxt->event_flags);
+       } else {
+               uctxt->urgent_poll = uctxt->urgent;
+               pollflag = POLLIN | POLLRDNORM;
+       }
+       spin_unlock_irq(&dd->uctxt_lock);
+
+       return pollflag;
+}
+
+/*
+ * poll() helper for HFI1_POLL_TYPE_ANYRCV.
+ *
+ * Reports POLLIN | POLLRDNORM when the receive header queue is not empty.
+ * When it is empty, HFI1_CTXT_WAITING_RCV is set, the "interrupt
+ * available" receive control is enabled for the context and nothing is
+ * reported.
+ */
+static unsigned int poll_next(struct file *fp,
+                             struct poll_table_struct *pt)
+{
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct hfi1_devdata *dd = uctxt->dd;
+       unsigned pollflag = 0;
+
+       poll_wait(fp, &uctxt->wait, pt);
+
+       spin_lock_irq(&dd->uctxt_lock);
+       if (!hdrqempty(uctxt)) {
+               pollflag = POLLIN | POLLRDNORM;
+       } else {
+               set_bit(HFI1_CTXT_WAITING_RCV, &uctxt->event_flags);
+               hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_ENB, uctxt->ctxt);
+       }
+       spin_unlock_irq(&dd->uctxt_lock);
+
+       return pollflag;
+}
+
+/*
+ * Set event bit @evtbit in the event word of every user context that is
+ * currently in use on the device of @ppd, including the words of all of
+ * its sub-contexts.
+ * See also find_ctxt() for a similar use, that is specific to send buffers.
+ *
+ * Returns 0, or -EINVAL when the device has no events area.
+ */
+int hfi1_set_uevent_bits(struct hfi1_pportdata *ppd, const int evtbit)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       struct hfi1_ctxtdata *uctxt;
+       unsigned long flags;
+       unsigned ctxt;
+
+       if (!dd->events)
+               return -EINVAL;
+
+       spin_lock_irqsave(&dd->uctxt_lock, flags);
+       for (ctxt = dd->first_user_ctxt; ctxt < dd->num_rcv_contexts;
+            ctxt++) {
+               unsigned long *evs;
+               int i;
+
+               uctxt = dd->rcd[ctxt];
+               if (!uctxt)
+                       continue;
+
+               evs = dd->events +
+                       (uctxt->ctxt - dd->first_user_ctxt) *
+                       HFI1_MAX_SHARED_CTXTS;
+               /*
+                * subctxt_cnt is 0 if not shared, so do base
+                * separately, first, then remaining subctxt, if any
+                */
+               set_bit(evtbit, evs);
+               for (i = 1; i < uctxt->subctxt_cnt; i++)
+                       set_bit(evtbit, evs + i);
+       }
+       spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+
+       return 0;
+}
+
+/**
+ * manage_rcvq - manage a context's receive queue
+ * @uctxt: the context
+ * @subctxt: the sub-context
+ * @start_stop: action to carry out
+ *
+ * start_stop == 0 disables receive on the context, for use in queue
+ * overflow conditions.  start_stop == 1 re-enables it, to be used to
+ * re-init the software copy of the head register.  Requests coming
+ * from a non-zero sub-context are ignored.
+ *
+ * Return: always 0.
+ */
+static int manage_rcvq(struct hfi1_ctxtdata *uctxt, unsigned subctxt,
+                      int start_stop)
+{
+       struct hfi1_devdata *dd = uctxt->dd;
+       unsigned int rcvctrl_op = HFI1_RCVCTRL_CTXT_DIS;
+
+       if (subctxt)
+               return 0;
+
+       if (start_stop) {
+               /*
+                * On enable, force in-memory copy of the tail register to
+                * 0, so that protocol code doesn't have to worry about
+                * whether or not the chip has yet updated the in-memory
+                * copy or not on return from the system call. The chip
+                * always resets its tail register back to 0 on a
+                * transition from disabled to enabled.
+                */
+               if (uctxt->rcvhdrtail_kvaddr)
+                       clear_rcvhdrtail(uctxt);
+               rcvctrl_op = HFI1_RCVCTRL_CTXT_ENB;
+       }
+
+       /* always; new head should be equal to new tail; see above */
+       hfi1_rcvctrl(dd, rcvctrl_op, uctxt->ctxt);
+
+       return 0;
+}
+
+/*
+ * Acknowledge event notifications for one (sub)context: every bit set in
+ * @events, up to _HFI1_MAX_EVENT_BIT, is cleared in that sub-context's
+ * event word.  The user process then performs whatever action the bit
+ * called for, if desired, and checks again in the future.
+ *
+ * Always returns 0, also when the device has no events area.
+ */
+static int user_event_ack(struct hfi1_ctxtdata *uctxt, int subctxt,
+                         unsigned long events)
+{
+       struct hfi1_devdata *dd = uctxt->dd;
+       unsigned long *evs;
+       int bit;
+
+       if (!dd->events)
+               return 0;
+
+       evs = dd->events + ((uctxt->ctxt - dd->first_user_ctxt) *
+                           HFI1_MAX_SHARED_CTXTS) + subctxt;
+
+       for (bit = 0; bit <= _HFI1_MAX_EVENT_BIT; bit++) {
+               if (test_bit(bit, &events))
+                       clear_bit(bit, evs);
+       }
+
+       return 0;
+}
+
+/*
+ * Program @pkey for the context, provided it is listed in the port's
+ * pkey table.
+ *
+ * Returns -EINVAL for either management pkey, -ENOENT when @pkey is not
+ * in the port's table, otherwise the result of hfi1_set_ctxt_pkey().
+ */
+static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, unsigned subctxt,
+                        u16 pkey)
+{
+       struct hfi1_pportdata *ppd = uctxt->ppd;
+       struct hfi1_devdata *dd = uctxt->dd;
+       int i;
+
+       if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY)
+               return -EINVAL;
+
+       for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) {
+               /* first match in the port table is enough */
+               if (pkey == ppd->pkeys[i])
+                       return hfi1_set_ctxt_pkey(dd, uctxt->ctxt, pkey);
+       }
+
+       return -ENOENT;
+}
+
+/* Tear down the user character device of @dd via hfi1_cdev_cleanup(). */
+static void user_remove(struct hfi1_devdata *dd)
+{
+
+       hfi1_cdev_cleanup(&dd->user_cdev, &dd->user_device);
+}
+
+/*
+ * Create the user character device of @dd, named "<class_name>_<unit>".
+ * If the creation fails, user_remove() is called before the error is
+ * passed on.
+ *
+ * Returns the result of hfi1_cdev_init().
+ */
+static int user_add(struct hfi1_devdata *dd)
+{
+       char name[10];
+       int ret;
+
+       snprintf(name, sizeof(name), "%s_%d", class_name(), dd->unit);
+       ret = hfi1_cdev_init(dd->unit, name, &hfi1_file_ops,
+                            &dd->user_cdev, &dd->user_device,
+                            true, &dd->kobj);
+       if (!ret)
+               return 0;
+
+       user_remove(dd);
+       return ret;
+}
+
+/*
+ * Create per-unit files in /dev
+ *
+ * Thin wrapper around user_add(); returns its result.
+ */
+int hfi1_device_create(struct hfi1_devdata *dd)
+{
+       return user_add(dd);
+}
+
+/*
+ * Remove per-unit files in /dev
+ * void, core kernel returns no errors for this stuff
+ *
+ * Thin wrapper around user_remove().
+ */
+void hfi1_device_remove(struct hfi1_devdata *dd)
+{
+       user_remove(dd);
+}
diff --git a/drivers/infiniband/hw/hfi1/firmware.c b/drivers/infiniband/hw/hfi1/firmware.c
new file mode 100644 (file)
index 0000000..ed680fd
--- /dev/null
@@ -0,0 +1,2056 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/firmware.h>
+#include <linux/mutex.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/crc32.h>
+
+#include "hfi.h"
+#include "trace.h"
+
+/*
+ * Make it easy to toggle firmware file name and if it gets loaded by
+ * editing the following. This may be something we do while in development
+ * but not necessarily something a user would ever need to use.
+ *
+ * The ALT_* names are the ones __obtain_firmware() switches to when it
+ * retries in the FW_TRY state.  Each fw_*_load flag below gates whether
+ * __obtain_firmware() requests (and, on retry, disposes) that image.
+ */
+#define DEFAULT_FW_8051_NAME_FPGA "hfi_dc8051.bin"
+#define DEFAULT_FW_8051_NAME_ASIC "hfi1_dc8051.fw"
+#define DEFAULT_FW_FABRIC_NAME "hfi1_fabric.fw"
+#define DEFAULT_FW_SBUS_NAME "hfi1_sbus.fw"
+#define DEFAULT_FW_PCIE_NAME "hfi1_pcie.fw"
+#define DEFAULT_PLATFORM_CONFIG_NAME "hfi1_platform.dat"
+#define ALT_FW_8051_NAME_ASIC "hfi1_dc8051_d.fw"
+#define ALT_FW_FABRIC_NAME "hfi1_fabric_d.fw"
+#define ALT_FW_SBUS_NAME "hfi1_sbus_d.fw"
+#define ALT_FW_PCIE_NAME "hfi1_pcie_d.fw"
+
+static uint fw_8051_load = 1;
+static uint fw_fabric_serdes_load = 1;
+static uint fw_pcie_serdes_load = 1;
+static uint fw_sbus_load = 1;
+
+/*
+ * Access required in platform.c
+ * Maintains state of whether the platform config was fetched via the
+ * fallback option
+ *
+ * obtain_firmware() requests the platform_config_name file only when
+ * this is non-zero.
+ */
+uint platform_config_load;
+
+/*
+ * Firmware file names get set in hfi1_firmware_init() based on the above.
+ * __obtain_firmware() overwrites the four firmware names (not the platform
+ * config name) with the ALT_* names when it retries.
+ */
+static char *fw_8051_name;
+static char *fw_fabric_serdes_name;
+static char *fw_sbus_name;
+static char *fw_pcie_serdes_name;
+static char *platform_config_name;
+
+#define SBUS_MAX_POLL_COUNT 100
+#define SBUS_COUNTER(reg, name) \
+       (((reg) >> ASIC_STS_SBUS_COUNTERS_##name##_CNT_SHIFT) & \
+        ASIC_STS_SBUS_COUNTERS_##name##_CNT_MASK)
+
+/*
+ * Firmware security header.
+ *
+ * Sits at the very start of a firmware file: obtain_one_firmware() casts
+ * the file data to this type.  verify_css_header() compares module_type,
+ * header_len, header_version, module_vendor, key_size, modulus_size and
+ * exponent_size against fixed expected values; module_id and date are
+ * only printed for debugging.
+ */
+struct css_header {
+       u32 module_type;
+       u32 header_len;
+       u32 header_version;
+       u32 module_id;
+       u32 module_vendor;
+       u32 date;               /* BCD yyyymmdd */
+       u32 size;               /* in DWORDs */
+       u32 key_size;           /* in DWORDs */
+       u32 modulus_size;       /* in DWORDs */
+       u32 exponent_size;      /* in DWORDs */
+       u32 reserved[22];
+};
+
+/*
+ * expected field values
+ *
+ * Note: verify_css_header() checks header_len against
+ * sizeof(struct firmware_file) / 4 rather than CSS_HEADER_LEN.
+ * The *_SIZE values are byte counts: they size the u8 arrays in the
+ * firmware file structures and are divided by 4 when compared with the
+ * DWORD-based CSS header fields.
+ */
+#define CSS_MODULE_TYPE           0x00000006
+#define CSS_HEADER_LEN    0x000000a1
+#define CSS_HEADER_VERSION 0x00010000
+#define CSS_MODULE_VENDOR  0x00008086
+
+#define KEY_SIZE      256
+#define MU_SIZE                8
+#define EXPONENT_SIZE  4
+
+/*
+ * the file itself
+ *
+ * Layout of a non-augmented firmware file: CSS header, modulus, exponent
+ * and signature, followed directly by the firmware payload.
+ */
+struct firmware_file {
+       struct css_header css_header;
+       u8 modulus[KEY_SIZE];
+       u8 exponent[EXPONENT_SIZE];
+       u8 signature[KEY_SIZE];
+       u8 firmware[];
+};
+
+struct augmented_firmware_file {
+       struct css_header css_header;
+       u8 modulus[KEY_SIZE];
+       u8 exponent[EXPONENT_SIZE];
+       u8 signature[KEY_SIZE];
+       u8 r2[KEY_SIZE];        /* present only in the augmented layout */
+       u8 mu[MU_SIZE];         /* present only in the augmented layout */
+       u8 firmware[];
+};
+
+/*
+ * augmented file size difference
+ *
+ * obtain_one_firmware() adds this to the size recorded in the CSS header
+ * to recognize an augmented file.
+ */
+#define AUGMENT_SIZE (sizeof(struct augmented_firmware_file) - \
+                                               sizeof(struct firmware_file))
+
+struct firmware_details {
+       /* Linux core piece, filled in by request_firmware() */
+       const struct firmware *fw;
+
+       /* the pointers below are set by obtain_one_firmware() */
+       struct css_header *css_header;
+       u8 *firmware_ptr;               /* pointer to binary data */
+       u32 firmware_len;               /* length in bytes */
+       u8 *modulus;                    /* pointer to the modulus */
+       u8 *exponent;                   /* pointer to the exponent */
+       u8 *signature;                  /* pointer to the signature */
+       u8 *r2;                         /* pointer to r2 */
+       u8 *mu;                         /* pointer to mu */
+       struct augmented_firmware_file dummy_header; /* r2/mu space for non-augmented files */
+};
+
+/*
+ * The mutex protects fw_state, fw_err, and all of the firmware_details
+ * variables.
+ */
+static DEFINE_MUTEX(fw_mutex);
+enum fw_state {
+       FW_EMPTY,       /* initial state, nothing obtained yet */
+       FW_TRY,         /* obtained, but alternate names may still be tried */
+       FW_FINAL,       /* settled, no further attempts */
+       FW_ERR          /* obtaining failed, fw_err holds the error */
+};
+
+static enum fw_state fw_state = FW_EMPTY;
+static int fw_err;
+static struct firmware_details fw_8051;
+static struct firmware_details fw_fabric;
+static struct firmware_details fw_pcie;
+static struct firmware_details fw_sbus;
+static const struct firmware *platform_config;
+
+/* flags for turn_off_spicos() */
+#define SPICO_SBUS   0x1
+#define SPICO_FABRIC 0x2
+#define ENABLE_SPICO_SMASK 0x1
+
+/* security block commands */
+#define RSA_CMD_INIT  0x1
+#define RSA_CMD_START 0x2
+
+/* security block status */
+#define RSA_STATUS_IDLE   0x0
+#define RSA_STATUS_ACTIVE 0x1
+#define RSA_STATUS_DONE   0x2
+#define RSA_STATUS_FAILED 0x3
+
+/* RSA engine timeout, in ms */
+#define RSA_ENGINE_TIMEOUT 100 /* ms */
+
+/* hardware mutex timeout, in ms */
+#define HM_TIMEOUT 10 /* ms */
+
+/*
+ * 8051 memory access timeout, in us
+ *
+ * NOTE(review): this is used as a poll-iteration limit.  write_8051()
+ * delays 1 us per poll, but __read_8051_data() delays only 10 ns per
+ * poll, so the read side gives up after far less than 100 us of delay.
+ * Confirm that this is intended.
+ */
+#define DC8051_ACCESS_TIMEOUT 100 /* us */
+
+/* the number of fabric SerDes on the SBus */
+#define NUM_FABRIC_SERDES 4
+
+/* SBus fabric SerDes addresses, one set per HFI */
+static const u8 fabric_serdes_addrs[2][NUM_FABRIC_SERDES] = {
+       { 0x01, 0x02, 0x03, 0x04 },
+       { 0x28, 0x29, 0x2a, 0x2b }
+};
+
+/* SBus PCIe SerDes addresses, one set per HFI */
+static const u8 pcie_serdes_addrs[2][NUM_PCIE_SERDES] = {
+       { 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16,
+         0x18, 0x1a, 0x1c, 0x1e, 0x20, 0x22, 0x24, 0x26 },
+       { 0x2f, 0x31, 0x33, 0x35, 0x37, 0x39, 0x3b, 0x3d,
+         0x3f, 0x41, 0x43, 0x45, 0x47, 0x49, 0x4b, 0x4d }
+};
+
+/*
+ * SBus PCIe PCS addresses, one set per HFI
+ * (not static, so the table has external linkage)
+ */
+const u8 pcie_pcs_addrs[2][NUM_PCIE_SERDES] = {
+       { 0x09, 0x0b, 0x0d, 0x0f, 0x11, 0x13, 0x15, 0x17,
+         0x19, 0x1b, 0x1d, 0x1f, 0x21, 0x23, 0x25, 0x27 },
+       { 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e,
+         0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e }
+};
+
+/* SBus fabric SerDes broadcast addresses, one per HFI */
+static const u8 fabric_serdes_broadcast[2] = { 0xe4, 0xe5 };
+static const u8 all_fabric_serdes_broadcast = 0xe1;
+
+/*
+ * SBus PCIe SerDes broadcast addresses, one per HFI
+ * (pcie_serdes_broadcast is not static, so it has external linkage)
+ */
+const u8 pcie_serdes_broadcast[2] = { 0xe2, 0xe3 };
+static const u8 all_pcie_serdes_broadcast = 0xe0;
+
+/* forwards */
+static void dispose_one_firmware(struct firmware_details *fdet);
+static int load_fabric_serdes_firmware(struct hfi1_devdata *dd,
+                                      struct firmware_details *fdet);
+
+/*
+ * Fetch one 64-bit word from 8051 data memory.
+ *
+ * The caller is expected to:
+ * o have already set up a data read without auto increment
+ * o turn off read enable once it is finished
+ *
+ * addr is a byte offset.  The hardware ignores address bits 0:2, so the
+ * read is always an aligned 8-byte read.
+ *
+ * The completion bit is polled with a 10 ns delay between polls; the
+ * routine gives up once more than DC8051_ACCESS_TIMEOUT polls have failed.
+ *
+ * Return 0 on success, -ENXIO on a read error (timeout).
+ */
+static int __read_8051_data(struct hfi1_devdata *dd, u32 addr, u64 *result)
+{
+       u64 ctrl;
+       int polls = 0;
+
+       /* kick off the read at the requested address */
+       ctrl = (addr & DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK)
+                       << DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT;
+       ctrl |= DC_DC8051_CFG_RAM_ACCESS_CTRL_READ_ENA_SMASK;
+       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, ctrl);
+
+       /* poll for ACCESS_COMPLETED */
+       for (;;) {
+               if (read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_STATUS) &
+                   DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK)
+                       break;
+
+               if (++polls > DC8051_ACCESS_TIMEOUT) {
+                       dd_dev_err(dd, "timeout reading 8051 data\n");
+                       return -ENXIO;
+               }
+               ndelay(10);
+       }
+
+       /* the access finished, collect the word */
+       *result = read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_RD_DATA);
+
+       return 0;
+}
+
+/*
+ * Read len bytes of 8051 data memory starting at byte offset addr.
+ *
+ * The data is fetched in 8-byte chunks, one u64 stored into result per
+ * chunk, so result must have room for len rounded up to a multiple of 8.
+ * The whole transfer runs under dd->dc8051_memlock with interrupts
+ * disabled.  Read enable is turned off again even when a chunk fails.
+ *
+ * Return 0 on success, -errno on error.
+ */
+int read_8051_data(struct hfi1_devdata *dd, u32 addr, u32 len, u64 *result)
+{
+       unsigned long flags;
+       u32 offset = 0;
+       int ret = 0;
+
+       spin_lock_irqsave(&dd->dc8051_memlock, flags);
+
+       /* select a data read, auto-increment off */
+       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, 0);
+
+       while (offset < len) {
+               ret = __read_8051_data(dd, addr + offset, &result[offset / 8]);
+               if (ret)
+                       break;
+               offset += 8;
+       }
+
+       /* drop read enable, on success and on failure */
+       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, 0);
+
+       spin_unlock_irqrestore(&dd->dc8051_memlock, flags);
+
+       return ret;
+}
+
+/*
+ * Write data or code to the 8051 code or data RAM.
+ *
+ * @dd:    device data
+ * @code:  non-zero sets the RAM select bit in the access set-up (code
+ *         RAM), zero leaves it clear (data RAM)
+ * @start: start address within the selected RAM
+ * @data:  bytes to write; no alignment is required
+ * @len:   number of bytes to write
+ *
+ * The data is written in 8-byte chunks using address auto-increment.  A
+ * trailing chunk shorter than 8 bytes is zero padded.  After every chunk
+ * the completion bit is polled, with a 1 us delay between polls, for at
+ * most DC8051_ACCESS_TIMEOUT polls.
+ *
+ * Write access and auto-increment are turned off again before returning,
+ * on success and on timeout alike, so the access port is never left
+ * write-enabled.
+ *
+ * Return 0 on success, -ENXIO on a write timeout.
+ */
+static int write_8051(struct hfi1_devdata *dd, int code, u32 start,
+                     const u8 *data, u32 len)
+{
+       u64 reg;
+       u32 offset;
+       int count;
+       int ret = 0;
+
+       /* write set-up */
+       reg = (code ? DC_DC8051_CFG_RAM_ACCESS_SETUP_RAM_SEL_SMASK : 0ull)
+               | DC_DC8051_CFG_RAM_ACCESS_SETUP_AUTO_INCR_ADDR_SMASK;
+       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, reg);
+
+       reg = ((start & DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK)
+                       << DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT)
+               | DC_DC8051_CFG_RAM_ACCESS_CTRL_WRITE_ENA_SMASK;
+       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, reg);
+
+       /* write */
+       for (offset = 0; offset < len; offset += 8) {
+               u32 bytes = len - offset;
+
+               if (bytes > 8)
+                       bytes = 8;
+
+               /*
+                * Copy through memcpy() so unaligned sources are safe and
+                * a short final chunk ends up zero padded.
+                */
+               reg = 0;
+               memcpy(&reg, &data[offset], bytes);
+               write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_WR_DATA, reg);
+
+               /* wait until ACCESS_COMPLETED is set */
+               count = 0;
+               while ((read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_STATUS)
+                   & DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK)
+                   == 0) {
+                       count++;
+                       if (count > DC8051_ACCESS_TIMEOUT) {
+                               dd_dev_err(dd, "timeout writing 8051 data\n");
+                               ret = -ENXIO;
+                               goto done;
+                       }
+                       udelay(1);
+               }
+       }
+
+done:
+       /* turn off write access, auto increment (also sets to data access) */
+       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, 0);
+       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, 0);
+
+       return ret;
+}
+
+/*
+ * Compare one firmware header field with its expected value.
+ *
+ * Returns 0 when they are equal.  Otherwise logs the field name together
+ * with the expected and actual values and returns 1.
+ */
+static int invalid_header(struct hfi1_devdata *dd, const char *what,
+                         u32 actual, u32 expected)
+{
+       int mismatch = (actual != expected);
+
+       if (mismatch)
+               dd_dev_err(dd,
+                          "invalid firmware header field %s: expected 0x%x, actual 0x%x\n",
+                          what, expected, actual);
+
+       return mismatch;
+}
+
+/*
+ * Check the fixed fields of a CSS header against their expected values.
+ *
+ * The fields are checked in order and checking stops at the first
+ * mismatch, which invalid_header() reports.  Most sizes in the header are
+ * in DWORDs, hence the division of the byte sizes by 4.
+ *
+ * Returns 0 if every field matches, -EINVAL otherwise.
+ */
+static int verify_css_header(struct hfi1_devdata *dd, struct css_header *css)
+{
+       if (invalid_header(dd, "module_type", css->module_type,
+                          CSS_MODULE_TYPE))
+               return -EINVAL;
+
+       if (invalid_header(dd, "header_len", css->header_len,
+                          (sizeof(struct firmware_file) / 4)))
+               return -EINVAL;
+
+       if (invalid_header(dd, "header_version", css->header_version,
+                          CSS_HEADER_VERSION))
+               return -EINVAL;
+
+       if (invalid_header(dd, "module_vendor", css->module_vendor,
+                          CSS_MODULE_VENDOR))
+               return -EINVAL;
+
+       if (invalid_header(dd, "key_size", css->key_size, KEY_SIZE / 4))
+               return -EINVAL;
+
+       if (invalid_header(dd, "modulus_size", css->modulus_size,
+                          KEY_SIZE / 4))
+               return -EINVAL;
+
+       if (invalid_header(dd, "exponent_size", css->exponent_size,
+                          EXPONENT_SIZE / 4))
+               return -EINVAL;
+
+       return 0;
+}
+
+/*
+ * Check that the file holds at least one byte beyond its prefix.
+ *
+ * Returns 0 when file_size is larger than prefix_size.  Otherwise logs
+ * the firmware name and both sizes and returns -EINVAL.
+ */
+static int payload_check(struct hfi1_devdata *dd, const char *name,
+                        long file_size, long prefix_size)
+{
+       if (file_size > prefix_size)
+               return 0;
+
+       /* nothing follows the prefix */
+       dd_dev_err(dd,
+                  "firmware \"%s\", size %ld, must be larger than %ld bytes\n",
+                  name, file_size, prefix_size);
+       return -EINVAL;
+}
+
+/*
+ * Request the firmware from the system.  Extract the pieces and fill in
+ * fdet.  If successful, the caller will need to call dispose_one_firmware().
+ * Returns 0 on success, -ERRNO on error.
+ *
+ * fdet is zeroed first, so any previous contents are lost without being
+ * released.  On success the pointers in fdet refer to the data held by
+ * fdet->fw.  On any failure after the request succeeded, fdet is disposed
+ * here before returning.
+ *
+ * Only an augmented file (one carrying r2 and mu) can succeed: a file
+ * whose CSS size matches the plain layout is parsed but then rejected
+ * with -EINVAL.
+ */
+static int obtain_one_firmware(struct hfi1_devdata *dd, const char *name,
+                              struct firmware_details *fdet)
+{
+       struct css_header *css;
+       int ret;
+
+       memset(fdet, 0, sizeof(*fdet));
+
+       ret = request_firmware(&fdet->fw, name, &dd->pcidev->dev);
+       if (ret) {
+               dd_dev_warn(dd, "cannot find firmware \"%s\", err %d\n",
+                           name, ret);
+               return ret;
+       }
+
+       /* verify the firmware */
+       if (fdet->fw->size < sizeof(struct css_header)) {
+               dd_dev_err(dd, "firmware \"%s\" is too small\n", name);
+               ret = -EINVAL;
+               goto done;
+       }
+       css = (struct css_header *)fdet->fw->data;
+
+       hfi1_cdbg(FIRMWARE, "Firmware %s details:", name);
+       hfi1_cdbg(FIRMWARE, "file size: 0x%lx bytes", fdet->fw->size);
+       hfi1_cdbg(FIRMWARE, "CSS structure:");
+       hfi1_cdbg(FIRMWARE, "  module_type    0x%x", css->module_type);
+       hfi1_cdbg(FIRMWARE, "  header_len     0x%03x (0x%03x bytes)",
+                 css->header_len, 4 * css->header_len);
+       hfi1_cdbg(FIRMWARE, "  header_version 0x%x", css->header_version);
+       hfi1_cdbg(FIRMWARE, "  module_id      0x%x", css->module_id);
+       hfi1_cdbg(FIRMWARE, "  module_vendor  0x%x", css->module_vendor);
+       hfi1_cdbg(FIRMWARE, "  date           0x%x", css->date);
+       hfi1_cdbg(FIRMWARE, "  size           0x%03x (0x%03x bytes)",
+                 css->size, 4 * css->size);
+       hfi1_cdbg(FIRMWARE, "  key_size       0x%03x (0x%03x bytes)",
+                 css->key_size, 4 * css->key_size);
+       hfi1_cdbg(FIRMWARE, "  modulus_size   0x%03x (0x%03x bytes)",
+                 css->modulus_size, 4 * css->modulus_size);
+       hfi1_cdbg(FIRMWARE, "  exponent_size  0x%03x (0x%03x bytes)",
+                 css->exponent_size, 4 * css->exponent_size);
+       hfi1_cdbg(FIRMWARE, "firmware size: 0x%lx bytes",
+                 fdet->fw->size - sizeof(struct firmware_file));
+
+       /*
+        * If the file does not have a valid CSS header, fail.
+        * Otherwise, check the CSS size field for an expected size.
+        * The augmented file has r2 and mu inserted after the header
+        * was generated, so there will be a known difference between
+        * the CSS header size and the actual file size.  Use this
+        * difference to identify an augmented file.
+        *
+        * Note: css->size is in DWORDs, multiply by 4 to get bytes.
+        */
+       ret = verify_css_header(dd, css);
+       if (ret) {
+               dd_dev_info(dd, "Invalid CSS header for \"%s\"\n", name);
+       } else if ((css->size * 4) == fdet->fw->size) {
+               /* non-augmented firmware file */
+               struct firmware_file *ff = (struct firmware_file *)
+                                                       fdet->fw->data;
+
+               /* make sure there are bytes in the payload */
+               ret = payload_check(dd, name, fdet->fw->size,
+                                   sizeof(struct firmware_file));
+               if (ret == 0) {
+                       fdet->css_header = css;
+                       fdet->modulus = ff->modulus;
+                       fdet->exponent = ff->exponent;
+                       fdet->signature = ff->signature;
+                       fdet->r2 = fdet->dummy_header.r2; /* use dummy space */
+                       fdet->mu = fdet->dummy_header.mu; /* use dummy space */
+                       fdet->firmware_ptr = ff->firmware;
+                       fdet->firmware_len = fdet->fw->size -
+                                               sizeof(struct firmware_file);
+                       /*
+                        * Header does not include r2 and mu - generate here.
+                        * For now, fail.
+                        */
+                       dd_dev_err(dd, "driver is unable to validate firmware without r2 and mu (not in firmware file)\n");
+                       ret = -EINVAL;
+               }
+       } else if ((css->size * 4) + AUGMENT_SIZE == fdet->fw->size) {
+               /* augmented firmware file */
+               struct augmented_firmware_file *aff =
+                       (struct augmented_firmware_file *)fdet->fw->data;
+
+               /* make sure there are bytes in the payload */
+               ret = payload_check(dd, name, fdet->fw->size,
+                                   sizeof(struct augmented_firmware_file));
+               if (ret == 0) {
+                       fdet->css_header = css;
+                       fdet->modulus = aff->modulus;
+                       fdet->exponent = aff->exponent;
+                       fdet->signature = aff->signature;
+                       fdet->r2 = aff->r2;
+                       fdet->mu = aff->mu;
+                       fdet->firmware_ptr = aff->firmware;
+                       fdet->firmware_len = fdet->fw->size -
+                                       sizeof(struct augmented_firmware_file);
+               }
+       } else {
+               /* css->size check failed */
+               dd_dev_err(dd,
+                          "invalid firmware header field size: expected 0x%lx or 0x%lx, actual 0x%x\n",
+                          fdet->fw->size / 4,
+                          (fdet->fw->size - AUGMENT_SIZE) / 4,
+                          css->size);
+
+               ret = -EINVAL;
+       }
+
+done:
+       /* if returning an error, clean up after ourselves */
+       if (ret)
+               dispose_one_firmware(fdet);
+       return ret;
+}
+
+/*
+ * Release the firmware image held by fdet, if any, and reset fdet to all
+ * zeroes so that no stale pointers into the released image remain.
+ */
+static void dispose_one_firmware(struct firmware_details *fdet)
+{
+       const struct firmware *image = fdet->fw;
+
+       /* forget everything first, then hand the image back */
+       memset(fdet, 0, sizeof(*fdet));
+       release_firmware(image);
+}
+
+/*
+ * Obtain the 4 firmwares from the OS.  All must be obtained at once or not
+ * at all.  If called with the firmware state in FW_TRY, use alternate names.
+ * On exit, this routine will have set the firmware state to one of FW_TRY,
+ * FW_FINAL, or FW_ERR.
+ *
+ * The images are requested in the order sbus, pcie, fabric, 8051, each
+ * only if its fw_*_load flag is set, stopping at the first failure.
+ *
+ * State changes made here:
+ *   FW_EMPTY, success: FW_TRY, or FW_FINAL on the functional simulator
+ *   FW_EMPTY, failure: on RTL silicon retry at once with the alternate
+ *                      names (as FW_TRY), otherwise FW_ERR
+ *   FW_TRY, success:   FW_FINAL
+ *   FW_TRY, failure:   FW_ERR
+ * Entering FW_ERR here always sets fw_err to -ENOENT.
+ *
+ * Must be holding fw_mutex.
+ */
+static void __obtain_firmware(struct hfi1_devdata *dd)
+{
+       int err = 0;
+
+       if (fw_state == FW_FINAL)       /* nothing more to obtain */
+               return;
+       if (fw_state == FW_ERR)         /* already in error */
+               return;
+
+       /* fw_state is FW_EMPTY or FW_TRY */
+retry:
+       if (fw_state == FW_TRY) {
+               /*
+                * We tried the original and it failed.  Move to the
+                * alternate.
+                */
+               dd_dev_warn(dd, "using alternate firmware names\n");
+               /*
+                * Let others run.  Some systems, when firmware is missing,
+                * do something that holds for 30 seconds.  If we do that
+                * twice in a row it triggers a task blocked warning.
+                */
+               cond_resched();
+               if (fw_8051_load)
+                       dispose_one_firmware(&fw_8051);
+               if (fw_fabric_serdes_load)
+                       dispose_one_firmware(&fw_fabric);
+               if (fw_sbus_load)
+                       dispose_one_firmware(&fw_sbus);
+               if (fw_pcie_serdes_load)
+                       dispose_one_firmware(&fw_pcie);
+               fw_8051_name = ALT_FW_8051_NAME_ASIC;
+               fw_fabric_serdes_name = ALT_FW_FABRIC_NAME;
+               fw_sbus_name = ALT_FW_SBUS_NAME;
+               fw_pcie_serdes_name = ALT_FW_PCIE_NAME;
+       }
+
+       if (fw_sbus_load) {
+               err = obtain_one_firmware(dd, fw_sbus_name, &fw_sbus);
+               if (err)
+                       goto done;
+       }
+
+       if (fw_pcie_serdes_load) {
+               err = obtain_one_firmware(dd, fw_pcie_serdes_name, &fw_pcie);
+               if (err)
+                       goto done;
+       }
+
+       if (fw_fabric_serdes_load) {
+               err = obtain_one_firmware(dd, fw_fabric_serdes_name,
+                                         &fw_fabric);
+               if (err)
+                       goto done;
+       }
+
+       if (fw_8051_load) {
+               err = obtain_one_firmware(dd, fw_8051_name, &fw_8051);
+               if (err)
+                       goto done;
+       }
+
+done:
+       if (err) {
+               /* oops, had problems obtaining a firmware */
+               if (fw_state == FW_EMPTY && dd->icode == ICODE_RTL_SILICON) {
+                       /* retry with alternate (RTL only) */
+                       fw_state = FW_TRY;
+                       goto retry;
+               }
+               dd_dev_err(dd, "unable to obtain working firmware\n");
+               fw_state = FW_ERR;
+               fw_err = -ENOENT;
+       } else {
+               /* success */
+               if (fw_state == FW_EMPTY &&
+                   dd->icode != ICODE_FUNCTIONAL_SIMULATOR)
+                       fw_state = FW_TRY;      /* may retry later */
+               else
+                       fw_state = FW_FINAL;    /* cannot try again */
+       }
+}
+
+/*
+ * Called by all HFIs when loading their firmware - i.e. device probe time.
+ * The first one will do the actual firmware load.  Use a mutex to resolve
+ * any possible race condition.
+ *
+ * The call to this routine cannot be moved to driver load because the kernel
+ * call request_firmware() requires a device which is only available after
+ * the first device probe.
+ *
+ * If the state is FW_TRY on entry, another device is still deciding which
+ * firmware works.  Wait for it, polling every 20 ms with the mutex
+ * dropped, for up to 40 s.  On timeout the state becomes FW_ERR and fw_err
+ * is set to -ETIMEDOUT.
+ *
+ * Returns fw_err.  A failure to obtain the platform configuration file is
+ * not reflected in the return value; platform_config is simply left NULL.
+ */
+static int obtain_firmware(struct hfi1_devdata *dd)
+{
+       unsigned long timeout;
+       int err = 0;
+
+       mutex_lock(&fw_mutex);
+
+       /* 40s delay due to long delay on missing firmware on some systems */
+       timeout = jiffies + msecs_to_jiffies(40000);
+       while (fw_state == FW_TRY) {
+               /*
+                * Another device is trying the firmware.  Wait until it
+                * decides what works (or not).
+                */
+               if (time_after(jiffies, timeout)) {
+                       /* waited too long */
+                       dd_dev_err(dd, "Timeout waiting for firmware try\n");
+                       fw_state = FW_ERR;
+                       fw_err = -ETIMEDOUT;
+                       break;
+               }
+               mutex_unlock(&fw_mutex);
+               msleep(20);     /* arbitrary delay */
+               mutex_lock(&fw_mutex);
+       }
+       /* not in FW_TRY state */
+
+       if (fw_state == FW_FINAL) {
+               /* already acquired, share the platform config if present */
+               if (platform_config) {
+                       dd->platform_config.data = platform_config->data;
+                       dd->platform_config.size = platform_config->size;
+               }
+               goto done;      /* already acquired */
+       } else if (fw_state == FW_ERR) {
+               goto done;      /* already tried and failed */
+       }
+       /* fw_state is FW_EMPTY */
+
+       /* set fw_state to FW_TRY, FW_FINAL, or FW_ERR, and fw_err */
+       __obtain_firmware(dd);
+
+       if (platform_config_load) {
+               platform_config = NULL;
+               err = request_firmware(&platform_config, platform_config_name,
+                                      &dd->pcidev->dev);
+               if (err) {
+                       /* not fatal: only fw_err is reported to the caller */
+                       platform_config = NULL;
+                       goto done;
+               }
+               dd->platform_config.data = platform_config->data;
+               dd->platform_config.size = platform_config->size;
+       }
+
+done:
+       mutex_unlock(&fw_mutex);
+
+       return fw_err;
+}
+
+/*
+ * Release every firmware image and the platform configuration.
+ *
+ * Called when the driver unloads, which is deliberately asymmetric with
+ * obtain_firmware().  Doing this at device remove time would allow another
+ * device to probe while the firmware is being disposed.  The mutexes could
+ * be moved to make that safe, but the firmware would then be requested
+ * from the OS multiple times.
+ *
+ * No mutex is taken: the driver is unloading, so there cannot be any
+ * other callers.
+ */
+void dispose_firmware(void)
+{
+       struct firmware_details *images[] = {
+               &fw_8051, &fw_fabric, &fw_pcie, &fw_sbus
+       };
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(images); i++)
+               dispose_one_firmware(images[i]);
+
+       release_firmware(platform_config);
+       platform_config = NULL;
+
+       /* an error state is sticky, anything else goes back to empty */
+       if (fw_state != FW_ERR)
+               fw_state = FW_EMPTY;
+}
+
+/*
+ * Called with the result of a firmware download.
+ *
+ * Only the FW_TRY state is acted upon.  A successful load moves the state
+ * to FW_FINAL, so all other devices use the same firmware.  A failed load
+ * makes __obtain_firmware() fetch the alternate firmware, and a retry is
+ * requested only if that leaves the state at FW_FINAL.  In FW_FINAL or
+ * FW_ERR there is never a retry.
+ *
+ * Return 1 to retry loading the firmware, 0 to stop.
+ */
+static int retry_firmware(struct hfi1_devdata *dd, int load_result)
+{
+       int retry = 0;
+
+       mutex_lock(&fw_mutex);
+
+       if (fw_state == FW_TRY) {
+               if (load_result == 0) {
+                       /* it worked, lock the choice in */
+                       fw_state = FW_FINAL;
+               } else {
+                       /* load failed, obtain alternate firmware */
+                       __obtain_firmware(dd);
+                       retry = (fw_state == FW_FINAL);
+               }
+       }
+
+       mutex_unlock(&fw_mutex);
+       return retry;
+}
+
+/*
+ * Write a block of data to a given array CSR, one 64-bit word per array
+ * element: word i goes to offset what + 8 * i.  All calls will be in
+ * multiples of 8 bytes; any remainder of nbytes / 8 is not written.
+ *
+ * data needs no particular alignment, each word is copied out with
+ * memcpy() before being written.
+ */
+static void write_rsa_data(struct hfi1_devdata *dd, int what,
+                          const u8 *data, int nbytes)
+{
+       int nqwords = nbytes / 8;
+       int qw;
+
+       for (qw = 0; qw < nqwords; qw++) {
+               u64 value;
+
+               memcpy(&value, data + (8 * qw), sizeof(value));
+               write_csr(dd, what + (8 * qw), value);
+       }
+}
+
+/*
+ * Write a block of data to a given CSR as a stream of writes: every
+ * 64-bit word of data is written, in order, to the same CSR.  All calls
+ * will be in multiples of 8 bytes; any remainder of nbytes / 8 is not
+ * written.
+ *
+ * data needs no particular alignment: each word is copied out with
+ * memcpy(), as write_rsa_data() does for unaligned input, so a misaligned
+ * pointer is never dereferenced as a u64.
+ */
+static void write_streamed_rsa_data(struct hfi1_devdata *dd, int what,
+                                   const u8 *data, int nbytes)
+{
+       int qw_size = nbytes / 8;
+
+       for (; qw_size > 0; qw_size--, data += 8) {
+               u64 value;
+
+               memcpy(&value, data, sizeof(value));
+               write_csr(dd, what, value);
+       }
+}
+
+/*
+ * Download the signature and start the RSA mechanism.  Wait for
+ * RSA_ENGINE_TIMEOUT before giving up.
+ */
+static int run_rsa(struct hfi1_devdata *dd, const char *who,
+                  const u8 *signature)
+{
+       unsigned long timeout;
+       u64 reg;
+       u32 status;
+       int ret = 0;
+
+       /* write the signature */
+       write_rsa_data(dd, MISC_CFG_RSA_SIGNATURE, signature, KEY_SIZE);
+
+       /* initialize RSA */
+       write_csr(dd, MISC_CFG_RSA_CMD, RSA_CMD_INIT);
+
+       /*
+        * Make sure the engine is idle and insert a delay between the two
+        * writes to MISC_CFG_RSA_CMD.
+        */
+       status = (read_csr(dd, MISC_CFG_FW_CTRL)
+                          & MISC_CFG_FW_CTRL_RSA_STATUS_SMASK)
+                            >> MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT;
+       if (status != RSA_STATUS_IDLE) {
+               dd_dev_err(dd, "%s security engine not idle - giving up\n",
+                          who);
+               return -EBUSY;
+       }
+
+       /* start RSA */
+       write_csr(dd, MISC_CFG_RSA_CMD, RSA_CMD_START);
+
+       /*
+        * Look for the result.
+        *
+        * The RSA engine is hooked up to two MISC errors.  The driver
+        * masks these errors as they do not respond to the standard
+        * error "clear down" mechanism.  Look for these errors here and
+        * clear them when possible.  This routine will exit with the
+        * errors of the current run still set.
+        *
+        * MISC_FW_AUTH_FAILED_ERR
+        *      Firmware authorization failed.  This can be cleared by
+        *      re-initializing the RSA engine, then clearing the status bit.
+        *      Do not re-init the RSA engine immediately after a successful
+        *      run - this will reset the current authorization.
+        *
+        * MISC_KEY_MISMATCH_ERR
+        *      Key does not match.  The only way to clear this is to load
+        *      a matching key then clear the status bit.  If this error
+        *      is raised, it will persist outside of this routine until a
+        *      matching key is loaded.
+        */
+       timeout = msecs_to_jiffies(RSA_ENGINE_TIMEOUT) + jiffies;
+       while (1) {
+               status = (read_csr(dd, MISC_CFG_FW_CTRL)
+                          & MISC_CFG_FW_CTRL_RSA_STATUS_SMASK)
+                            >> MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT;
+
+               if (status == RSA_STATUS_IDLE) {
+                       /* should not happen */
+                       dd_dev_err(dd, "%s firmware security bad idle state\n",
+                                  who);
+                       ret = -EINVAL;
+                       break;
+               } else if (status == RSA_STATUS_DONE) {
+                       /* finished successfully */
+                       break;
+               } else if (status == RSA_STATUS_FAILED) {
+                       /* finished unsuccessfully */
+                       ret = -EINVAL;
+                       break;
+               }
+               /* else still active */
+
+               if (time_after(jiffies, timeout)) {
+                       /*
+                        * Timed out while active.  We can't reset the engine
+                        * if it is stuck active, but run through the
+                        * error code to see what error bits are set.
+                        */
+                       dd_dev_err(dd, "%s firmware security time out\n", who);
+                       ret = -ETIMEDOUT;
+                       break;
+               }
+
+               msleep(20);
+       }
+
+       /*
+        * Arrive here on success or failure.  Clear all RSA engine
+        * errors.  All current errors will stick - the RSA logic is keeping
+        * error high.  All previous errors will clear - the RSA logic
+        * is not keeping the error high.
+        */
+       write_csr(dd, MISC_ERR_CLEAR,
+                 MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK |
+                 MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK);
+       /*
+        * All that is left are the current errors.  Print warnings on
+        * authorization failure details, if any.  Firmware authorization
+        * can be retried, so these are only warnings.
+        */
+       reg = read_csr(dd, MISC_ERR_STATUS);
+       if (ret) {
+               if (reg & MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK)
+                       dd_dev_warn(dd, "%s firmware authorization failed\n",
+                                   who);
+               if (reg & MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK)
+                       dd_dev_warn(dd, "%s firmware key mismatch\n", who);
+       }
+
+       return ret;
+}
+
+/*
+ * Write the security variables of a firmware image to the chip: the
+ * modulus, r2 and mu values, followed by the CSS header, which goes
+ * through the streamed write helper.
+ */
+static void load_security_variables(struct hfi1_devdata *dd,
+                                   struct firmware_details *fdet)
+{
+       u8 *header = (u8 *)fdet->css_header;
+
+       /* a. modulus */
+       write_rsa_data(dd, MISC_CFG_RSA_MODULUS, fdet->modulus, KEY_SIZE);
+       /* b. r2 */
+       write_rsa_data(dd, MISC_CFG_RSA_R2, fdet->r2, KEY_SIZE);
+       /* c. mu */
+       write_rsa_data(dd, MISC_CFG_RSA_MU, fdet->mu, MU_SIZE);
+       /* d. header, written to the SHA preload register */
+       write_streamed_rsa_data(dd, MISC_CFG_SHA_PRELOAD, header,
+                               sizeof(struct css_header));
+}
+
+/*
+ * Read DC_DC8051_STS_CUR_STATE and return its firmware state field.
+ */
+static inline u32 get_firmware_state(struct hfi1_devdata *dd)
+{
+       u64 cur_state = read_csr(dd, DC_DC8051_STS_CUR_STATE);
+
+       /* move the field down to bit 0, then strip everything above it */
+       cur_state >>= DC_DC8051_STS_CUR_STATE_FIRMWARE_SHIFT;
+       cur_state &= DC_DC8051_STS_CUR_STATE_FIRMWARE_MASK;
+
+       return cur_state;
+}
+
+/*
+ * Poll the 8051 firmware state until it reports ready (0xa0) to take host
+ * requests, sleeping roughly 2ms between polls.
+ *
+ * Return 0 on success, -ETIMEDOUT once mstimeout milliseconds have passed
+ * without the firmware becoming ready.
+ */
+int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout)
+{
+       unsigned long deadline;
+
+       /* in the simulator, the fake 8051 is always ready */
+       if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
+               return 0;
+
+       deadline = jiffies + msecs_to_jiffies(mstimeout);
+       while (get_firmware_state(dd) != 0xa0) {
+               /* not ready yet - give up once the deadline has passed */
+               if (time_after(jiffies, deadline))
+                       return -ETIMEDOUT;
+               usleep_range(1950, 2050); /* sleep 2ms-ish */
+       }
+
+       return 0;
+}
+
+/*
+ * Load the 8051 firmware.
+ *
+ * Walks the DC reset sequence: put the 8051 in reset, download the image
+ * while only the core is held in reset, validate it with the RSA engine,
+ * release the 8051 and wait for it to report ready.
+ *
+ * Return 0 on success, otherwise the error from write_8051(), run_rsa(),
+ * or -ETIMEDOUT if the firmware never becomes ready.
+ */
+static int load_8051_firmware(struct hfi1_devdata *dd,
+                             struct firmware_details *fdet)
+{
+       u8 va, vb;
+       int rc;
+
+       /* DC reset step 1: reset the DC8051 - assert every reset bit */
+       write_csr(dd, DC_DC8051_CFG_RST,
+                 DC_DC8051_CFG_RST_M8051W_SMASK |
+                 DC_DC8051_CFG_RST_CRAM_SMASK |
+                 DC_DC8051_CFG_RST_DRAM_SMASK |
+                 DC_DC8051_CFG_RST_IRAM_SMASK |
+                 DC_DC8051_CFG_RST_SFR_SMASK);
+
+       /*
+        * DC reset step 2 (optional): load 8051 data memory with link
+        * configuration.  Nothing is done for this step here.
+        */
+
+       /*
+        * DC reset step 3: load the DC8051 firmware.  Start by releasing
+        * all but the core reset.
+        */
+       write_csr(dd, DC_DC8051_CFG_RST, DC_DC8051_CFG_RST_M8051W_SMASK);
+
+       /* Firmware load step 1 */
+       load_security_variables(dd, fdet);
+
+       /* Firmware load step 2: clear MISC_CFG_FW_CTRL.FW_8051_LOADED */
+       write_csr(dd, MISC_CFG_FW_CTRL, 0);
+
+       /* Firmware load steps 3-5: write the image into code memory */
+       rc = write_8051(dd, 1/*code*/, 0, fdet->firmware_ptr,
+                       fdet->firmware_len);
+       if (rc)
+               return rc;
+
+       /*
+        * DC reset step 4: host starts the DC8051 firmware.
+        * Firmware load step 6: set MISC_CFG_FW_CTRL.FW_8051_LOADED.
+        */
+       write_csr(dd, MISC_CFG_FW_CTRL, MISC_CFG_FW_CTRL_FW_8051_LOADED_SMASK);
+
+       /* Firmware load steps 7-10: validate the image */
+       rc = run_rsa(dd, "8051", fdet->signature);
+       if (rc)
+               return rc;
+
+       /* clear all reset bits, releasing the 8051 */
+       write_csr(dd, DC_DC8051_CFG_RST, 0ull);
+
+       /*
+        * DC reset step 5: wait for the firmware to be ready to accept
+        * host requests.
+        */
+       rc = wait_fm_ready(dd, TIMEOUT_8051_START);
+       if (rc) {
+               /* timed out */
+               dd_dev_err(dd, "8051 start timeout, current state 0x%x\n",
+                          get_firmware_state(dd));
+               return -ETIMEDOUT;
+       }
+
+       /* report and remember the version of the running firmware */
+       read_misc_status(dd, &va, &vb);
+       dd_dev_info(dd, "8051 firmware version %d.%d\n",
+                   (int)vb, (int)va);
+       dd->dc8051_ver = dc8051_ver(vb, va);
+
+       return 0;
+}
+
+/*
+ * Compose an SBus request from its four fields and write it to the
+ * ASIC_CFG_SBUS_REQUEST register.
+ *
+ * The fields are not masked: each argument type is exactly as wide as
+ * the field it fills.
+ */
+void sbus_request(struct hfi1_devdata *dd,
+                 u8 receiver_addr, u8 data_addr, u8 command, u32 data_in)
+{
+       u64 request;
+
+       request = (u64)data_in << ASIC_CFG_SBUS_REQUEST_DATA_IN_SHIFT;
+       request |= (u64)command << ASIC_CFG_SBUS_REQUEST_COMMAND_SHIFT;
+       request |= (u64)data_addr << ASIC_CFG_SBUS_REQUEST_DATA_ADDR_SHIFT;
+       request |= (u64)receiver_addr <<
+                       ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT;
+
+       write_csr(dd, ASIC_CFG_SBUS_REQUEST, request);
+}
+
+/*
+ * Turn off the SBus and/or fabric serdes spicos, as selected by the
+ * SPICO_SBUS and SPICO_FABRIC bits in flags.
+ *
+ * Requirements on the caller:
+ * + SBus fast mode must be turned on.
+ * + The fabric serdes broadcast must already be set up.
+ * + The 8051 must not be loaded yet - MISC_CFG_FW_CTRL is used here on
+ *   the assumption that the 8051 is not loaded.
+ */
+static void turn_off_spicos(struct hfi1_devdata *dd, int flags)
+{
+       const char *sbus_tag = "";
+       const char *fabric_tag = "";
+
+       /* only needed on A0 */
+       if (!is_ax(dd))
+               return;
+
+       if (flags & SPICO_SBUS)
+               sbus_tag = " SBus";
+       if (flags & SPICO_FABRIC)
+               fabric_tag = " fabric";
+       dd_dev_info(dd, "Turning off spicos:%s%s\n", sbus_tag, fabric_tag);
+
+       /* MISC_CFG_FW_CTRL is set for the duration of the requests */
+       write_csr(dd, MISC_CFG_FW_CTRL, ENABLE_SPICO_SMASK);
+
+       if (flags & SPICO_SBUS) {
+               /* disable SBus spico */
+               sbus_request(dd, SBUS_MASTER_BROADCAST, 0x01,
+                            WRITE_SBUS_RECEIVER, 0x00000040);
+       }
+
+       if (flags & SPICO_FABRIC) {
+               /* disable the fabric serdes spicos */
+               sbus_request(dd, fabric_serdes_broadcast[dd->hfi1_id],
+                            0x07, WRITE_SBUS_RECEIVER, 0x00000000);
+       }
+
+       write_csr(dd, MISC_CFG_FW_CTRL, 0);
+}
+
+/*
+ * Reset all of the fabric serdes for this HFI in preparation to take the
+ * link to Polling.
+ *
+ * A reset requires writing to the serdes registers.  Unfortunately, the
+ * fabric serdes download to the other HFI on the ASIC will have turned
+ * off the firmware validation on this HFI, so those register writes are
+ * not possible.  The workaround is a complete re-download and validation
+ * of the fabric serdes firmware which, as a by-product, resets the serdes.
+ * NOTE: the re-download requires that the 8051 be in the Offline state,
+ * i.e. not actively trying to use the serdes.  This routine is called at
+ * the point where the link is Offline and is getting ready to go to
+ * Polling.
+ *
+ * Does nothing when fabric serdes firmware loading is disabled.  If the
+ * SBus resource cannot be acquired an error is logged and the reset is
+ * skipped.
+ */
+void fabric_serdes_reset(struct hfi1_devdata *dd)
+{
+       if (!fw_fabric_serdes_load)
+               return;
+
+       if (acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT)) {
+               dd_dev_err(dd,
+                          "Cannot acquire SBus resource to reset fabric SerDes - perhaps you should reboot\n");
+               return;
+       }
+       set_sbus_fast_mode(dd);
+
+       if (!is_ax(dd)) {
+               turn_off_spicos(dd, SPICO_FABRIC);
+               /*
+                * No firmware retry is needed - what to download has
+                * already been decided.
+                * The load return is deliberately ignored - the only
+                * failure is a validation failure, which has already been
+                * checked by the initial download.
+                */
+               (void)load_fabric_serdes_firmware(dd, &fw_fabric);
+       } else {
+               /* A0 serdes do not work with a re-download */
+               u8 bcast = fabric_serdes_broadcast[dd->hfi1_id];
+
+               /* place SerDes in reset and disable SPICO */
+               sbus_request(dd, bcast, 0x07, WRITE_SBUS_RECEIVER, 0x00000011);
+               /* wait 100 refclk cycles @ 156.25MHz => 640ns */
+               udelay(1);
+               /* remove SerDes reset */
+               sbus_request(dd, bcast, 0x07, WRITE_SBUS_RECEIVER, 0x00000010);
+               /* turn SPICO enable on */
+               sbus_request(dd, bcast, 0x07, WRITE_SBUS_RECEIVER, 0x00000002);
+       }
+
+       clear_sbus_fast_mode(dd);
+       release_chip_resource(dd, CR_SBUS);
+}
+
+/*
+ * Issue a single SBus request with fast mode off and wait for the
+ * EXECUTE/DONE handshake to complete.
+ *
+ * Return 0 on success, -ETIMEDOUT if the request does not complete,
+ * -ETIME if DONE does not clear after EXECUTE is cleared.
+ *
+ * Access to the SBus in this routine should probably be serialized.
+ */
+int sbus_request_slow(struct hfi1_devdata *dd,
+                     u8 receiver_addr, u8 data_addr, u8 command, u32 data_in)
+{
+       u64 result, polls;
+
+       /* make sure fast mode is clear */
+       clear_sbus_fast_mode(dd);
+
+       sbus_request(dd, receiver_addr, data_addr, command, data_in);
+       write_csr(dd, ASIC_CFG_SBUS_EXECUTE,
+                 ASIC_CFG_SBUS_EXECUTE_EXECUTE_SMASK);
+
+       /* Wait for both DONE and RCV_DATA_VALID to go high */
+       for (polls = 0; ; polls++) {
+               result = read_csr(dd, ASIC_STS_SBUS_RESULT);
+               if ((result & ASIC_STS_SBUS_RESULT_DONE_SMASK) &&
+                   (result & ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK))
+                       break;
+
+               if (polls >= SBUS_MAX_POLL_COUNT) {
+                       u64 counters = read_csr(dd, ASIC_STS_SBUS_COUNTERS);
+
+                       /*
+                        * Out of polls.  This is still OK if the DONE bit
+                        * is set and the RCV_DATA_VALID and EXECUTE
+                        * counters agree.  Otherwise we cannot proceed.
+                        */
+                       if (!(result & ASIC_STS_SBUS_RESULT_DONE_SMASK) ||
+                           (SBUS_COUNTER(counters, RCV_DATA_VALID) !=
+                            SBUS_COUNTER(counters, EXECUTE)))
+                               return -ETIMEDOUT;
+                       break;
+               }
+               udelay(1);
+       }
+
+       write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0);
+
+       /* Wait for DONE to clear after EXECUTE is cleared */
+       for (polls = 0; ; polls++) {
+               result = read_csr(dd, ASIC_STS_SBUS_RESULT);
+               if (!(result & ASIC_STS_SBUS_RESULT_DONE_SMASK))
+                       break;
+               if (polls >= SBUS_MAX_POLL_COUNT)
+                       return -ETIME;
+               udelay(1);
+       }
+
+       return 0;
+}
+
+/*
+ * Download the fabric serdes firmware to this HFI's fabric serdes
+ * broadcast address and validate it with the RSA engine.
+ *
+ * Return 0 on success, otherwise the error from run_rsa().  The SPICO is
+ * only enabled when validation succeeds.
+ */
+static int load_fabric_serdes_firmware(struct hfi1_devdata *dd,
+                                      struct firmware_details *fdet)
+{
+       const u8 bcast = fabric_serdes_broadcast[dd->hfi1_id];
+       int off, ret;
+
+       dd_dev_info(dd, "Downloading fabric firmware\n");
+
+       /* step 1: load security variables */
+       load_security_variables(dd, fdet);
+
+       /* step 2: place SerDes in reset and disable SPICO */
+       sbus_request(dd, bcast, 0x07, WRITE_SBUS_RECEIVER, 0x00000011);
+       /* wait 100 refclk cycles @ 156.25MHz => 640ns */
+       udelay(1);
+
+       /* step 3: remove SerDes reset */
+       sbus_request(dd, bcast, 0x07, WRITE_SBUS_RECEIVER, 0x00000010);
+
+       /* step 4: assert IMEM override */
+       sbus_request(dd, bcast, 0x00, WRITE_SBUS_RECEIVER, 0x40000000);
+
+       /* step 5: download SerDes machine code, one 32-bit word per request */
+       for (off = 0; off < fdet->firmware_len; off += 4)
+               sbus_request(dd, bcast, 0x0a, WRITE_SBUS_RECEIVER,
+                            *(u32 *)(fdet->firmware_ptr + off));
+
+       /* step 6: IMEM override off */
+       sbus_request(dd, bcast, 0x00, WRITE_SBUS_RECEIVER, 0x00000000);
+
+       /* step 7: turn ECC on */
+       sbus_request(dd, bcast, 0x0b, WRITE_SBUS_RECEIVER, 0x000c0000);
+
+       /* steps 8-11: run the RSA engine */
+       ret = run_rsa(dd, "fabric serdes", fdet->signature);
+       if (!ret) {
+               /* step 12: turn SPICO enable on */
+               sbus_request(dd, bcast, 0x07, WRITE_SBUS_RECEIVER, 0x00000002);
+               /* step 13: enable core hardware interrupts */
+               sbus_request(dd, bcast, 0x08, WRITE_SBUS_RECEIVER, 0x00000000);
+       }
+
+       return ret;
+}
+
+/*
+ * Download the SBus Master firmware through the SBus master broadcast
+ * address and validate it with the RSA engine.
+ *
+ * Return 0 on success, otherwise the error from run_rsa().  SPICO_ENABLE
+ * is only set when validation succeeds.
+ */
+static int load_sbus_firmware(struct hfi1_devdata *dd,
+                             struct firmware_details *fdet)
+{
+       const u8 bcast = SBUS_MASTER_BROADCAST;
+       int off, ret;
+
+       dd_dev_info(dd, "Downloading SBus firmware\n");
+
+       /* step 1: load security variables */
+       load_security_variables(dd, fdet);
+
+       /* step 2: place SPICO into reset and enable off */
+       sbus_request(dd, bcast, 0x01, WRITE_SBUS_RECEIVER, 0x000000c0);
+
+       /* step 3: remove reset, enable off, IMEM_CNTRL_EN on */
+       sbus_request(dd, bcast, 0x01, WRITE_SBUS_RECEIVER, 0x00000240);
+
+       /* step 4: set starting IMEM address for burst download */
+       sbus_request(dd, bcast, 0x03, WRITE_SBUS_RECEIVER, 0x80000000);
+
+       /* step 5: download the SBus Master machine code, a word at a time */
+       for (off = 0; off < fdet->firmware_len; off += 4)
+               sbus_request(dd, bcast, 0x14, WRITE_SBUS_RECEIVER,
+                            *(u32 *)(fdet->firmware_ptr + off));
+
+       /* step 6: set IMEM_CNTL_EN off */
+       sbus_request(dd, bcast, 0x01, WRITE_SBUS_RECEIVER, 0x00000040);
+
+       /* step 7: turn ECC on */
+       sbus_request(dd, bcast, 0x16, WRITE_SBUS_RECEIVER, 0x000c0000);
+
+       /* steps 8-11: run the RSA engine */
+       ret = run_rsa(dd, "SBus", fdet->signature);
+       if (!ret) {
+               /* step 12: set SPICO_ENABLE on */
+               sbus_request(dd, bcast, 0x01, WRITE_SBUS_RECEIVER, 0x00000140);
+       }
+
+       return ret;
+}
+
+/*
+ * Download the PCIe serdes firmware into the SBus Master XDMEM and
+ * validate it with the RSA engine.
+ *
+ * Return the result of run_rsa(): if it succeeds, the firmware is
+ * available to be swapped.
+ */
+static int load_pcie_serdes_firmware(struct hfi1_devdata *dd,
+                                    struct firmware_details *fdet)
+{
+       const u8 bcast = SBUS_MASTER_BROADCAST;
+       int off;
+
+       dd_dev_info(dd, "Downloading PCIe firmware\n");
+
+       /* step 1: load security variables */
+       load_security_variables(dd, fdet);
+
+       /* step 2: assert single step (halts the SBus Master spico) */
+       sbus_request(dd, bcast, 0x05, WRITE_SBUS_RECEIVER, 0x00000001);
+
+       /* step 3: enable XDMEM access */
+       sbus_request(dd, bcast, 0x01, WRITE_SBUS_RECEIVER, 0x00000d40);
+
+       /*
+        * step 4: load firmware into SBus Master XDMEM.
+        * The dmem address, write_en, and wdata are all pre-packed in the
+        * image, so each 32-bit word is simply picked up and written.
+        */
+       for (off = 0; off < fdet->firmware_len; off += 4)
+               sbus_request(dd, bcast, 0x04, WRITE_SBUS_RECEIVER,
+                            *(u32 *)(fdet->firmware_ptr + off));
+
+       /* step 5: disable XDMEM access */
+       sbus_request(dd, bcast, 0x01, WRITE_SBUS_RECEIVER, 0x00000140);
+
+       /* step 6: allow SBus Spico to run */
+       sbus_request(dd, bcast, 0x05, WRITE_SBUS_RECEIVER, 0x00000000);
+
+       /* steps 7-11: run RSA */
+       return run_rsa(dd, "PCIe serdes", fdet->signature);
+}
+
+/*
+ * Program broadcast groups bg1 and bg2 into each of the count devices
+ * listed in addrs.  The list is walked from its last entry to its first.
+ */
+static void set_serdes_broadcast(struct hfi1_devdata *dd, u8 bg1, u8 bg2,
+                                const u8 *addrs, int count)
+{
+       /*
+        * Value for register 0xfd.  Only BROADCAST_GROUP_1 and
+        * BROADCAST_GROUP_2 are set; everything else keeps its default.
+        * The register is written outright, never read-modify-written,
+        * per instruction from the manufacturer.
+        *
+        * Register 0xfd:
+        *      bits    what
+        *      -----   ---------------------------------
+        *        0     IGNORE_BROADCAST  (default 0)
+        *      11:4    BROADCAST_GROUP_1 (default 0xff)
+        *      23:16   BROADCAST_GROUP_2 (default 0xff)
+        */
+       const u32 groups = (u32)bg1 << 4 | (u32)bg2 << 16;
+       int idx;
+
+       for (idx = count - 1; idx >= 0; idx--)
+               sbus_request(dd, addrs[idx], 0xfd, WRITE_SBUS_RECEIVER,
+                            groups);
+}
+
+/*
+ * Acquire the ASIC hardware mutex for this HFI.
+ *
+ * The mutex is requested by writing this HFI's mask to ASIC_CFG_MUTEX and
+ * reading it back; it is held when the read-back equals the mask.  If it
+ * cannot be obtained within HM_TIMEOUT milliseconds, the mutex is forcibly
+ * cleared and one more full attempt is made.
+ *
+ * Return 0 on success, -EBUSY if both attempts time out.
+ */
+int acquire_hw_mutex(struct hfi1_devdata *dd)
+{
+       const u8 mask = 1 << dd->hfi1_id;
+       unsigned long deadline;
+       int attempt;
+       u8 owner;
+
+       for (attempt = 0; attempt < 2; attempt++) {
+               deadline = jiffies + msecs_to_jiffies(HM_TIMEOUT);
+               for (;;) {
+                       write_csr(dd, ASIC_CFG_MUTEX, mask);
+                       owner = (u8)read_csr(dd, ASIC_CFG_MUTEX);
+                       if (owner == mask)
+                               return 0; /* success */
+                       if (time_after(jiffies, deadline))
+                               break; /* timed out */
+                       msleep(20);
+               }
+
+               /* timed out */
+               dd_dev_err(dd,
+                          "Unable to acquire hardware mutex, mutex mask %u, my mask %u (%s)\n",
+                          (u32)owner, (u32)mask,
+                          (attempt == 0) ? "retrying" : "giving up");
+
+               /* after the first failure, break the mutex before retrying */
+               if (attempt == 0)
+                       write_csr(dd, ASIC_CFG_MUTEX, 0);
+       }
+
+       return -EBUSY;
+}
+
+/*
+ * Release the ASIC hardware mutex by writing zero to ASIC_CFG_MUTEX.
+ */
+void release_hw_mutex(struct hfi1_devdata *dd)
+{
+       /* zero means no holder */
+       write_csr(dd, ASIC_CFG_MUTEX, 0);
+}
+
+/*
+ * Return the given resource bit(s) as a mask for the given HFI.  HFI 0
+ * uses the bits as they are; any other HFI id has them shifted up by
+ * CR_DYN_SHIFT.
+ */
+static inline u64 resource_mask(u32 hfi1_id, u32 resource)
+{
+       u64 mask = resource;
+
+       if (hfi1_id)
+               mask <<= CR_DYN_SHIFT;
+
+       return mask;
+}
+
+/*
+ * Log the standard error for a failed hardware mutex acquire.  func is
+ * the name of the function reporting the failure.
+ */
+static void fail_mutex_acquire_message(struct hfi1_devdata *dd,
+                                      const char *func)
+{
+       dd_dev_err(dd, "%s: hardware mutex stuck - suggest rebooting the machine\n",
+                  func);
+}
+
+/*
+ * Make a single attempt to acquire access to a chip resource.
+ *
+ * Resource ownership is tracked as bits in ASIC_CFG_SCRATCH.  The
+ * read-check-set of those bits is done under both the driver's
+ * asic_resource_mutex and the hardware mutex.
+ *
+ * Return 0 on success, -EBUSY if resource busy, -EIO if mutex acquire failed.
+ */
+static int __acquire_chip_resource(struct hfi1_devdata *dd, u32 resource)
+{
+       u64 scratch, busy_bits, own_bit;
+       int ret;
+
+       if (!(resource & CR_DYN_MASK)) {
+               /* non-dynamic resources are not split between HFIs */
+               busy_bits = resource;
+               own_bit = resource;
+       } else {
+               own_bit = resource_mask(dd->hfi1_id, resource);
+
+               /* a dynamic resource is in use if either HFI has set the bit */
+               if (dd->pcidev->device == PCI_DEVICE_ID_INTEL0 &&
+                   (resource & (CR_I2C1 | CR_I2C2))) {
+                       /* discrete devices must serialize across both chains */
+                       busy_bits = resource_mask(0, CR_I2C1 | CR_I2C2) |
+                                   resource_mask(1, CR_I2C1 | CR_I2C2);
+               } else {
+                       busy_bits = resource_mask(0, resource) |
+                                   resource_mask(1, resource);
+               }
+       }
+
+       /* lock against other callers within the driver wanting a resource */
+       mutex_lock(&dd->asic_data->asic_resource_mutex);
+
+       ret = acquire_hw_mutex(dd);
+       if (ret) {
+               fail_mutex_acquire_message(dd, __func__);
+               ret = -EIO;
+       } else {
+               scratch = read_csr(dd, ASIC_CFG_SCRATCH);
+               if (scratch & busy_bits) {
+                       ret = -EBUSY;
+               } else {
+                       write_csr(dd, ASIC_CFG_SCRATCH, scratch | own_bit);
+                       /* force write to be visible to other HFI on another OS */
+                       (void)read_csr(dd, ASIC_CFG_SCRATCH);
+               }
+
+               release_hw_mutex(dd);
+       }
+
+       mutex_unlock(&dd->asic_data->asic_resource_mutex);
+       return ret;
+}
+
+/*
+ * Acquire access to a chip resource, retrying for up to mswait
+ * milliseconds while the resource is busy.
+ *
+ * Return 0 on success, -EBUSY if busy (even after wait), -EIO if mutex
+ * acquire failed.
+ */
+int acquire_chip_resource(struct hfi1_devdata *dd, u32 resource, u32 mswait)
+{
+       unsigned long deadline = jiffies + msecs_to_jiffies(mswait);
+       int ret;
+
+       for (;;) {
+               ret = __acquire_chip_resource(dd, resource);
+               /*
+                * Anything other than busy is final.  Busy is final too
+                * once the deadline has been reached.
+                */
+               if (ret != -EBUSY || time_after_eq(jiffies, deadline))
+                       return ret;
+               usleep_range(80, 120);  /* arbitrary delay */
+       }
+}
+
+/*
+ * Release access to a chip resource.
+ *
+ * Only dynamic resources may be released; anything else is reported as
+ * an error and ignored.  A warning is logged if this HFI's bit for the
+ * resource was not set.
+ */
+void release_chip_resource(struct hfi1_devdata *dd, u32 resource)
+{
+       u64 scratch, own_bit;
+
+       /* only dynamic resources should ever be cleared */
+       if (!(resource & CR_DYN_MASK)) {
+               dd_dev_err(dd, "%s: invalid resource 0x%x\n", __func__,
+                          resource);
+               return;
+       }
+       own_bit = resource_mask(dd->hfi1_id, resource);
+
+       /* lock against other callers within the driver wanting a resource */
+       mutex_lock(&dd->asic_data->asic_resource_mutex);
+
+       if (acquire_hw_mutex(dd)) {
+               fail_mutex_acquire_message(dd, __func__);
+       } else {
+               scratch = read_csr(dd, ASIC_CFG_SCRATCH);
+               if (scratch & own_bit) {
+                       write_csr(dd, ASIC_CFG_SCRATCH, scratch & ~own_bit);
+                       /* force write to be visible to other HFI on another OS */
+                       (void)read_csr(dd, ASIC_CFG_SCRATCH);
+               } else {
+                       dd_dev_warn(dd, "%s: id %d, resource 0x%x: bit not set\n",
+                                   __func__, dd->hfi1_id, resource);
+               }
+
+               release_hw_mutex(dd);
+       }
+
+       mutex_unlock(&dd->asic_data->asic_resource_mutex);
+}
+
+/*
+ * Report whether this HFI's bit for the resource is set in
+ * ASIC_CFG_SCRATCH.
+ *
+ * Return true if resource is set, false otherwise.  When it is not set
+ * and func is supplied, a warning naming func is printed.
+ */
+bool check_chip_resource(struct hfi1_devdata *dd, u32 resource,
+                        const char *func)
+{
+       /* dynamic resources are tracked per HFI, the rest are used as-is */
+       u64 bit = (resource & CR_DYN_MASK) ?
+                       resource_mask(dd->hfi1_id, resource) : resource;
+
+       if (read_csr(dd, ASIC_CFG_SCRATCH) & bit)
+               return true;
+
+       if (func)
+               dd_dev_warn(dd,
+                           "%s: id %d, resource 0x%x, not acquired!\n",
+                           func, dd->hfi1_id, resource);
+       return false;
+}
+
+/*
+ * Clear every dynamic resource bit held by this HFI in ASIC_CFG_SCRATCH.
+ * func names the caller and is used in the message logged if the
+ * hardware mutex cannot be acquired, in which case nothing is cleared.
+ */
+static void clear_chip_resources(struct hfi1_devdata *dd, const char *func)
+{
+       u64 scratch;
+
+       /* lock against other callers within the driver wanting a resource */
+       mutex_lock(&dd->asic_data->asic_resource_mutex);
+
+       if (acquire_hw_mutex(dd)) {
+               fail_mutex_acquire_message(dd, func);
+       } else {
+               /* clear all dynamic access bits for this HFI */
+               scratch = read_csr(dd, ASIC_CFG_SCRATCH);
+               write_csr(dd, ASIC_CFG_SCRATCH,
+                         scratch & ~resource_mask(dd->hfi1_id, CR_DYN_MASK));
+               /* force write to be visible to other HFI on another OS */
+               (void)read_csr(dd, ASIC_CFG_SCRATCH);
+
+               release_hw_mutex(dd);
+       }
+
+       mutex_unlock(&dd->asic_data->asic_resource_mutex);
+}
+
+/*
+ * Start from a clean slate: drop any dynamic resource holds this HFI
+ * still has recorded.
+ */
+void init_chip_resources(struct hfi1_devdata *dd)
+{
+       /* clear any holds left by us */
+       clear_chip_resources(dd, __func__);
+}
+
+/*
+ * Drop any dynamic resource holds this HFI still has recorded when it
+ * is done with the chip resources.
+ */
+void finish_chip_resources(struct hfi1_devdata *dd)
+{
+       /* clear any holds left by us */
+       clear_chip_resources(dd, __func__);
+}
+
+/*
+ * Turn on SBus fast mode by writing only the FAST_MODE bit to
+ * ASIC_CFG_SBUS_EXECUTE.
+ */
+void set_sbus_fast_mode(struct hfi1_devdata *dd)
+{
+       write_csr(dd, ASIC_CFG_SBUS_EXECUTE,
+                 ASIC_CFG_SBUS_EXECUTE_FAST_MODE_SMASK);
+}
+
+/*
+ * Turn off SBus fast mode.
+ *
+ * First wait, for at most SBUS_MAX_POLL_COUNT polls, until the EXECUTE
+ * and RCV_DATA_VALID counters match.  ASIC_CFG_SBUS_EXECUTE is then
+ * cleared whether or not the counters ever matched.
+ */
+void clear_sbus_fast_mode(struct hfi1_devdata *dd)
+{
+       u64 counters, polls;
+
+       for (polls = 0; ; polls++) {
+               counters = read_csr(dd, ASIC_STS_SBUS_COUNTERS);
+               if (SBUS_COUNTER(counters, EXECUTE) ==
+                   SBUS_COUNTER(counters, RCV_DATA_VALID))
+                       break;
+               if (polls >= SBUS_MAX_POLL_COUNT)
+                       break;
+               udelay(1);
+       }
+
+       write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0);
+}
+
+/*
+ * Download the enabled firmware images to the device: first the fabric
+ * serdes firmware (under the SBus chip resource, in SBus fast mode), then
+ * the 8051 firmware.  Each download is repeated for as long as
+ * retry_firmware() asks for another try.
+ *
+ * Return 0 on success, otherwise the first error encountered.
+ */
+int load_firmware(struct hfi1_devdata *dd)
+{
+       int ret = 0;
+
+       if (fw_fabric_serdes_load) {
+               ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT);
+               if (ret)
+                       return ret;
+
+               set_sbus_fast_mode(dd);
+
+               set_serdes_broadcast(dd, all_fabric_serdes_broadcast,
+                                    fabric_serdes_broadcast[dd->hfi1_id],
+                                    fabric_serdes_addrs[dd->hfi1_id],
+                                    NUM_FABRIC_SERDES);
+               turn_off_spicos(dd, SPICO_FABRIC);
+               do {
+                       ret = load_fabric_serdes_firmware(dd, &fw_fabric);
+               } while (retry_firmware(dd, ret));
+
+               /* the SBus is given back whether or not the load worked */
+               clear_sbus_fast_mode(dd);
+               release_chip_resource(dd, CR_SBUS);
+               if (ret)
+                       return ret;
+       }
+
+       if (fw_8051_load) {
+               do {
+                       ret = load_8051_firmware(dd, &fw_8051);
+               } while (retry_firmware(dd, ret));
+       }
+
+       return ret;
+}
+
+/*
+ * Decide which firmware images apply to this device type, fill in the
+ * default name for every image whose name has not been set, then obtain
+ * the firmware.
+ *
+ * Return the result of obtain_firmware().
+ */
+int hfi1_firmware_init(struct hfi1_devdata *dd)
+{
+       const bool on_silicon = dd->icode == ICODE_RTL_SILICON;
+
+       /* only RTL can use these */
+       if (!on_silicon) {
+               fw_fabric_serdes_load = 0;
+               fw_pcie_serdes_load = 0;
+               fw_sbus_load = 0;
+       }
+
+       /* no 8051 or QSFP on simulator */
+       if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+               fw_8051_load = 0;
+               platform_config_load = 0;
+       }
+
+       /* the default 8051 image depends on the device type */
+       if (!fw_8051_name)
+               fw_8051_name = on_silicon ? DEFAULT_FW_8051_NAME_ASIC :
+                                           DEFAULT_FW_8051_NAME_FPGA;
+
+       if (!fw_fabric_serdes_name)
+               fw_fabric_serdes_name = DEFAULT_FW_FABRIC_NAME;
+       if (!fw_sbus_name)
+               fw_sbus_name = DEFAULT_FW_SBUS_NAME;
+       if (!fw_pcie_serdes_name)
+               fw_pcie_serdes_name = DEFAULT_FW_PCIE_NAME;
+       if (!platform_config_name)
+               platform_config_name = DEFAULT_PLATFORM_CONFIG_NAME;
+
+       return obtain_firmware(dd);
+}
+
+/*
+ * This function is a helper function for parse_platform_config(...) and
+ * does not check for validity of the platform configuration cache
+ * (because we know it is invalid as we are building up the cache).
+ * As such, this should not be called from anywhere other than
+ * parse_platform_config
+ *
+ * The system table metadata entry for the version field gives the
+ * field's start (in bits) and length (in bits).  The version is read
+ * from the single byte of system_table that contains the start bit and
+ * is masked down to the field length.
+ *
+ * Return 0 if the version is acceptable, -EINVAL if system_table is NULL
+ * or the version is less than 5.
+ */
+static int check_meta_version(struct hfi1_devdata *dd, u32 *system_table)
+{
+       u32 meta_ver, meta_ver_meta, ver_start, ver_len, mask;
+       struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
+
+       if (!system_table)
+               return -EINVAL;
+
+       meta_ver_meta =
+       *(pcfgcache->config_tables[PLATFORM_CONFIG_SYSTEM_TABLE].table_metadata
+       + SYSTEM_TABLE_META_VERSION);
+
+       /* low bits: start of the version field, in bits */
+       mask = ((1 << METADATA_TABLE_FIELD_START_LEN_BITS) - 1);
+       ver_start = meta_ver_meta & mask;
+
+       meta_ver_meta >>= METADATA_TABLE_FIELD_LEN_SHIFT;
+
+       /* next field: length of the version field, in bits */
+       mask = ((1 << METADATA_TABLE_FIELD_LEN_LEN_BITS) - 1);
+       ver_len = meta_ver_meta & mask;
+
+       /* convert the bit offset into a byte offset */
+       ver_start /= 8;
+       meta_ver = *((u8 *)system_table + ver_start);
+
+       /*
+        * Only one byte is read, so a length of 8 bits or more keeps the
+        * whole byte.  Masking only for shorter lengths gives the same
+        * value as before for every length below 31 and avoids shifting
+        * by an out-of-range count (undefined behavior) when the
+        * file-supplied length is 31 or more - nothing here bounds it.
+        */
+       if (ver_len < 8)
+               meta_ver &= (1u << ver_len) - 1;
+
+       if (meta_ver < 5) {
+               dd_dev_info(
+                       dd, "%s:Please update platform config\n", __func__);
+               return -EINVAL;
+       }
+       return 0;
+}
+
+/*
+ * parse_platform_config() - validate the platform config file and index it
+ * @dd: device data; dd->platform_config is read, dd->pcfg_cache is filled in
+ *
+ * The file is walked as: one magic dword, one dword holding the file size in
+ * dwords, then a sequence of tables.  Each table is two header dwords (the
+ * second must be the bitwise inverse of the first), table_length_dwords
+ * dwords of payload and one CRC dword covering the payload.  A header with a
+ * non-zero record index introduces a data table, a zero record index a
+ * metadata table; the payload address is recorded in the cache per table
+ * type.
+ *
+ * Every header, payload and CRC dword is checked to lie inside the
+ * self-reported file length before it is read.
+ *
+ * Return: 0 with pcfg_cache.cache_valid set on success.  On any failure the
+ * cache is zeroed and a negative errno is returned.
+ */
+int parse_platform_config(struct hfi1_devdata *dd)
+{
+       struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
+       u32 *base = NULL, *ptr = NULL, *end = NULL;
+       u32 header1 = 0, header2 = 0, magic_num = 0, crc = 0, file_length = 0;
+       u32 record_idx = 0, table_type = 0, table_length_dwords = 0;
+       int ret = -EINVAL; /* assume failure */
+
+       if (!dd->platform_config.data) {
+               dd_dev_info(dd, "%s: Missing config file\n", __func__);
+               goto bail;
+       }
+
+       /* The magic number and the length dword must both be readable */
+       if (dd->platform_config.size < 2 * sizeof(u32)) {
+               dd_dev_info(dd, "%s: Bad config file\n", __func__);
+               goto bail;
+       }
+       base = (u32 *)dd->platform_config.data;
+       ptr = base;
+
+       magic_num = *ptr;
+       ptr++;
+       if (magic_num != PLATFORM_CONFIG_MAGIC_NUM) {
+               dd_dev_info(dd, "%s: Bad config file\n", __func__);
+               goto bail;
+       }
+
+       /* Field is file size in DWORDs */
+       file_length = (*ptr) * 4;
+       ptr++;
+
+       if (file_length > dd->platform_config.size) {
+               dd_dev_info(dd, "%s:File claims to be larger than read size\n",
+                           __func__);
+               goto bail;
+       } else if (file_length < dd->platform_config.size) {
+               dd_dev_info(dd,
+                           "%s:File claims to be smaller than read size, continuing\n",
+                           __func__);
+       }
+       /* exactly equal, perfection */
+
+       /*
+        * In both cases where we proceed, using the self-reported file length
+        * is the safer option
+        */
+       end = (u32 *)(dd->platform_config.data + file_length);
+       while (ptr < end) {
+               /* Both header dwords must be inside the file */
+               if (end - ptr < 2) {
+                       dd_dev_info(dd,
+                                   "%s: Truncated table header at offset %ld\n",
+                                   __func__, (long)(ptr - base));
+                       goto bail;
+               }
+
+               header1 = *ptr;
+               header2 = *(ptr + 1);
+               if (header1 != ~header2) {
+                       dd_dev_info(dd, "%s: Failed validation at offset %ld\n",
+                                   __func__, (long)(ptr - base));
+                       goto bail;
+               }
+
+               record_idx = header1 &
+                       ((1 << PLATFORM_CONFIG_HEADER_RECORD_IDX_LEN_BITS) - 1);
+
+               table_length_dwords = (header1 >>
+                               PLATFORM_CONFIG_HEADER_TABLE_LENGTH_SHIFT) &
+                     ((1 << PLATFORM_CONFIG_HEADER_TABLE_LENGTH_LEN_BITS) - 1);
+
+               table_type = (header1 >>
+                               PLATFORM_CONFIG_HEADER_TABLE_TYPE_SHIFT) &
+                       ((1 << PLATFORM_CONFIG_HEADER_TABLE_TYPE_LEN_BITS) - 1);
+
+               /* Done with this set of headers */
+               ptr += 2;
+
+               /* The payload plus its trailing CRC dword must fit as well */
+               if ((size_t)(end - ptr) <= table_length_dwords) {
+                       dd_dev_info(dd,
+                                   "%s: Table overruns file at offset %ld\n",
+                                   __func__, (long)(ptr - base));
+                       goto bail;
+               }
+
+               if (record_idx) {
+                       /* data table */
+                       switch (table_type) {
+                       case PLATFORM_CONFIG_SYSTEM_TABLE:
+                               pcfgcache->config_tables[table_type].num_table =
+                                                                       1;
+                               ret = check_meta_version(dd, ptr);
+                               if (ret)
+                                       goto bail;
+                               /*
+                                * Re-arm the failure default: the checks
+                                * below bail out without assigning ret, and
+                                * must not report success.
+                                */
+                               ret = -EINVAL;
+                               break;
+                       case PLATFORM_CONFIG_PORT_TABLE:
+                               pcfgcache->config_tables[table_type].num_table =
+                                                                       2;
+                               break;
+                       case PLATFORM_CONFIG_RX_PRESET_TABLE:
+                               /* fall through */
+                       case PLATFORM_CONFIG_TX_PRESET_TABLE:
+                               /* fall through */
+                       case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
+                               /* fall through */
+                       case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
+                               pcfgcache->config_tables[table_type].num_table =
+                                                       table_length_dwords;
+                               break;
+                       default:
+                               dd_dev_info(dd,
+                                           "%s: Unknown data table %d, offset %ld\n",
+                                           __func__, table_type,
+                                           (long)(ptr - base));
+                               goto bail; /* We don't trust this file now */
+                       }
+                       pcfgcache->config_tables[table_type].table = ptr;
+               } else {
+                       /* metadata table */
+                       switch (table_type) {
+                       case PLATFORM_CONFIG_SYSTEM_TABLE:
+                               /* fall through */
+                       case PLATFORM_CONFIG_PORT_TABLE:
+                               /* fall through */
+                       case PLATFORM_CONFIG_RX_PRESET_TABLE:
+                               /* fall through */
+                       case PLATFORM_CONFIG_TX_PRESET_TABLE:
+                               /* fall through */
+                       case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
+                               /* fall through */
+                       case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
+                               break;
+                       default:
+                               dd_dev_info(dd,
+                                           "%s: Unknown meta table %d, offset %ld\n",
+                                           __func__, table_type,
+                                           (long)(ptr - base));
+                               goto bail; /* We don't trust this file now */
+                       }
+                       pcfgcache->config_tables[table_type].table_metadata =
+                                                                       ptr;
+               }
+
+               /* Calculate and check table crc */
+               crc = crc32_le(~(u32)0, (unsigned char const *)ptr,
+                              (table_length_dwords * 4));
+               crc ^= ~(u32)0;
+
+               /* Jump the table */
+               ptr += table_length_dwords;
+               if (crc != *ptr) {
+                       dd_dev_info(dd, "%s: Failed CRC check at offset %ld\n",
+                                   __func__, (long)(ptr - base));
+                       goto bail;
+               }
+               /* Jump the CRC DWORD */
+               ptr++;
+       }
+
+       pcfgcache->cache_valid = 1;
+       return 0;
+bail:
+       /* Never leave a half-populated cache behind */
+       memset(pcfgcache, 0, sizeof(struct platform_config_cache));
+       return ret;
+}
+
+/*
+ * get_platform_fw_field_metadata() - look up a field's position and width
+ * @dd: device data holding the parsed platform config cache
+ * @table: table type the field belongs to
+ * @field: field index within that table; must be greater than zero and
+ *         below platform_config_table_limits[table]
+ * @field_len_bits: if non-NULL, receives the field length in bits
+ * @field_start_bits: if non-NULL, receives the field start offset in bits
+ *
+ * Reads the metadata dword for @field from the cached metadata table and
+ * splits it into start and length.
+ *
+ * Return: 0 on success; -EINVAL if the cache is not valid, the table type is
+ * unknown, the field index is out of range, or the file carried no metadata
+ * table of this type.
+ */
+static int get_platform_fw_field_metadata(struct hfi1_devdata *dd, int table,
+                                         int field, u32 *field_len_bits,
+                                         u32 *field_start_bits)
+{
+       struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
+       u32 *src_ptr = NULL;
+
+       if (!pcfgcache->cache_valid)
+               return -EINVAL;
+
+       switch (table) {
+       case PLATFORM_CONFIG_SYSTEM_TABLE:
+               /* fall through */
+       case PLATFORM_CONFIG_PORT_TABLE:
+               /* fall through */
+       case PLATFORM_CONFIG_RX_PRESET_TABLE:
+               /* fall through */
+       case PLATFORM_CONFIG_TX_PRESET_TABLE:
+               /* fall through */
+       case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
+               /* fall through */
+       case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
+               if (field > 0 && field < platform_config_table_limits[table])
+                       src_ptr =
+                       pcfgcache->config_tables[table].table_metadata;
+               /*
+                * A table type that was absent from the file leaves
+                * table_metadata NULL; only offset a real pointer so the
+                * NULL check below still catches that case.
+                */
+               if (src_ptr)
+                       src_ptr += field;
+               break;
+       default:
+               dd_dev_info(dd, "%s: Unknown table\n", __func__);
+               break;
+       }
+
+       if (!src_ptr)
+               return -EINVAL;
+
+       if (field_start_bits)
+               *field_start_bits = *src_ptr &
+                     ((1 << METADATA_TABLE_FIELD_START_LEN_BITS) - 1);
+
+       if (field_len_bits)
+               *field_len_bits = (*src_ptr >> METADATA_TABLE_FIELD_LEN_SHIFT)
+                      & ((1 << METADATA_TABLE_FIELD_LEN_LEN_BITS) - 1);
+
+       return 0;
+}
+
+/*
+ * This is the central interface to getting data out of the platform config
+ * file. It depends on parse_platform_config() having populated the
+ * platform_config_cache in hfi1_devdata, and checks the cache_valid member to
+ * validate the sanity of the cache.
+ *
+ * The non-obvious parameters:
+ * @table_index: Acts as a look up key into which instance of the tables the
+ * relevant field is fetched from.
+ *
+ * This applies to the data tables that have multiple instances. The port table
+ * is an exception to this rule as each HFI only has one port and thus the
+ * relevant table can be distinguished by hfi_id.
+ *
+ * @data: pointer to memory that will be populated with the field requested.
+ * @len: length of memory pointed by @data in bytes.
+ *
+ * @data is zeroed before anything else is checked.  System table fields
+ * other than SYSTEM_TABLE_QSFP_POWER_CLASS_MAX are copied out byte-wise;
+ * every other field is extracted from a single dword with shift and mask.
+ *
+ * Return: 0 on success; -EINVAL if @data is NULL, the metadata lookup fails,
+ * the requested data table is missing from the cache, @table_index is out of
+ * range, or @len is too small for the field.
+ */
+int get_platform_config_field(struct hfi1_devdata *dd,
+                             enum platform_config_table_type_encoding
+                             table_type, int table_index, int field_index,
+                             u32 *data, u32 len)
+{
+       int ret = 0, wlen = 0, seek = 0;
+       u32 field_len_bits = 0, field_start_bits = 0, *src_ptr = NULL;
+       u32 mask = 0;
+       struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
+
+       if (data)
+               memset(data, 0, len);
+       else
+               return -EINVAL;
+
+       ret = get_platform_fw_field_metadata(dd, table_type, field_index,
+                                            &field_len_bits,
+                                            &field_start_bits);
+       if (ret)
+               return -EINVAL;
+
+       /* Convert length to bits */
+       len *= 8;
+
+       /* Our metadata function checked cache_valid and field_index for us */
+       switch (table_type) {
+       case PLATFORM_CONFIG_SYSTEM_TABLE:
+               src_ptr = pcfgcache->config_tables[table_type].table;
+
+               if (field_index != SYSTEM_TABLE_QSFP_POWER_CLASS_MAX) {
+                       /* No system data table in the file: nothing to copy */
+                       if (!src_ptr || len < field_len_bits)
+                               return -EINVAL;
+
+                       seek = field_start_bits / 8;
+                       wlen = field_len_bits / 8;
+
+                       src_ptr = (u32 *)((u8 *)src_ptr + seek);
+
+                       /*
+                        * We expect the field to be byte aligned and whole byte
+                        * lengths if we are here
+                        */
+                       memcpy(data, src_ptr, wlen);
+                       return 0;
+               }
+               break;
+       case PLATFORM_CONFIG_PORT_TABLE:
+               /* Port table is 4 DWORDS */
+               src_ptr = pcfgcache->config_tables[table_type].table;
+               /* Only offset a real pointer so a missing table stays NULL */
+               if (src_ptr && dd->hfi1_id)
+                       src_ptr += 4;
+               break;
+       case PLATFORM_CONFIG_RX_PRESET_TABLE:
+               /* fall through */
+       case PLATFORM_CONFIG_TX_PRESET_TABLE:
+               /* fall through */
+       case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
+               /* fall through */
+       case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
+               src_ptr = pcfgcache->config_tables[table_type].table;
+
+               if (src_ptr && table_index >= 0 &&
+                   table_index <
+                       pcfgcache->config_tables[table_type].num_table)
+                       src_ptr += table_index;
+               else
+                       src_ptr = NULL;
+               break;
+       default:
+               dd_dev_info(dd, "%s: Unknown table\n", __func__);
+               break;
+       }
+
+       if (!src_ptr || len < field_len_bits)
+               return -EINVAL;
+
+       /*
+        * Shifting a 32-bit value by 32 or more is undefined, so a field that
+        * fills the whole dword gets an all-ones mask explicitly.
+        */
+       if (field_len_bits >= 32)
+               mask = ~(u32)0;
+       else
+               mask = ((u32)1 << field_len_bits) - 1;
+
+       src_ptr += (field_start_bits / 32);
+       *data = (*src_ptr >> (field_start_bits % 32)) & mask;
+
+       return 0;
+}
+
+/*
+ * Load the firmware used by the Gen3 PCIe SerDes.  The SBus firmware is
+ * loaded first (when enabled), then the PCIe SerDes firmware (when enabled).
+ * SBus fast mode is turned on for the duration and always turned off again
+ * before returning.
+ *
+ * Note: caller must be holding the SBus resource.
+ *
+ * Returns 0 on success, otherwise the result of the load that failed.
+ */
+int load_pcie_firmware(struct hfi1_devdata *dd)
+{
+       int rc = 0;
+
+       /* both firmware loads below use the SBus */
+       set_sbus_fast_mode(dd);
+
+       if (fw_sbus_load) {
+               turn_off_spicos(dd, SPICO_SBUS);
+               for (;;) {
+                       rc = load_sbus_firmware(dd, &fw_sbus);
+                       if (!retry_firmware(dd, rc))
+                               break;
+               }
+       }
+
+       /* the PCIe SerDes load is skipped when the SBus load failed */
+       if (!rc && fw_pcie_serdes_load) {
+               dd_dev_info(dd, "Setting PCIe SerDes broadcast\n");
+               set_serdes_broadcast(dd, all_pcie_serdes_broadcast,
+                                    pcie_serdes_broadcast[dd->hfi1_id],
+                                    pcie_serdes_addrs[dd->hfi1_id],
+                                    NUM_PCIE_SERDES);
+               for (;;) {
+                       rc = load_pcie_serdes_firmware(dd, &fw_pcie);
+                       if (!retry_firmware(dd, rc))
+                               break;
+               }
+       }
+
+       clear_sbus_fast_mode(dd);
+
+       return rc;
+}
+
+/*
+ * Read the GUID from the hardware, store it in dd->base_guid and log it.
+ *
+ * CCE_DC_CTRL is written to 0 first and then read back (value discarded)
+ * before DC_DC8051_CFG_LOCAL_GUID is read.
+ */
+void read_guid(struct hfi1_devdata *dd)
+{
+       /* Take the DC out of reset to get a valid GUID value */
+       write_csr(dd, CCE_DC_CTRL, 0);
+       (void)read_csr(dd, CCE_DC_CTRL);
+
+       dd->base_guid = read_csr(dd, DC_DC8051_CFG_LOCAL_GUID);
+       /* terminate the message like every other dd_dev_info() in this file */
+       dd_dev_info(dd, "GUID %llx\n",
+                   (unsigned long long)dd->base_guid);
+}
diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
new file mode 100644 (file)
index 0000000..4417a0f
--- /dev/null
@@ -0,0 +1,1950 @@
+#ifndef _HFI1_KERNEL_H
+#define _HFI1_KERNEL_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+#include <linux/io.h>
+#include <linux/fs.h>
+#include <linux/completion.h>
+#include <linux/kref.h>
+#include <linux/sched.h>
+#include <linux/cdev.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <rdma/rdma_vt.h>
+
+#include "chip_registers.h"
+#include "common.h"
+#include "verbs.h"
+#include "pio.h"
+#include "chip.h"
+#include "mad.h"
+#include "qsfp.h"
+#include "platform.h"
+#include "affinity.h"
+
+/* bumped 1 from s/w major version of TrueScale */
+#define HFI1_CHIP_VERS_MAJ 3U
+
+/* don't care about this except printing */
+#define HFI1_CHIP_VERS_MIN 0U
+
+/* The Organization Unique Identifier (Mfg code), and its position in GUID */
+#define HFI1_OUI 0x001175
+#define HFI1_OUI_LSB 40
+
+/* NOTE(review): presumably off/on values for a packet-drop setting; usage is not visible here */
+#define DROP_PACKET_OFF                0
+#define DROP_PACKET_ON         1
+
+/*
+ * Capability mask accessors.  @cap is the suffix of an HFI1_CAP_<cap>
+ * constant (token-pasted below).
+ *  - *_KGET_MASK tests the bit directly in @mask.
+ *  - *_UGET_MASK first shifts @mask right by HFI1_CAP_USER_SHIFT.
+ *  - *_KGET / *_UGET apply the above to the global hfi1_cap_mask.
+ *  - *_IS_KSET / *_IS_USET normalize the result to 0 or 1.
+ */
+extern unsigned long hfi1_cap_mask;
+#define HFI1_CAP_KGET_MASK(mask, cap) ((mask) & HFI1_CAP_##cap)
+#define HFI1_CAP_UGET_MASK(mask, cap) \
+       (((mask) >> HFI1_CAP_USER_SHIFT) & HFI1_CAP_##cap)
+#define HFI1_CAP_KGET(cap) (HFI1_CAP_KGET_MASK(hfi1_cap_mask, cap))
+#define HFI1_CAP_UGET(cap) (HFI1_CAP_UGET_MASK(hfi1_cap_mask, cap))
+#define HFI1_CAP_IS_KSET(cap) (!!HFI1_CAP_KGET(cap))
+#define HFI1_CAP_IS_USET(cap) (!!HFI1_CAP_UGET(cap))
+/* Bits of hfi1_cap_mask at HFI1_CAP_MISC_SHIFT, limited to HFI1_CAP_MISC_MASK */
+#define HFI1_MISC_GET() ((hfi1_cap_mask >> HFI1_CAP_MISC_SHIFT) & \
+                       HFI1_CAP_MISC_MASK)
+/* Offline Disabled Reason is 4-bits */
+#define HFI1_ODR_MASK(rsn) ((rsn) & OPA_PI_MASK_OFFLINE_REASON)
+
+/*
+ * Control context is always 0 and handles the error packets.
+ * It also handles the VL15 and multicast packets.
+ */
+#define HFI1_CTRL_CTXT    0
+
+/*
+ * Driver context will store software counters for each of the events
+ * associated with these status registers.
+ *
+ * Each constant is the number of software counters kept for the error
+ * status register it is named after.
+ * NOTE(review): presumably one counter per defined status bit; confirm
+ * against the register definitions before changing a value.
+ */
+#define NUM_CCE_ERR_STATUS_COUNTERS 41
+#define NUM_RCV_ERR_STATUS_COUNTERS 64
+#define NUM_MISC_ERR_STATUS_COUNTERS 13
+#define NUM_SEND_PIO_ERR_STATUS_COUNTERS 36
+#define NUM_SEND_DMA_ERR_STATUS_COUNTERS 4
+#define NUM_SEND_EGRESS_ERR_STATUS_COUNTERS 64
+#define NUM_SEND_ERR_STATUS_COUNTERS 3
+#define NUM_SEND_CTXT_ERR_STATUS_COUNTERS 5
+#define NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS 24
+
+/*
+ * Per-driver stats: either neither device- nor port-specific, or
+ * summed over all of the devices and ports.
+ * They are described by name, so layout and number of elements can
+ * change without breaking compatibility.
+ * If members are added or deleted hfi1_statnames[] in debugfs.c must
+ * change to match.
+ * NOTE(review): this comment used to say the names are exported "via
+ * ipathfs filesystem"; given the debugfs.c reference that looks stale --
+ * confirm which interface exports these.
+ */
+struct hfi1_ib_stats {
+       __u64 sps_ints; /* number of interrupts handled */
+       __u64 sps_errints; /* number of error interrupts */
+       __u64 sps_txerrs; /* tx-related packet errors */
+       __u64 sps_rcverrs; /* non-crc rcv packet errors */
+       __u64 sps_hwerrs; /* hardware errors reported (parity, etc.) */
+       __u64 sps_nopiobufs; /* no pio bufs avail from kernel */
+       __u64 sps_ctxts; /* number of contexts currently open */
+       __u64 sps_lenerrs; /* number of kernel packets where RHF != LRH len */
+       /* NOTE(review): presumably count of "eager buffer full" events -- confirm */
+       __u64 sps_buffull;
+       /* NOTE(review): presumably count of "receive header queue full" events -- confirm */
+       __u64 sps_hdrfull;
+};
+
+/* Driver-wide statistics instance and PCI error handler table, defined elsewhere */
+extern struct hfi1_ib_stats hfi1_stats;
+extern const struct pci_error_handlers hfi1_pci_err_handler;
+
+/*
+ * First-cut criterion for "device is active" is
+ * two thousand dwords combined Tx, Rx traffic per
+ * 5-second interval. SMA packets are 64 dwords,
+ * and occur "a few per second", presumably each way.
+ */
+#define HFI1_TRAFFIC_ACTIVE_THRESHOLD (2000)
+
+/*
+ * Below contains all data related to a single context (formerly called port).
+ */
+
+#ifdef CONFIG_DEBUG_FS
+struct hfi1_opcode_stats_perctx;
+#endif
+
+/*
+ * Bookkeeping for the eager receive buffers of one context: overall sizes
+ * and counts, plus two arrays pairing a kernel address with a DMA address.
+ */
+struct ctxt_eager_bufs {
+       ssize_t size;            /* total size of eager buffers */
+       u32 count;               /* size of buffers array */
+       u32 numbufs;             /* number of buffers allocated */
+       u32 alloced;             /* number of rcvarray entries used */
+       u32 rcvtid_size;         /* size of each eager rcv tid */
+       u32 threshold;           /* head update threshold */
+       /* one entry per allocated buffer: kernel address, DMA address, length */
+       struct eager_buffer {
+               void *addr;
+               dma_addr_t phys;
+               ssize_t len;
+       } *buffers;
+       /*
+        * address pairs without a length
+        * NOTE(review): presumably one entry per eager rcv tid, each
+        * rcvtid_size bytes -- confirm against the allocation code
+        */
+       struct {
+               void *addr;
+               dma_addr_t phys;
+       } *rcvtids;
+};
+
+/*
+ * A list head paired with a counter.
+ * NOTE(review): presumably count is the number of entries currently on
+ * list -- confirm against the code that adds/removes entries.
+ */
+struct exp_tid_set {
+       struct list_head list;
+       u32 count;
+};
+
+/*
+ * Per-context state: receive header queue and eager buffer bookkeeping,
+ * expected TID lists, PIO buffer assignment, sub-context sharing data,
+ * user SDMA queues, ASPM tracking and the receive interrupt handler.
+ */
+struct hfi1_ctxtdata {
+       /* shadow the ctxt's RcvCtrl register */
+       u64 rcvctrl;
+       /* rcvhdrq base, needs mmap before useful */
+       void *rcvhdrq;
+       /* kernel virtual address where hdrqtail is updated */
+       volatile __le64 *rcvhdrtail_kvaddr;
+       /*
+        * Shared page for kernel to signal user processes that send buffers
+        * need disarming.  The process should call HFI1_CMD_DISARM_BUFS
+        * or HFI1_CMD_ACK_EVENT with IPATH_EVENT_DISARM_BUFS set.
+        */
+       unsigned long *user_event_mask;
+       /* when waiting for rcv or pioavail */
+       wait_queue_head_t wait;
+       /* rcvhdrq size (for freeing) */
+       size_t rcvhdrq_size;
+       /* number of rcvhdrq entries */
+       u16 rcvhdrq_cnt;
+       /* size of each of the rcvhdrq entries */
+       u16 rcvhdrqentsize;
+       /* mmap of hdrq, must fit in 44 bits */
+       dma_addr_t rcvhdrq_phys;
+       dma_addr_t rcvhdrqtailaddr_phys;
+       struct ctxt_eager_bufs egrbufs;
+       /* this receive context's assigned PIO ACK send context */
+       struct send_context *sc;
+
+       /* dynamic receive available interrupt timeout */
+       u32 rcvavail_timeout;
+       /*
+        * number of opens (including slave sub-contexts) on this instance
+        * (ignoring forks, dup, etc. for now)
+        */
+       int cnt;
+       /*
+        * context number, stored here instead of calculating it
+        * NOTE(review): an older comment here described per-TID space
+        * reserved for protocol use; no such field exists in this struct.
+        */
+       unsigned ctxt;
+       /* non-zero if ctxt is being shared. */
+       u16 subctxt_cnt;
+       /*
+        * NOTE(review): the old comment duplicated the one on subctxt_cnt;
+        * presumably this is an identifier used for sub-context sharing --
+        * confirm against the code that assigns it.
+        */
+       u16 subctxt_id;
+       u8 uuid[16];
+       /* job key */
+       u16 jkey;
+       /* number of RcvArray groups for this context. */
+       u32 rcv_array_groups;
+       /* index of first eager TID entry. */
+       u32 eager_base;
+       /* number of expected TID entries */
+       u32 expected_count;
+       /* index of first expected TID entry. */
+       u32 expected_base;
+
+       struct exp_tid_set tid_group_list;
+       struct exp_tid_set tid_used_list;
+       struct exp_tid_set tid_full_list;
+
+       /* lock protecting all Expected TID data */
+       struct mutex exp_lock;
+       /* number of pio bufs for this ctxt (all procs, if shared) */
+       u32 piocnt;
+       /* first pio buffer for this ctxt */
+       u32 pio_base;
+       /* chip offset of PIO buffers for this ctxt */
+       u32 piobufs;
+       /* per-context configuration flags */
+       u32 flags;
+       /* per-context event flags for fileops/intr communication */
+       unsigned long event_flags;
+       /* WAIT_RCV that timed out, no interrupt */
+       u32 rcvwait_to;
+       /* WAIT_PIO that timed out, no interrupt */
+       u32 piowait_to;
+       /* WAIT_RCV already happened, no wait */
+       u32 rcvnowait;
+       /* WAIT_PIO already happened, no wait */
+       u32 pionowait;
+       /* total number of polled urgent packets */
+       u32 urgent;
+       /* saved total number of polled urgent packets for poll edge trigger */
+       u32 urgent_poll;
+       /* pid of process using this ctxt */
+       pid_t pid;
+       pid_t subpid[HFI1_MAX_SHARED_CTXTS];
+       /* same size as task_struct .comm[], command that opened context */
+       char comm[TASK_COMM_LEN];
+       /* so file ops can get at unit */
+       struct hfi1_devdata *dd;
+       /* so functions that need physical port can get it easily */
+       struct hfi1_pportdata *ppd;
+       /* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */
+       void *subctxt_uregbase;
+       /* An array of pages for the eager receive buffers * N */
+       void *subctxt_rcvegrbuf;
+       /* An array of pages for the eager header queue entries * N */
+       void *subctxt_rcvhdr_base;
+       /* The version of the library which opened this ctxt */
+       u32 userversion;
+       /* Bitmask of active slaves */
+       u32 active_slaves;
+       /* Type of packets or conditions we want to poll for */
+       u16 poll_type;
+       /* receive packet sequence counter */
+       u8 seq_cnt;
+       u8 redirect_seq_cnt;
+       /* ctxt rcvhdrq head offset */
+       u32 head;
+       u32 pkt_count;
+       /* QPs waiting for context processing */
+       struct list_head qp_wait_list;
+       /* interrupt handling */
+       u64 imask;      /* clear interrupt mask */
+       int ireg;       /* clear interrupt register */
+       unsigned numa_id; /* numa node of this context */
+       /* verbs stats per CTX */
+       struct hfi1_opcode_stats_perctx *opstats;
+       /*
+        * This is the kernel thread that will keep making
+        * progress on the user sdma requests behind the scenes.
+        * There is one per context (shared contexts use the master's).
+        */
+       struct task_struct *progress;
+       struct list_head sdma_queues;
+       /* protect sdma queues */
+       spinlock_t sdma_qlock;
+
+       /* Is ASPM interrupt supported for this context */
+       bool aspm_intr_supported;
+       /* ASPM state (enabled/disabled) for this context */
+       bool aspm_enabled;
+       /* Timer for re-enabling ASPM if interrupt activity quietens down */
+       struct timer_list aspm_timer;
+       /* Lock to serialize between intr, timer intr and user threads */
+       spinlock_t aspm_lock;
+       /* Is ASPM processing enabled for this context (in intr context) */
+       bool aspm_intr_enable;
+       /* Last interrupt timestamp */
+       ktime_t aspm_ts_last_intr;
+       /* Last timestamp at which we scheduled a timer for this context */
+       ktime_t aspm_ts_timer_sched;
+
+       /*
+        * The interrupt handler for a particular receive context can vary
+        * throughout its lifetime. This is not a lock protected data member so
+        * it must be updated atomically and the prev and new value must always
+        * be valid. Worst case is we process an extra interrupt and up to 64
+        * packets with the wrong interrupt handler.
+        */
+       int (*do_interrupt)(struct hfi1_ctxtdata *rcd, int threaded);
+};
+
+/*
+ * Represents a single packet at a high level. Put commonly computed things in
+ * here so we do not have to keep doing them over and over. The rule of thumb is
+ * if something is used one time to derive some value, store that something in
+ * here. If it is used multiple times, then store the result of that derivation
+ * in here.
+ *
+ * NOTE(review): the per-field notes below are inferred from names and types
+ * only; confirm against the receive path before relying on them.
+ */
+struct hfi1_packet {
+       void *ebuf;     /* presumably the eager buffer holding the payload */
+       void *hdr;      /* presumably the start of the packet header */
+       struct hfi1_ctxtdata *rcd;      /* context this packet is tied to */
+       __le32 *rhf_addr;       /* presumably where the RHF sits in the header queue */
+       struct rvt_qp *qp;
+       struct hfi1_other_headers *ohdr;
+       u64 rhf;        /* value handed to rhf_dc_info() by has_sc4_bit() */
+       u32 maxcnt;
+       u32 rhqoff;
+       u32 hdrqtail;
+       int numpkt;
+       u16 tlen;
+       u16 hlen;
+       s16 etail;
+       u16 rsize;
+       u8 updegr;
+       u8 rcv_flags;
+       u8 etype;
+};
+
+/* True when rhf_dc_info() reports a non-zero value for the packet's RHF. */
+static inline bool has_sc4_bit(struct hfi1_packet *p)
+{
+       return rhf_dc_info(p->rhf) != 0;
+}
+
+/*
+ * Private data for snoop/capture support.
+ */
+struct hfi1_snoop_data {
+       /* one of the HFI1_PORT_*_MODE values defined below (presumably; confirm) */
+       int mode_flag;
+       struct cdev cdev;
+       struct device *class_dev;
+       /* protect snoop data */
+       spinlock_t snoop_lock;
+       struct list_head queue;
+       wait_queue_head_t waitq;
+       /* NOTE(review): presumably passed as the 'value' argument of filter_callback -- confirm */
+       void *filter_value;
+       int (*filter_callback)(void *hdr, void *data, void *value);
+       u64 dcc_cfg; /* saved value of DCC Cfg register */
+};
+
+/* snoop mode_flag values */
+#define HFI1_PORT_SNOOP_MODE     1U
+#define HFI1_PORT_CAPTURE_MODE   2U
+
+struct rvt_sge_state;
+
+/*
+ * Get/Set IB link-level config parameters for f_get/set_ib_cfg()
+ * Mostly for MADs that set or query link parameters, also ipath
+ * config interfaces
+ *
+ * Note that value 18 is not assigned in this list.
+ */
+#define HFI1_IB_CFG_LIDLMC 0 /* LID (LS16b) and Mask (MS16b) */
+#define HFI1_IB_CFG_LWID_DG_ENB 1 /* allowed Link-width downgrade */
+#define HFI1_IB_CFG_LWID_ENB 2 /* allowed Link-width */
+#define HFI1_IB_CFG_LWID 3 /* currently active Link-width */
+#define HFI1_IB_CFG_SPD_ENB 4 /* allowed Link speeds */
+#define HFI1_IB_CFG_SPD 5 /* current Link spd */
+#define HFI1_IB_CFG_RXPOL_ENB 6 /* Auto-RX-polarity enable */
+#define HFI1_IB_CFG_LREV_ENB 7 /* Auto-Lane-reversal enable */
+#define HFI1_IB_CFG_LINKLATENCY 8 /* Link Latency (IB1.2 only) */
+#define HFI1_IB_CFG_HRTBT 9 /* IB heartbeat off/enable/auto; DDR/QDR only */
+#define HFI1_IB_CFG_OP_VLS 10 /* operational VLs */
+#define HFI1_IB_CFG_VL_HIGH_CAP 11 /* num of VL high priority weights */
+#define HFI1_IB_CFG_VL_LOW_CAP 12 /* num of VL low priority weights */
+#define HFI1_IB_CFG_OVERRUN_THRESH 13 /* IB overrun threshold */
+#define HFI1_IB_CFG_PHYERR_THRESH 14 /* IB PHY error threshold */
+#define HFI1_IB_CFG_LINKDEFAULT 15 /* IB link default (sleep/poll) */
+#define HFI1_IB_CFG_PKEYS 16 /* update partition keys */
+#define HFI1_IB_CFG_MTU 17 /* update MTU in IBC */
+/* NOTE(review): presumably the VL high-priority limit; undocumented here -- confirm */
+#define HFI1_IB_CFG_VL_HIGH_LIMIT 19
+#define HFI1_IB_CFG_PMA_TICKS 20 /* PMA sample tick resolution */
+#define HFI1_IB_CFG_PORT 21 /* switch port we are connected to */
+
+/*
+ * HFI or Host Link States
+ *
+ * These describe the states the driver thinks the logical and physical
+ * states are in.  Used as an argument to set_link_state().  Implemented
+ * as bits for easy multi-state checking.  The actual state can only be
+ * one.
+ *
+ * The __HLS_*_BP values are bit positions; the HLS_* values are the
+ * corresponding single-bit masks built with BIT().
+ */
+#define __HLS_UP_INIT_BP       0
+#define __HLS_UP_ARMED_BP      1
+#define __HLS_UP_ACTIVE_BP     2
+#define __HLS_DN_DOWNDEF_BP    3       /* link down default */
+#define __HLS_DN_POLL_BP       4
+#define __HLS_DN_DISABLE_BP    5
+#define __HLS_DN_OFFLINE_BP    6
+#define __HLS_VERIFY_CAP_BP    7
+#define __HLS_GOING_UP_BP      8
+#define __HLS_GOING_OFFLINE_BP  9
+#define __HLS_LINK_COOLDOWN_BP 10
+
+#define HLS_UP_INIT      BIT(__HLS_UP_INIT_BP)
+#define HLS_UP_ARMED     BIT(__HLS_UP_ARMED_BP)
+#define HLS_UP_ACTIVE    BIT(__HLS_UP_ACTIVE_BP)
+#define HLS_DN_DOWNDEF   BIT(__HLS_DN_DOWNDEF_BP) /* link down default */
+#define HLS_DN_POLL      BIT(__HLS_DN_POLL_BP)
+#define HLS_DN_DISABLE   BIT(__HLS_DN_DISABLE_BP)
+#define HLS_DN_OFFLINE   BIT(__HLS_DN_OFFLINE_BP)
+#define HLS_VERIFY_CAP   BIT(__HLS_VERIFY_CAP_BP)
+#define HLS_GOING_UP     BIT(__HLS_GOING_UP_BP)
+#define HLS_GOING_OFFLINE BIT(__HLS_GOING_OFFLINE_BP)
+#define HLS_LINK_COOLDOWN BIT(__HLS_LINK_COOLDOWN_BP)
+
+/* any of the three "up" states */
+#define HLS_UP (HLS_UP_INIT | HLS_UP_ARMED | HLS_UP_ACTIVE)
+/*
+ * every bit that is not an "up" state; the expansion is fully
+ * parenthesized so it is safe in any expression context
+ */
+#define HLS_DOWN (~(HLS_UP))
+
+/* use this MTU size if none other is given */
+#define HFI1_DEFAULT_ACTIVE_MTU 10240
+/* use this MTU size as the default maximum */
+#define HFI1_DEFAULT_MAX_MTU 10240
+/* default partition key */
+#define DEFAULT_PKEY 0xffff
+
+/*
+ * Possible fabric manager config parameters for fm_{get,set}_table()
+ */
+#define FM_TBL_VL_HIGH_ARB             1 /* Get/set VL high prio weights */
+#define FM_TBL_VL_LOW_ARB              2 /* Get/set VL low prio weights */
+#define FM_TBL_BUFFER_CONTROL          3 /* Get/set Buffer Control */
+#define FM_TBL_SC2VLNT                 4 /* Get/set SC->VLnt */
+#define FM_TBL_VL_PREEMPT_ELEMS                5 /* Get (no set) VL preempt elems */
+#define FM_TBL_VL_PREEMPT_MATRIX       6 /* Get (no set) VL preempt matrix */
+
+/*
+ * Possible "operations" for f_rcvctrl(ppd, op, ctxt)
+ * these are bits so they can be combined, e.g.
+ * HFI1_RCVCTRL_INTRAVAIL_ENB | HFI1_RCVCTRL_CTXT_ENB
+ *
+ * Each feature has an _ENB bit and a _DIS bit next to it.  Bits 0x100 and
+ * 0x200 are not assigned in this list.
+ */
+#define HFI1_RCVCTRL_TAILUPD_ENB 0x01
+#define HFI1_RCVCTRL_TAILUPD_DIS 0x02
+#define HFI1_RCVCTRL_CTXT_ENB 0x04
+#define HFI1_RCVCTRL_CTXT_DIS 0x08
+#define HFI1_RCVCTRL_INTRAVAIL_ENB 0x10
+#define HFI1_RCVCTRL_INTRAVAIL_DIS 0x20
+#define HFI1_RCVCTRL_PKEY_ENB 0x40  /* Note, default is enabled */
+#define HFI1_RCVCTRL_PKEY_DIS 0x80
+#define HFI1_RCVCTRL_TIDFLOW_ENB 0x0400
+#define HFI1_RCVCTRL_TIDFLOW_DIS 0x0800
+#define HFI1_RCVCTRL_ONE_PKT_EGR_ENB 0x1000
+#define HFI1_RCVCTRL_ONE_PKT_EGR_DIS 0x2000
+#define HFI1_RCVCTRL_NO_RHQ_DROP_ENB 0x4000
+#define HFI1_RCVCTRL_NO_RHQ_DROP_DIS 0x8000
+#define HFI1_RCVCTRL_NO_EGR_DROP_ENB 0x10000
+#define HFI1_RCVCTRL_NO_EGR_DROP_DIS 0x20000
+
+/* partition enforcement flags */
+#define HFI1_PART_ENFORCE_IN   0x1
+#define HFI1_PART_ENFORCE_OUT  0x2
+
+/*
+ * how often we check for synthetic counter wrap around
+ * NOTE(review): units are not stated here (presumably seconds) -- confirm
+ */
+#define SYNTH_CNT_TIME 2
+
+/* Counter flags */
+#define CNTR_NORMAL            0x0 /* Normal counters, just read register */
+#define CNTR_SYNTH             0x1 /* Synthetic counters, saturate at all 1s */
+#define CNTR_DISABLED          0x2 /* Disable this counter */
+#define CNTR_32BIT             0x4 /* Simulate 64 bits for this counter */
+#define CNTR_VL                        0x8 /* Per VL counter */
+/* NOTE(review): presumably marks a per-SDMA-engine counter -- confirm */
+#define CNTR_SDMA              0x10
+#define CNTR_INVALID_VL                -1  /* Specifies invalid VL */
+/* NOTE(review): presumably counter access modes (write / read) -- confirm */
+#define CNTR_MODE_W            0x0
+#define CNTR_MODE_R            0x1
+
+/* VLs Supported/Operational */
+#define HFI1_MIN_VLS_SUPPORTED 1
+#define HFI1_MAX_VLS_SUPPORTED 8
+
+/* Add one to a 64-bit counter, sticking at all ones instead of wrapping. */
+static inline void incr_cntr64(u64 *cntr)
+{
+       const u64 saturated = ~(u64)0;
+
+       if (*cntr == saturated)
+               return;
+
+       *cntr += 1;
+}
+
+/* Add one to a 32-bit counter; once it holds all ones it is left alone. */
+static inline void incr_cntr32(u32 *cntr)
+{
+       if (*cntr == ~(u32)0)
+               return;
+
+       *cntr += 1;
+}
+
+/* size of the name buffers (hfi1_msix_entry.name, hfi1_devdata.intx_name) */
+#define MAX_NAME_SIZE 64
+/* Driver bookkeeping kept alongside one struct msix_entry. */
+struct hfi1_msix_entry {
+       enum irq_type type;
+       struct msix_entry msix;
+       /* NOTE(review): presumably the cookie given to the handler - confirm */
+       void *arg;
+       char name[MAX_NAME_SIZE];
+       /* NOTE(review): presumably the CPU affinity of the vector - confirm */
+       cpumask_t mask;
+};
+
+/*
+ * per-SL CCA information
+ * One of these exists per SL in hfi1_pportdata.cca_timer[]; the ccti member
+ * is protected by hfi1_pportdata.cca_timer_lock.
+ */
+struct cca_timer {
+       struct hrtimer hrtimer;
+       struct hfi1_pportdata *ppd; /* read-only */
+       int sl; /* read-only */
+       u16 ccti; /* read/write - current value of CCTI */
+};
+
+/* Link down reason: the most recent value and the value exposed to the SMA. */
+struct link_down_reason {
+       /*
+        * SMA-facing value.  Should be set from .latest when
+        * HLS_UP_* -> HLS_DN_* transition actually occurs.
+        */
+       u8 sma;
+       /* most recently recorded reason; source of .sma (see above) */
+       u8 latest;
+};
+
+/*
+ * Indices into hfi1_pportdata.vl_arb_cache[]; MAX_PRIO_TABLE is the
+ * number of tables and sizes that array.
+ */
+enum {
+       LO_PRIO_TABLE,
+       HI_PRIO_TABLE,
+       MAX_PRIO_TABLE
+};
+
+/* A lock-protected copy of one VL arbitration table. */
+struct vl_arb_cache {
+       /* protect vl arb cache */
+       spinlock_t lock;
+       struct ib_vl_weight_elem table[VL_ARB_TABLE_SIZE];
+};
+
+/*
+ * The structure below encapsulates data relevant to a physical IB Port.
+ * Current chips support only one such port, but the separation
+ * clarifies things a bit. Note that to conform to IB conventions,
+ * port-numbers are one-based. The first or only port is port1.
+ *
+ * Several locks below are marked ____cacheline_aligned_in_smp, so member
+ * order affects the layout; do not reorder members casually.
+ */
+struct hfi1_pportdata {
+       struct hfi1_ibport ibport_data;
+
+       /* back pointer to the owning device */
+       struct hfi1_devdata *dd;
+       /* NOTE(review): presumably per-port sysfs directories - confirm */
+       struct kobject pport_cc_kobj;
+       struct kobject sc2vl_kobj;
+       struct kobject sl2sc_kobj;
+       struct kobject vl2mtu_kobj;
+
+       /* PHY support */
+       u32 port_type;
+       struct qsfp_data qsfp_info;
+
+       /* GUID for this interface, in host order */
+       u64 guid;
+       /* GUID for peer interface, in host order */
+       u64 neighbor_guid;
+
+       /* up or down physical link state */
+       u32 linkup;
+
+       /*
+        * this address is mapped read-only into user processes so they can
+        * get status cheaply, whenever they want.  One qword of status per port
+        */
+       u64 *statusp;
+
+       /* SendDMA related entries */
+
+       struct workqueue_struct *hfi1_wq;
+
+       /* move out of interrupt context */
+       struct work_struct link_vc_work;
+       struct work_struct link_up_work;
+       struct work_struct link_down_work;
+       struct work_struct sma_message_work;
+       struct work_struct freeze_work;
+       struct work_struct link_downgrade_work;
+       struct work_struct link_bounce_work;
+       /* host link state variables */
+       struct mutex hls_lock;
+       u32 host_link_state;
+
+       spinlock_t            sdma_alllock ____cacheline_aligned_in_smp;
+
+       u32 lstate;     /* logical link state */
+
+       /* these are the "32 bit" regs */
+
+       u32 ibmtu; /* The MTU programmed for this unit */
+       /*
+        * Current max size IB packet (in bytes) including IB headers, that
+        * we can send. Changes when ibmtu changes.
+        */
+       u32 ibmaxlen;
+       u32 current_egress_rate; /* units [10^6 bits/sec] */
+       /* LID programmed for this instance */
+       u16 lid;
+       /* list of pkeys programmed; 0 if not set */
+       u16 pkeys[MAX_PKEY_VALUES];
+       u16 link_width_supported;
+       u16 link_width_downgrade_supported;
+       u16 link_speed_supported;
+       u16 link_width_enabled;
+       u16 link_width_downgrade_enabled;
+       u16 link_speed_enabled;
+       u16 link_width_active;
+       u16 link_width_downgrade_tx_active;
+       u16 link_width_downgrade_rx_active;
+       u16 link_speed_active;
+       u8 vls_supported;
+       u8 vls_operational;
+       u8 actual_vls_operational;
+       /* LID mask control */
+       u8 lmc;
+       /* Rx Polarity inversion (compensate for ~tx on partner) */
+       u8 rx_pol_inv;
+
+       u8 hw_pidx;     /* physical port index */
+       u8 port;        /* IB port number and index into dd->pports - 1 */
+       /* type of neighbor node */
+       u8 neighbor_type;
+       u8 neighbor_normal;
+       u8 neighbor_fm_security; /* 1 if firmware checking is disabled */
+       u8 neighbor_port_number;
+       u8 is_sm_config_started;
+       u8 offline_disabled_reason;
+       u8 is_active_optimize_enabled;
+       u8 driver_link_ready;   /* driver ready for active link */
+       u8 link_enabled;        /* link enabled? */
+       u8 linkinit_reason;
+       u8 local_tx_rate;       /* rate given to 8051 firmware */
+       u8 last_pstate;         /* info only */
+
+       /* placeholders for IB MAD packet settings */
+       u8 overrun_threshold;
+       u8 phy_error_threshold;
+
+       /* Used to override LED behavior for things like maintenance beaconing */
+       /*
+        * Alternates per phase of blink
+        * [0] holds LED off duration, [1] holds LED on duration
+        */
+       unsigned long led_override_vals[2];
+       u8 led_override_phase; /* LSB picks from vals[] */
+       atomic_t led_override_timer_active;
+       /* Used to flash LEDs in override mode */
+       struct timer_list led_override_timer;
+
+       u32 sm_trap_qp;
+       u32 sa_qp;
+
+       /*
+        * cca_timer_lock protects access to the per-SL cca_timer
+        * structures (specifically the ccti member).
+        */
+       spinlock_t cca_timer_lock ____cacheline_aligned_in_smp;
+       struct cca_timer cca_timer[OPA_MAX_SLS];
+
+       /* List of congestion control table entries */
+       struct ib_cc_table_entry_shadow ccti_entries[CC_TABLE_SHADOW_MAX];
+
+       /* congestion entries, each entry corresponding to a SL */
+       struct opa_congestion_setting_entry_shadow
+               congestion_entries[OPA_MAX_SLS];
+
+       /*
+        * cc_state_lock protects (write) access to the per-port
+        * struct cc_state.
+        */
+       spinlock_t cc_state_lock ____cacheline_aligned_in_smp;
+
+       struct cc_state __rcu *cc_state;
+
+       /* Total number of congestion control table entries */
+       u16 total_cct_entry;
+
+       /* Bit map identifying service level */
+       u32 cc_sl_control_map;
+
+       /* CA's max number of 64 entry units in the congestion control table */
+       u8 cc_max_table_entries;
+
+       /*
+        * begin congestion log related entries
+        * cc_log_lock protects all congestion log related data
+        */
+       spinlock_t cc_log_lock ____cacheline_aligned_in_smp;
+       u8 threshold_cong_event_map[OPA_MAX_SLS / 8];
+       u16 threshold_event_counter;
+       struct opa_hfi1_cong_log_event_internal cc_events[OPA_CONG_LOG_ELEMS];
+       int cc_log_idx; /* index for logging events */
+       int cc_mad_idx; /* index for reporting events */
+       /* end congestion log related entries */
+
+       /* one cached table per priority (LO_PRIO_TABLE, HI_PRIO_TABLE) */
+       struct vl_arb_cache vl_arb_cache[MAX_PRIO_TABLE];
+
+       /* port relative counter buffer */
+       u64 *cntrs;
+       /* port relative synthetic counter buffer */
+       u64 *scntrs;
+       /* port_xmit_discards are synthesized from different egress errors */
+       u64 port_xmit_discards;
+       u64 port_xmit_discards_vl[C_VL_COUNT];
+       u64 port_xmit_constraint_errors;
+       u64 port_rcv_constraint_errors;
+       /* count of 'link_err' interrupts from DC */
+       u64 link_downed;
+       /* number of times link retrained successfully */
+       u64 link_up;
+       /* number of times a link unknown frame was reported */
+       u64 unknown_frame_count;
+       /* port_ltp_crc_mode is returned in 'portinfo' MADs */
+       u16 port_ltp_crc_mode;
+       /* port_crc_mode_enabled is the crc we support */
+       u8 port_crc_mode_enabled;
+       /* mgmt_allowed is also returned in 'portinfo' MADs */
+       u8 mgmt_allowed;
+       u8 part_enforce; /* partition enforcement flags */
+       struct link_down_reason local_link_down_reason;
+       struct link_down_reason neigh_link_down_reason;
+       /* Value to be sent to link peer on LinkDown. */
+       u8 remote_link_down_reason;
+       /* Error events that will cause a port bounce. */
+       u32 port_error_action;
+       struct work_struct linkstate_active_work;
+       /* Does this port need to prescan for FECNs */
+       bool cc_prescan;
+};
+
+/* RHF receive handler; returns one of the RHF_RCV_* values below */
+typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet);
+
+/* per-opcode packet handler; no return value */
+typedef void (*opcode_handler)(struct hfi1_packet *packet);
+
+/* return values for the RHF receive functions */
+#define RHF_RCV_CONTINUE  0    /* keep going */
+#define RHF_RCV_DONE     1     /* stop, this packet processed */
+#define RHF_RCV_REPROCESS 2    /* stop. retain this packet */
+
+/*
+ * Receive array sizing data (stored in hfi1_devdata.rcv_entries).
+ * NOTE(review): field meanings below are inferred from names - confirm.
+ */
+struct rcv_array_data {
+       u8 group_size;   /* presumably entries per group */
+       u16 ngroups;     /* presumably number of groups */
+       u16 nctxt_extra; /* presumably contexts that get an extra group */
+};
+
+/* Per-VL data: an MTU and a send context pointer (see hfi1_devdata.vld[]). */
+struct per_vl_data {
+       u16 mtu;
+       struct send_context *sc;
+};
+
+/* 16 to directly index */
+#define PER_VL_SEND_CONTEXTS 16
+
+/*
+ * Saved error info for a receive port error
+ * (hfi1_devdata.err_info_rcvport).
+ * NOTE(review): presumably the first two flits of the offending packet
+ * are kept in packet_flit1/2 - confirm.
+ */
+struct err_info_rcvport {
+       u8 status_and_code;
+       u64 packet_flit1;
+       u64 packet_flit2;
+};
+
+/*
+ * Saved error info for a constraint error; hfi1_devdata keeps one for
+ * receive (err_info_rcv_constraint) and one for transmit
+ * (err_info_xmit_constraint).
+ */
+struct err_info_constraint {
+       u8 status; /* OPA_EI_STATUS_SMASK is set once pkey/slid are recorded */
+       u16 pkey;
+       u32 slid;
+};
+
+/*
+ * Temperature reading together with its limits.
+ * NOTE(review): the temperature unit is not stated here - confirm.
+ */
+struct hfi1_temp {
+       unsigned int curr;       /* current temperature */
+       unsigned int lo_lim;     /* low temperature limit */
+       unsigned int hi_lim;     /* high temperature limit */
+       unsigned int crit_lim;   /* critical temperature limit */
+       u8 triggers;      /* temperature triggers */
+};
+
+/*
+ * common data between shared ASIC HFIs
+ * Reached through hfi1_devdata.asic_data; holds room for two devices.
+ */
+struct hfi1_asic_data {
+       struct hfi1_devdata *dds[2];    /* back pointers */
+       struct mutex asic_resource_mutex;
+};
+
+/* device data struct now contains only "general per-device" info.
+ * fields related to a physical IB port are in a hfi1_pportdata struct.
+ */
+struct sdma_engine;
+struct sdma_vl_map;
+
+#define BOARD_VERS_MAX 96 /* how long the version string can be */
+#define SERIAL_MAX 16 /* length of the serial number */
+
+/* send handler type; see process_pio_send / process_dma_send in devdata */
+typedef int (*send_routine)(struct rvt_qp *, struct hfi1_pkt_state *, u64);
+/*
+ * Per-device state.  verbs_dev must stay the first member (see below);
+ * per-port state is reached through the pport pointer.
+ */
+struct hfi1_devdata {
+       struct hfi1_ibdev verbs_dev;     /* must be first */
+       struct list_head list;
+       /* pointers to related structs for this device */
+       /* pci access data structure */
+       struct pci_dev *pcidev;
+       /* character devices (user, diag, ui) and their struct device pointers */
+       struct cdev user_cdev;
+       struct cdev diag_cdev;
+       struct cdev ui_cdev;
+       struct device *user_device;
+       struct device *diag_device;
+       struct device *ui_device;
+
+       /* mem-mapped pointer to base of chip regs */
+       u8 __iomem *kregbase;
+       /* end of mem-mapped chip space excluding sendbuf and user regs */
+       u8 __iomem *kregend;
+       /* physical address of chip for io_remap, etc. */
+       resource_size_t physaddr;
+       /* receive context data */
+       struct hfi1_ctxtdata **rcd;
+       /* send context data */
+       struct send_context_info *send_contexts;
+       /* map hardware send contexts to software index */
+       u8 *hw_to_sw;
+       /* spinlock for allocating and releasing send context resources */
+       spinlock_t sc_lock;
+       /* Per VL data. Enough for all VLs but not all elements are set/used. */
+       struct per_vl_data vld[PER_VL_SEND_CONTEXTS];
+       /* lock for pio_map */
+       spinlock_t pio_map_lock;
+       /* array of kernel send contexts */
+       struct send_context **kernel_send_context;
+       /* array of vl maps */
+       struct pio_vl_map __rcu *pio_map;
+       /* seqlock for sc2vl */
+       seqlock_t sc2vl_lock;
+       /* SC to VL table; sc_to_vlt() reads it one byte per SC */
+       u64 sc2vl[4];
+       /* Send Context initialization lock. */
+       spinlock_t sc_init_lock;
+
+       /* fields common to all SDMA engines */
+
+       /* default flags to last descriptor */
+       u64 default_desc1;
+       volatile __le64                    *sdma_heads_dma; /* DMA'ed by chip */
+       dma_addr_t                          sdma_heads_phys;
+       void                               *sdma_pad_dma; /* DMA'ed by chip */
+       dma_addr_t                          sdma_pad_phys;
+       /* for deallocation */
+       size_t                              sdma_heads_size;
+       /* number from the chip */
+       u32                                 chip_sdma_engines;
+       /* num used */
+       u32                                 num_sdma;
+       /* lock for sdma_map */
+       spinlock_t                          sde_map_lock;
+       /* array of engines sized by num_sdma */
+       struct sdma_engine                 *per_sdma;
+       /* array of vl maps */
+       struct sdma_vl_map __rcu           *sdma_map;
+       /* SPC freeze waitqueue and variable */
+       wait_queue_head_t                 sdma_unfreeze_wq;
+       atomic_t                          sdma_unfreeze_count;
+
+       /* common data between shared ASIC HFIs in this OS */
+       struct hfi1_asic_data *asic_data;
+
+       /* hfi1_pportdata, points to array of (physical) port-specific
+        * data structs, indexed by pidx (0..n-1)
+        */
+       struct hfi1_pportdata *pport;
+
+       /* mem-mapped pointer to base of PIO buffers */
+       void __iomem *piobase;
+       /*
+        * write-combining mem-mapped pointer to base of RcvArray
+        * memory.
+        */
+       void __iomem *rcvarray_wc;
+       /*
+        * credit return base - a per-NUMA range of DMA address that
+        * the chip will use to update the per-context free counter
+        */
+       struct credit_return_base *cr_base;
+
+       /* send context numbers and sizes for each type */
+       struct sc_config_sizes sc_sizes[SC_MAX];
+
+       u32 lcb_access_count;           /* count of LCB users */
+
+       char *boardname; /* human readable board info */
+
+       /* device (not port) flags, basically device capabilities */
+       u32 flags;
+
+       /* reset value */
+       u64 z_int_counter;
+       u64 z_rcv_limit;
+       u64 z_send_schedule;
+       /* percpu int_counter */
+       u64 __percpu *int_counter;
+       u64 __percpu *rcv_limit;
+       u64 __percpu *send_schedule;
+       /* number of receive contexts in use by the driver */
+       u32 num_rcv_contexts;
+       /* number of pio send contexts in use by the driver */
+       u32 num_send_contexts;
+       /*
+        * number of ctxts available for PSM open
+        */
+       u32 freectxts;
+       /* total number of available user/PSM contexts */
+       u32 num_user_contexts;
+       /* base receive interrupt timeout, in CSR units */
+       u32 rcv_intr_timeout_csr;
+
+       u64 __iomem *egrtidbase;
+       spinlock_t sendctrl_lock; /* protect changes to SendCtrl */
+       spinlock_t rcvctrl_lock; /* protect changes to RcvCtrl */
+       /* around rcd and (user ctxts) ctxt_cnt use (intr vs free) */
+       spinlock_t uctxt_lock; /* rcd and user context changes */
+       /* exclusive access to 8051 */
+       spinlock_t dc8051_lock;
+       /* exclusive access to 8051 memory */
+       spinlock_t dc8051_memlock;
+       int dc8051_timed_out;   /* remember if the 8051 timed out */
+       /*
+        * A page that will hold event notification bitmaps for all
+        * contexts. This page will be mapped into all processes.
+        */
+       unsigned long *events;
+       /*
+        * per unit status, see also portdata statusp
+        * mapped read-only into user processes so they can get unit and
+        * IB link status cheaply
+        */
+       struct hfi1_status *status;
+       u32 freezelen; /* max length of freezemsg */
+
+       /* revision register shadow */
+       u64 revision;
+       /* Base GUID for device (network order) */
+       u64 base_guid;
+
+       /* these are the "32 bit" regs */
+
+       /* value we put in kr_rcvhdrsize */
+       u32 rcvhdrsize;
+       /* number of receive contexts the chip supports */
+       u32 chip_rcv_contexts;
+       /* number of receive array entries */
+       u32 chip_rcv_array_count;
+       /* number of PIO send contexts the chip supports */
+       u32 chip_send_contexts;
+       /* number of bytes in the PIO memory buffer */
+       u32 chip_pio_mem_size;
+       /* number of bytes in the SDMA memory buffer */
+       u32 chip_sdma_mem_size;
+
+       /* size of each rcvegrbuffer */
+       u32 rcvegrbufsize;
+       /* log2 of above */
+       u16 rcvegrbufsize_shift;
+       /* both sides of the PCIe link are gen3 capable */
+       u8 link_gen3_capable;
+       /* localbus width (1, 2, 4, 8, 16, 32) from config space */
+       u32 lbus_width;
+       /* localbus speed in MHz */
+       u32 lbus_speed;
+       int unit; /* unit # of this chip */
+       int node; /* home node of this chip */
+
+       /* save these PCI fields to restore after a reset */
+       u32 pcibar0;
+       u32 pcibar1;
+       u32 pci_rom;
+       u16 pci_command;
+       u16 pcie_devctl;
+       u16 pcie_lnkctl;
+       u16 pcie_devctl2;
+       u32 pci_msix0;
+       u32 pci_lnkctl3;
+       u32 pci_tph2;
+
+       /*
+        * ASCII serial number, from flash, large enough for original
+        * all digit strings, and longer serial number format
+        */
+       u8 serial[SERIAL_MAX];
+       /* human readable board version */
+       u8 boardversion[BOARD_VERS_MAX];
+       u8 lbus_info[32]; /* human readable localbus info */
+       /* chip major rev, from CceRevision */
+       u8 majrev;
+       /* chip minor rev, from CceRevision */
+       u8 minrev;
+       /* hardware ID */
+       u8 hfi1_id;
+       /* implementation code */
+       u8 icode;
+       /* default link down value (poll/sleep) */
+       u8 link_default;
+       /* vAU of this device */
+       u8 vau;
+       /* vCU of this device */
+       u8 vcu;
+       /* link credits of this device */
+       u16 link_credits;
+       /* initial vl15 credits to use */
+       u16 vl15_init;
+
+       /* Misc small ints */
+       /* Number of physical ports available */
+       u8 num_pports;
+       /* Lowest context number which can be used by user processes */
+       u8 first_user_ctxt;
+       u8 n_krcv_queues;
+       u8 qos_shift;
+       u8 qpn_mask;
+
+       u16 rhf_offset; /* offset of RHF within receive header entry */
+       u16 irev;       /* implementation revision */
+       u16 dc8051_ver; /* 8051 firmware version */
+
+       struct platform_config platform_config;
+       struct platform_config_cache pcfg_cache;
+
+       struct diag_client *diag_client;
+       spinlock_t hfi1_diag_trans_lock; /* protect diag observer ops */
+
+       u8 psxmitwait_supported;
+       /* cycle length of PS* counters in HW (in picoseconds) */
+       u16 psxmitwait_check_rate;
+       /* high volume overflow errors deferred to tasklet */
+       struct tasklet_struct error_tasklet;
+
+       /* MSI-X information */
+       struct hfi1_msix_entry *msix_entries;
+       u32 num_msix_entries;
+
+       /* INTx information */
+       u32 requested_intx_irq;         /* did we request one? */
+       char intx_name[MAX_NAME_SIZE];  /* INTx name */
+
+       /* general interrupt: mask of handled interrupts */
+       u64 gi_mask[CCE_NUM_INT_CSRS];
+
+       struct rcv_array_data rcv_entries;
+
+       /*
+        * 64 bit synthetic counters
+        */
+       struct timer_list synth_stats_timer;
+
+       /*
+        * device counters
+        */
+       char *cntrnames;
+       size_t cntrnameslen;
+       size_t ndevcntrs;
+       u64 *cntrs;
+       u64 *scntrs;
+
+       /*
+        * remembered values for synthetic counters
+        */
+       u64 last_tx;
+       u64 last_rx;
+
+       /*
+        * per-port counters
+        */
+       size_t nportcntrs;
+       char *portcntrnames;
+       size_t portcntrnameslen;
+
+       struct hfi1_snoop_data hfi1_snoop;
+
+       /* saved error info; see struct err_info_rcvport/err_info_constraint */
+       struct err_info_rcvport err_info_rcvport;
+       struct err_info_constraint err_info_rcv_constraint;
+       struct err_info_constraint err_info_xmit_constraint;
+       u8 err_info_uncorrectable;
+       u8 err_info_fmconfig;
+
+       atomic_t drop_packet;
+       u8 do_drop;
+
+       /*
+        * Software counters for the status bits defined by the
+        * associated error status registers
+        */
+       u64 cce_err_status_cnt[NUM_CCE_ERR_STATUS_COUNTERS];
+       u64 rcv_err_status_cnt[NUM_RCV_ERR_STATUS_COUNTERS];
+       u64 misc_err_status_cnt[NUM_MISC_ERR_STATUS_COUNTERS];
+       u64 send_pio_err_status_cnt[NUM_SEND_PIO_ERR_STATUS_COUNTERS];
+       u64 send_dma_err_status_cnt[NUM_SEND_DMA_ERR_STATUS_COUNTERS];
+       u64 send_egress_err_status_cnt[NUM_SEND_EGRESS_ERR_STATUS_COUNTERS];
+       u64 send_err_status_cnt[NUM_SEND_ERR_STATUS_COUNTERS];
+
+       /* Software counter that spans all contexts */
+       u64 sw_ctxt_err_status_cnt[NUM_SEND_CTXT_ERR_STATUS_COUNTERS];
+       /* Software counter that spans all DMA engines */
+       u64 sw_send_dma_eng_err_status_cnt[
+               NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS];
+       /* Software counter that aggregates all cce_err_status errors */
+       u64 sw_cce_err_status_aggregate;
+
+       /* receive interrupt functions */
+       rhf_rcv_function_ptr *rhf_rcv_function_map;
+       rhf_rcv_function_ptr normal_rhf_rcv_functions[8];
+
+       /*
+        * Handlers for outgoing data so that snoop/capture does not
+        * have to have its hooks in the send path
+        */
+       send_routine process_pio_send;
+       send_routine process_dma_send;
+       void (*pio_inline_send)(struct hfi1_devdata *dd, struct pio_buf *pbuf,
+                               u64 pbc, const void *from, size_t count);
+
+       /* OUI comes from the HW. Used everywhere as 3 separate bytes. */
+       u8 oui1;
+       u8 oui2;
+       u8 oui3;
+       /* Timer and counter used to detect RcvBufOvflCnt changes */
+       struct timer_list rcverr_timer;
+       u32 rcv_ovfl_cnt;
+
+       wait_queue_head_t event_queue;
+
+       /* Save the enabled LCB error bits */
+       u64 lcb_err_en;
+       u8 dc_shutdown;
+
+       /* receive context tail dummy address */
+       __le64 *rcvhdrtail_dummy_kvaddr;
+       dma_addr_t rcvhdrtail_dummy_physaddr;
+
+       bool eprom_available;   /* true if EPROM is available for this device */
+       bool aspm_supported;    /* Does HW support ASPM */
+       bool aspm_enabled;      /* ASPM state: enabled/disabled */
+       /* Serialize ASPM enable/disable between multiple verbs contexts */
+       spinlock_t aspm_lock;
+       /* Number of verbs contexts which have disabled ASPM */
+       atomic_t aspm_disabled_cnt;
+
+       struct hfi1_affinity *affinity;
+       struct kobject kobj;
+};
+
+/*
+ * 8051 firmware version helper: packs 'a' into bits 8 and up and ORs 'b'
+ * into the low bits.  Being function-like, the macro does not expand for
+ * the hfi1_devdata.dc8051_ver member, which is never followed by '('.
+ */
+#define dc8051_ver(a, b) ((a) << 8 | (b))
+
+/* f_put_tid types */
+#define PT_EXPECTED 0
+#define PT_EAGER    1
+#define PT_INVALID  2
+
+/* forward declarations; only pointers to these are needed here */
+struct tid_rb_node;
+struct mmu_rb_node;
+
+/* Private data for file operations */
+struct hfi1_filedata {
+       struct hfi1_ctxtdata *uctxt;
+       unsigned subctxt;
+       struct hfi1_user_sdma_comp_q *cq;
+       struct hfi1_user_sdma_pkt_q *pq;
+       /* for cpu affinity; -1 if none */
+       int rec_cpu_num;
+       u32 tid_n_pinned;
+       struct rb_root tid_rb_root;
+       struct tid_rb_node **entry_to_rb;
+       spinlock_t tid_lock; /* protect tid_[limit,used] counters */
+       u32 tid_limit;
+       u32 tid_used;
+       u32 *invalid_tids;
+       u32 invalid_tid_idx;
+       /* protect invalid_tids array and invalid_tid_idx */
+       spinlock_t invalid_lock;
+};
+
+/* driver-wide state and device lookup; all defined outside this header */
+extern struct list_head hfi1_dev_list;
+extern spinlock_t hfi1_devs_lock;
+struct hfi1_devdata *hfi1_lookup(int unit);
+extern u32 hfi1_cpulist_count;
+extern unsigned long *hfi1_cpulist;
+
+/* snoop/capture controls */
+extern unsigned int snoop_drop_send;
+extern unsigned int snoop_force_capture;
+int hfi1_init(struct hfi1_devdata *, int);
+int hfi1_count_units(int *npresentp, int *nupp);
+int hfi1_count_active_units(void);
+
+int hfi1_diag_add(struct hfi1_devdata *);
+void hfi1_diag_remove(struct hfi1_devdata *);
+void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup);
+
+void handle_user_interrupt(struct hfi1_ctxtdata *rcd);
+
+/* receive context setup and teardown */
+int hfi1_create_rcvhdrq(struct hfi1_devdata *, struct hfi1_ctxtdata *);
+int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *);
+int hfi1_create_ctxts(struct hfi1_devdata *dd);
+struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *, u32, int);
+void hfi1_init_pportdata(struct pci_dev *, struct hfi1_pportdata *,
+                        struct hfi1_devdata *, u8, u8);
+void hfi1_free_ctxtdata(struct hfi1_devdata *, struct hfi1_ctxtdata *);
+
+/*
+ * receive interrupt handlers
+ * NOTE(review): presumably these return the RCV_PKT_* dispositions
+ * listed below - confirm against the definitions.
+ */
+int handle_receive_interrupt(struct hfi1_ctxtdata *, int);
+int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *, int);
+int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *, int);
+void set_all_slowpath(struct hfi1_devdata *dd);
+
+/* receive packet handler dispositions */
+#define RCV_PKT_OK      0x0 /* keep going */
+#define RCV_PKT_LIMIT   0x1 /* stop, hit limit, start thread */
+#define RCV_PKT_DONE    0x2 /* stop, no more packets detected */
+
+/*
+ * Locate the RHF of the entry at the current head of the receive header
+ * queue.  Both rcd->head and dd->rhf_offset are applied as counts of
+ * 32-bit words from the start of rcvhdrq.
+ */
+static inline __le32 *get_rhf_addr(struct hfi1_ctxtdata *rcd)
+{
+       __le32 *hdrq = (__le32 *)rcd->rcvhdrq;
+       __le32 *entry = hdrq + rcd->head;
+
+       return entry + rcd->dd->rhf_offset;
+}
+
+/* NOTE(review): the int argument is presumably the unit number - confirm */
+int hfi1_reset_device(int);
+
+/*
+ * Report the logical OPA port state as the driver last recorded it.
+ * Only the cached ppd->lstate is consulted.
+ */
+static inline u32 driver_lstate(struct hfi1_pportdata *ppd)
+{
+       u32 cached_state = ppd->lstate;
+
+       return cached_state;
+}
+
+/* work item handler (takes the struct work_struct it was queued with) */
+void receive_interrupt_work(struct work_struct *work);
+
+/*
+ * Build the 5-bit service channel of a received packet: the low four bits
+ * are the top nibble of lrh[0], and bit 4 is set when the RHF has
+ * RHF_DC_INFO_SMASK set.
+ */
+static inline int hdr2sc(struct hfi1_message_header *hdr, u64 rhf)
+{
+       int sc_low = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+       int sc_high = (rhf & RHF_DC_INFO_SMASK) ? 1 : 0;
+
+       return sc_low | (sc_high << 4);
+}
+
+/*
+ * Derive a job key from a kuid: the uid as seen in the caller's user
+ * namespace, truncated to its low 16 bits.
+ */
+static inline u16 generate_jkey(kuid_t uid)
+{
+       u16 jkey = from_kuid(current_user_ns(), uid) & 0xffff;
+
+       return jkey;
+}
+
+/*
+ * active_egress_rate
+ *
+ * Returns the active egress rate in units of [10^6 bits/sec], computed as
+ * the per-lane rate for the active link speed times the number of lanes
+ * for the active link width.
+ */
+static inline u32 active_egress_rate(struct hfi1_pportdata *ppd)
+{
+       u32 lane_rate;
+       u32 lanes;
+
+       /* any speed other than 25G is treated as 12.5G */
+       lane_rate = (ppd->link_speed_active == OPA_LINK_SPEED_25G) ?
+                       25000 : 12500;
+
+       switch (ppd->link_width_active) {
+       case OPA_LINK_WIDTH_4X:
+               lanes = 4;
+               break;
+       case OPA_LINK_WIDTH_3X:
+               lanes = 3;
+               break;
+       case OPA_LINK_WIDTH_2X:
+               lanes = 2;
+               break;
+       default:
+               /* any other width is treated as 1X */
+               lanes = 1;
+               break;
+       }
+
+       return lane_rate * lanes;
+}
+
+/*
+ * egress_cycles
+ *
+ * Returns the number of 'fabric clock cycles' to egress a packet
+ * of length 'len' bytes, at 'rate' Mbit/s. Since the fabric clock
+ * rate is (approximately) 805 MHz, the units of the returned value
+ * are (1/805 MHz).
+ *
+ * Derivation:
+ *
+ *   time   = (len * 8) [bits] / (rate * 10^6) [bits/sec]
+ *   cycles = time * (805 * 10^6) [cycles/sec]
+ *          = len * 8 * 805 / rate
+ *
+ * All arithmetic is done in 32 bits and 'rate' is used as a divisor, so
+ * it must not be zero.
+ */
+static inline u32 egress_cycles(u32 len, u32 rate)
+{
+       u32 bits = len * 8;
+
+       return bits * 805 / rate;
+}
+
+/* link and congestion handling; implemented outside this header */
+void set_link_ipg(struct hfi1_pportdata *ppd);
+void process_becn(struct hfi1_pportdata *ppd, u8 sl,  u16 rlid, u32 lqpn,
+                 u32 rqpn, u8 svc_type);
+void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn,
+               u32 pkey, u32 slid, u32 dlid, u8 sc5,
+               const struct ib_grh *old_grh);
+/*
+ * NOTE(review): presumably passed as s_pkey_index when no index hint is
+ * available - confirm against egress_pkey_check()
+ */
+#define PKEY_CHECK_INVALID -1
+int egress_pkey_check(struct hfi1_pportdata *ppd, __be16 *lrh, __be32 *bth,
+                     u8 sc5, int8_t s_pkey_index);
+
+/* cclock count handed to cclock_to_ns() in pause_for_credit_return() */
+#define PACKET_EGRESS_TIMEOUT 350
+/*
+ * Busy-wait for PACKET_EGRESS_TIMEOUT cclocks converted to whole
+ * microseconds, but never less than 1us, to ensure the chip returns
+ * all credits.
+ */
+static inline void pause_for_credit_return(struct hfi1_devdata *dd)
+{
+       u32 usec = cclock_to_ns(dd, PACKET_EGRESS_TIMEOUT) / 1000;
+
+       /* the conversion may round down to zero; still wait 1us */
+       if (!usec)
+               usec = 1;
+
+       udelay(usec);
+}
+
+/**
+ * sc_to_vlt() - reverse lookup sc to vl
+ * @dd: devdata
+ * @sc5: 5 bit sc
+ *
+ * The dd->sc2vl table is read as an array of bytes, one per SC, under the
+ * sc2vl seqlock; the read is repeated if a writer intervened.
+ *
+ * Return: the table byte for @sc5, or 0xff when @sc5 >= OPA_MAX_SCS.
+ */
+static inline u8 sc_to_vlt(struct hfi1_devdata *dd, u8 sc5)
+{
+       const u8 *sc2vl_bytes = (const u8 *)dd->sc2vl;
+       unsigned start;
+       u8 vl;
+
+       if (sc5 >= OPA_MAX_SCS)
+               return (u8)(0xff);
+
+       do {
+               start = read_seqbegin(&dd->sc2vl_lock);
+               vl = sc2vl_bytes[sc5];
+       } while (read_seqretry(&dd->sc2vl_lock, start));
+
+       return vl;
+}
+
+#define PKEY_MEMBER_MASK 0x8000        /* bit 15: membership bit */
+#define PKEY_LOW_15_MASK 0x7fff        /* bits 0-14: the key itself */
+
+/*
+ * ingress_pkey_matches_entry - return 1 if the pkey matches ent (ent
+ * being an entry from the ingress partition key table), return 0
+ * otherwise. Use the matching criteria for ingress partition keys
+ * specified in the OPAv1 spec., section 9.10.14.
+ *
+ * As implemented here:
+ *  - the low 15 bits of pkey and ent must be equal;
+ *  - a pkey with bit 15 set then matches regardless of ent's bit 15;
+ *  - a pkey with bit 15 clear (limited partition member) matches only
+ *    when ent has bit 15 set.
+ */
+static inline int ingress_pkey_matches_entry(u16 pkey, u16 ent)
+{
+       if ((pkey & PKEY_LOW_15_MASK) != (ent & PKEY_LOW_15_MASK))
+               return 0;
+
+       if (pkey & PKEY_MEMBER_MASK)
+               return 1;
+
+       /* limited-member pkey: the table entry must have bit 15 set */
+       return !!(ent & PKEY_MEMBER_MASK);
+}
+
+/*
+ * ingress_pkey_table_search - search the entire pkey table for
+ * an entry which matches 'pkey'. return 0 if a match is found,
+ * and 1 otherwise.
+ *
+ * Matching is delegated to ingress_pkey_matches_entry() for each of the
+ * MAX_PKEY_VALUES entries of ppd->pkeys[].
+ *
+ * Declared inline, like the other helpers defined in this header, so
+ * that every file including the header does not get its own out-of-line
+ * copy.
+ */
+static inline int ingress_pkey_table_search(struct hfi1_pportdata *ppd,
+                                           u16 pkey)
+{
+       int i;
+
+       for (i = 0; i < MAX_PKEY_VALUES; i++) {
+               if (ingress_pkey_matches_entry(pkey, ppd->pkeys[i]))
+                       return 0;
+       }
+       return 1;
+}
+
+/*
+ * ingress_pkey_table_fail - record a failure of ingress pkey validation,
+ * i.e., increment port_rcv_constraint_errors for the port, and record
+ * the 'error info' for this failure.
+ *
+ * The counter saturates (see incr_cntr64()).  The error info (slid, pkey)
+ * is captured only when OPA_EI_STATUS_SMASK is not already set in the
+ * saved status, so an earlier unacknowledged failure is not overwritten.
+ *
+ * Declared inline, like the other helpers defined in this header, so
+ * that every file including the header does not get its own out-of-line
+ * copy.
+ */
+static inline void ingress_pkey_table_fail(struct hfi1_pportdata *ppd,
+                                          u16 pkey, u16 slid)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+
+       incr_cntr64(&ppd->port_rcv_constraint_errors);
+       if (!(dd->err_info_rcv_constraint.status & OPA_EI_STATUS_SMASK)) {
+               dd->err_info_rcv_constraint.status |= OPA_EI_STATUS_SMASK;
+               dd->err_info_rcv_constraint.slid = slid;
+               dd->err_info_rcv_constraint.pkey = pkey;
+       }
+}
+
+/*
+ * ingress_pkey_check - Return 0 if the ingress pkey is valid, return 1
+ * otherwise. Use the criteria in the OPAv1 spec, section 9.10.14. idx
+ * is a hint as to the best place in the partition key table to begin
+ * searching. This function should not be called on the data path because
+ * of performance reasons. On datapath pkey check is expected to be done
+ * by HW and rcv_pkey_check function should be called instead.
+ *
+ * Nothing is checked (0 is returned) unless HFI1_PART_ENFORCE_IN is set
+ * in ppd->part_enforce.  Every failure is recorded through
+ * ingress_pkey_table_fail().
+ *
+ * idx is only a hint: a value outside the pkey table is ignored and the
+ * whole table is searched instead of indexing past the end of
+ * ppd->pkeys[].
+ */
+static inline int ingress_pkey_check(struct hfi1_pportdata *ppd, u16 pkey,
+                                    u8 sc5, u8 idx, u16 slid)
+{
+       if (!(ppd->part_enforce & HFI1_PART_ENFORCE_IN))
+               return 0;
+
+       /* If SC15, pkey[0:14] must be 0x7fff */
+       if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
+               goto bad;
+
+       /* Is the pkey = 0x0, or 0x8000? */
+       if ((pkey & PKEY_LOW_15_MASK) == 0)
+               goto bad;
+
+       /* The most likely matching pkey has index 'idx' */
+       if (idx < MAX_PKEY_VALUES &&
+           ingress_pkey_matches_entry(pkey, ppd->pkeys[idx]))
+               return 0;
+
+       /* no match - try the whole table */
+       if (!ingress_pkey_table_search(ppd, pkey))
+               return 0;
+
+bad:
+       ingress_pkey_table_fail(ppd, pkey, slid);
+       return 1;
+}
+
+/*
+ * rcv_pkey_check - Return 0 if the ingress pkey is valid, return 1
+ * otherwise. It only ensures pkey is valid for QP0. This function
+ * should be called on the data path instead of ingress_pkey_check
+ * as on data path, pkey check is done by HW (except for QP0).
+ *
+ * The only software check made is the SC15 rule: when ingress
+ * enforcement is on and sc5 is 0xf, pkey[0:14] must be 0x7fff.
+ * A violation is recorded through ingress_pkey_table_fail().
+ */
+static inline int rcv_pkey_check(struct hfi1_pportdata *ppd, u16 pkey,
+                                u8 sc5, u16 slid)
+{
+       /* nothing to verify unless enforcement is on and this is SC15 */
+       if (!(ppd->part_enforce & HFI1_PART_ENFORCE_IN) || sc5 != 0xf)
+               return 0;
+
+       if ((pkey & PKEY_LOW_15_MASK) == PKEY_LOW_15_MASK)
+               return 0;
+
+       ingress_pkey_table_fail(ppd, pkey, slid);
+       return 1;
+}
+
+/* MTU handling */
+
+/* MTU enumeration, 256-4k match IB */
+#define OPA_MTU_0     0
+#define OPA_MTU_256   1
+#define OPA_MTU_512   2
+#define OPA_MTU_1024  3
+#define OPA_MTU_2048  4
+#define OPA_MTU_4096  5
+
+u32 lrh_max_header_bytes(struct hfi1_devdata *dd);
+int mtu_to_enum(u32 mtu, int default_if_bad);
+u16 enum_to_mtu(int);
+/* Return 1 if mtu is one of 256, 512, 1024, 2048 or 4096, 0 otherwise. */
+static inline int valid_ib_mtu(unsigned int mtu)
+{
+       switch (mtu) {
+       case 256:
+       case 512:
+       case 1024:
+       case 2048:
+       case 4096:
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+/*
+ * Return 1 if mtu is acceptable as a maximum MTU: at least 2048 and
+ * either a valid IB MTU, 8192 or 10240.  Return 0 otherwise.
+ */
+static inline int valid_opa_max_mtu(unsigned int mtu)
+{
+       if (mtu < 2048)
+               return 0;
+
+       return valid_ib_mtu(mtu) || mtu == 8192 || mtu == 10240;
+}
+
+int set_mtu(struct hfi1_pportdata *);
+
+int hfi1_set_lid(struct hfi1_pportdata *, u32, u8);
+void hfi1_disable_after_error(struct hfi1_devdata *);
+int hfi1_set_uevent_bits(struct hfi1_pportdata *, const int);
+int hfi1_rcvbuf_validate(u32, u8, u16 *);
+
+int fm_get_table(struct hfi1_pportdata *, int, void *);
+int fm_set_table(struct hfi1_pportdata *, int, void *);
+
+void set_up_vl15(struct hfi1_devdata *dd, u8 vau, u16 vl15buf);
+void reset_link_credits(struct hfi1_devdata *dd);
+void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu);
+
+int snoop_recv_handler(struct hfi1_packet *packet);
+int snoop_send_dma_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+                          u64 pbc);
+int snoop_send_pio_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+                          u64 pbc);
+void snoop_inline_pio_send(struct hfi1_devdata *dd, struct pio_buf *pbuf,
+                          u64 pbc, const void *from, size_t count);
+int set_buffer_control(struct hfi1_pportdata *ppd, struct buffer_control *bc);
+
+/* Return the device data pointer stored in the port data (ppd->dd). */
+static inline struct hfi1_devdata *dd_from_ppd(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+
+       return dd;
+}
+
+/* Map an hfi1_ibdev to the hfi1_devdata embedding it as verbs_dev. */
+static inline struct hfi1_devdata *dd_from_dev(struct hfi1_ibdev *dev)
+{
+       struct hfi1_devdata *dd;
+
+       dd = container_of(dev, struct hfi1_devdata, verbs_dev);
+       return dd;
+}
+
+/* Map an ib_device to its hfi1_devdata, going through to_idev(). */
+static inline struct hfi1_devdata *dd_from_ibdev(struct ib_device *ibdev)
+{
+       struct hfi1_ibdev *dev = to_idev(ibdev);
+
+       return dd_from_dev(dev);
+}
+
+/* Map an hfi1_ibport to the hfi1_pportdata embedding it as ibport_data. */
+static inline struct hfi1_pportdata *ppd_from_ibp(struct hfi1_ibport *ibp)
+{
+       struct hfi1_pportdata *ppd;
+
+       ppd = container_of(ibp, struct hfi1_pportdata, ibport_data);
+       return ppd;
+}
+
+/* Map an rvt_dev_info to the hfi1_ibdev embedding it as rdi. */
+static inline struct hfi1_ibdev *dev_from_rdi(struct rvt_dev_info *rdi)
+{
+       struct hfi1_ibdev *dev;
+
+       dev = container_of(rdi, struct hfi1_ibdev, rdi);
+       return dev;
+}
+
+/*
+ * Return the hfi1_ibport for IB port number 'port' of 'ibdev'.
+ *
+ * An out-of-range port only triggers WARN_ON(); the lookup is still
+ * performed with the computed index.
+ */
+static inline struct hfi1_ibport *to_iport(struct ib_device *ibdev, u8 port)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */
+       struct hfi1_pportdata *ppd;
+
+       WARN_ON(pidx >= dd->num_pports);
+
+       ppd = &dd->pport[pidx];
+       return &ppd->ibport_data;
+}
+
+/*
+ * Return the indexed PKEY from the port PKEY table.
+ * An index beyond the end of the table yields 0.
+ */
+static inline u16 hfi1_get_pkey(struct hfi1_ibport *ibp, unsigned index)
+{
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+       if (index >= ARRAY_SIZE(ppd->pkeys))
+               return 0;
+
+       return ppd->pkeys[index];
+}
+
+/*
+ * Fetch ppd->cc_state through rcu_dereference().
+ *
+ * Readers of cc_state must call get_cc_state() under rcu_read_lock().
+ * Writers of cc_state must call get_cc_state() under cc_state_lock.
+ */
+static inline struct cc_state *get_cc_state(struct hfi1_pportdata *ppd)
+{
+       struct cc_state *cc_state = rcu_dereference(ppd->cc_state);
+
+       return cc_state;
+}
+
+/*
+ * values for dd->flags (_device_ related flags)
+ */
+#define HFI1_INITTED           0x1    /* chip and driver up and initted */
+#define HFI1_PRESENT           0x2    /* chip accesses can be done */
+#define HFI1_FROZEN            0x4    /* chip in SPC freeze */
+#define HFI1_HAS_SDMA_TIMEOUT  0x8
+#define HFI1_HAS_SEND_DMA      0x10   /* Supports Send DMA */
+#define HFI1_FORCED_FREEZE     0x80   /* driver forced freeze mode */
+
+/* IB dword length mask in PBC (lower 11 bits); same for all chips */
+#define HFI1_PBC_LENGTH_MASK                     ((1 << 11) - 1)
+
+/* ctxt_flag bit offsets */
+               /* context has been setup */
+#define HFI1_CTXT_SETUP_DONE 1
+               /* waiting for a packet to arrive */
+#define HFI1_CTXT_WAITING_RCV   2
+               /* master has not finished initializing */
+#define HFI1_CTXT_MASTER_UNINIT 4
+               /* waiting for an urgent packet to arrive */
+#define HFI1_CTXT_WAITING_URG 5
+
+/* free up any allocated data at closes */
+struct hfi1_devdata *hfi1_init_dd(struct pci_dev *,
+                                 const struct pci_device_id *);
+void hfi1_free_devdata(struct hfi1_devdata *);
+void cc_state_reclaim(struct rcu_head *rcu);
+struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra);
+
+/* LED beaconing functions */
+void hfi1_start_led_override(struct hfi1_pportdata *ppd, unsigned int timeon,
+                            unsigned int timeoff);
+void shutdown_led_override(struct hfi1_pportdata *ppd);
+
+#define HFI1_CREDIT_RETURN_RATE (100)
+
+/*
+ * The number of words for the KDETH protocol field.  If this is
+ * larger than the actual field used, then part of the payload
+ * will be in the header.
+ *
+ * Optimally, we want this sized so that a typical case will
+ * use full cache lines.  The typical local KDETH header would
+ * be:
+ *
+ *     Bytes   Field
+ *       8     LRH
+ *      12     BTH
+ *      ??     KDETH
+ *       8     RHF
+ *     ---
+ *      28 + KDETH
+ *
+ * For a 64-byte cache line, KDETH would need to be 36 bytes or 9 DWORDS
+ */
+#define DEFAULT_RCVHDRSIZE 9
+
+/*
+ * Maximal header byte count:
+ *
+ *     Bytes   Field
+ *       8     LRH
+ *      40     GRH (optional)
+ *      12     BTH
+ *      ??     KDETH
+ *       8     RHF
+ *     ---
+ *      68 + KDETH
+ *
+ * We also want to maintain a cache line alignment to assist DMA'ing
+ * of the header bytes.  Round up to a good size.
+ */
+#define DEFAULT_RCVHDR_ENTSIZE 32
+
+bool hfi1_can_pin_pages(struct hfi1_devdata *, u32, u32);
+int hfi1_acquire_user_pages(unsigned long, size_t, bool, struct page **);
+void hfi1_release_user_pages(struct mm_struct *, struct page **, size_t, bool);
+
+/*
+ * Zero the 64-bit word that rcd->rcvhdrtail_kvaddr points at.
+ * NOTE(review): the cast presumably strips qualifiers from the field's
+ * pointee type - confirm against the hfi1_ctxtdata declaration.
+ */
+static inline void clear_rcvhdrtail(const struct hfi1_ctxtdata *rcd)
+{
+       u64 *tail = (u64 *)rcd->rcvhdrtail_kvaddr;
+
+       *tail = 0ULL;
+}
+
+/*
+ * Read the little-endian 64-bit word at rcd->rcvhdrtail_kvaddr and
+ * return its low 32 bits in CPU byte order.
+ */
+static inline u32 get_rcvhdrtail(const struct hfi1_ctxtdata *rcd)
+{
+       /*
+        * volatile because it's a DMA target from the chip, routine is
+        * inlined, and don't want register caching or reordering.
+        * NOTE(review): no volatile appears in this function; presumably
+        * the qualifier is on the rcvhdrtail_kvaddr declaration - confirm.
+        */
+       u64 tail = le64_to_cpu(*rcd->rcvhdrtail_kvaddr);
+
+       return (u32)tail;
+}
+
+/*
+ * sysfs interface.
+ */
+
+extern const char ib_hfi1_version[];
+
+int hfi1_device_create(struct hfi1_devdata *);
+void hfi1_device_remove(struct hfi1_devdata *);
+
+int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
+                          struct kobject *kobj);
+int hfi1_verbs_register_sysfs(struct hfi1_devdata *);
+void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *);
+/* Hook for sysfs read of QSFP */
+int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len);
+
+int hfi1_pcie_init(struct pci_dev *, const struct pci_device_id *);
+void hfi1_pcie_cleanup(struct pci_dev *);
+int hfi1_pcie_ddinit(struct hfi1_devdata *, struct pci_dev *,
+                    const struct pci_device_id *);
+void hfi1_pcie_ddcleanup(struct hfi1_devdata *);
+void hfi1_pcie_flr(struct hfi1_devdata *);
+int pcie_speeds(struct hfi1_devdata *);
+void request_msix(struct hfi1_devdata *, u32 *, struct hfi1_msix_entry *);
+void hfi1_enable_intx(struct pci_dev *);
+void restore_pci_variables(struct hfi1_devdata *dd);
+int do_pcie_gen3_transition(struct hfi1_devdata *dd);
+int parse_platform_config(struct hfi1_devdata *dd);
+int get_platform_config_field(struct hfi1_devdata *dd,
+                             enum platform_config_table_type_encoding
+                             table_type, int table_index, int field_index,
+                             u32 *data, u32 len);
+
+const char *get_unit_name(int unit);
+const char *get_card_name(struct rvt_dev_info *rdi);
+struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi);
+
+/*
+ * Flush write combining store buffers (if present) and perform a write
+ * barrier.
+ *
+ * Implemented with a single "sfence" instruction.  The "memory" clobber
+ * additionally keeps the compiler from moving memory accesses across
+ * the asm statement.
+ */
+static inline void flush_wc(void)
+{
+       asm volatile("sfence" : : : "memory");
+}
+
+void handle_eflags(struct hfi1_packet *packet);
+int process_receive_ib(struct hfi1_packet *packet);
+int process_receive_bypass(struct hfi1_packet *packet);
+int process_receive_error(struct hfi1_packet *packet);
+int kdeth_process_expected(struct hfi1_packet *packet);
+int kdeth_process_eager(struct hfi1_packet *packet);
+int process_receive_invalid(struct hfi1_packet *packet);
+
+extern rhf_rcv_function_ptr snoop_rhf_rcv_functions[8];
+
+void update_sge(struct rvt_sge_state *ss, u32 length);
+
+/* global module parameter variables */
+extern unsigned int hfi1_max_mtu;
+extern unsigned int hfi1_cu;
+extern unsigned int user_credit_return_threshold;
+extern int num_user_contexts;
+extern unsigned n_krcvqs;
+extern uint krcvqs[];
+extern int krcvqsset;
+extern uint kdeth_qp;
+extern uint loopback;
+extern uint quick_linkup;
+extern uint rcv_intr_timeout;
+extern uint rcv_intr_count;
+extern uint rcv_intr_dynamic;
+extern ushort link_crc_mask;
+
+extern struct mutex hfi1_mutex;
+
+/* Number of seconds before our card status check...  */
+#define STATUS_TIMEOUT 60
+
+#define DRIVER_NAME            "hfi1"
+#define HFI1_USER_MINOR_BASE     0
+#define HFI1_TRACE_MINOR         127
+#define HFI1_DIAGPKT_MINOR       128
+#define HFI1_DIAG_MINOR_BASE     129
+#define HFI1_SNOOP_CAPTURE_BASE  200
+#define HFI1_NMINORS             255
+
+#define PCI_VENDOR_ID_INTEL 0x8086
+#define PCI_DEVICE_ID_INTEL0 0x24f0
+#define PCI_DEVICE_ID_INTEL1 0x24f1
+
+#define HFI1_PKT_USER_SC_INTEGRITY                                         \
+       (SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK            \
+       | SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK           \
+       | SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK              \
+       | SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK)
+
+#define HFI1_PKT_KERNEL_SC_INTEGRITY                                       \
+       (SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK)
+
+/*
+ * Build the default SEND_CTXT_CHECK_ENABLE mask for a send context.
+ *
+ * Starts from the checks common to all context types, adds
+ * HFI1_PKT_USER_SC_INTEGRITY for SC_USER contexts or
+ * HFI1_PKT_KERNEL_SC_INTEGRITY for every other type, and drops the job
+ * key check when is_ax(dd) is true.
+ */
+static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd,
+                                                 u16 ctxt_type)
+{
+       u64 mask;
+
+       /* checks applied regardless of the context type */
+       mask = SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK
+               | SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK
+               | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK
+               | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK
+               | SEND_CTXT_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK
+               | SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK
+               | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK
+               | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK
+               | SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK
+               | SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_SMASK
+               | SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK
+               | SEND_CTXT_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK
+               | SEND_CTXT_CHECK_ENABLE_CHECK_OPCODE_SMASK
+               | SEND_CTXT_CHECK_ENABLE_CHECK_SLID_SMASK
+               | SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK
+               | SEND_CTXT_CHECK_ENABLE_CHECK_VL_SMASK
+               | SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK;
+
+       /* context type specific additions */
+       if (ctxt_type != SC_USER)
+               mask |= HFI1_PKT_KERNEL_SC_INTEGRITY;
+       else
+               mask |= HFI1_PKT_USER_SC_INTEGRITY;
+
+       /* turn off send-side job key checks - A0 */
+       if (is_ax(dd))
+               mask &= ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
+
+       return mask;
+}
+
+/*
+ * Build the base SEND_DMA_CHECK_ENABLE mask.  The job key check is
+ * dropped when is_ax(dd) is true.
+ */
+static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata *dd)
+{
+       u64 mask;
+
+       mask = SEND_DMA_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK
+               | SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK
+               | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK
+               | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK
+               | SEND_DMA_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK
+               | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK
+               | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK
+               | SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK
+               | SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_SMASK
+               | SEND_DMA_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK
+               | SEND_DMA_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK
+               | SEND_DMA_CHECK_ENABLE_CHECK_OPCODE_SMASK
+               | SEND_DMA_CHECK_ENABLE_CHECK_SLID_SMASK
+               | SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK
+               | SEND_DMA_CHECK_ENABLE_CHECK_VL_SMASK
+               | SEND_DMA_CHECK_ENABLE_CHECK_ENABLE_SMASK;
+
+       /* turn off send-side job key checks - A0 */
+       if (is_ax(dd))
+               mask &= ~SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
+
+       return mask;
+}
+
+/*
+ * hfi1_early_err is used (only!) to print early errors before devdata is
+ * allocated, or when dd->pcidev may not be valid, and at the tail end of
+ * cleanup when devdata may have been freed, etc.  It takes a struct
+ * device directly instead of a devdata.  hfi1_dev_porterr is
+ * the same as dd_dev_err, but is used when the message really needs
+ * the IB port# to be definitive as to what's happening.
+ */
+#define hfi1_early_err(dev, fmt, ...) \
+       dev_err(dev, fmt, ##__VA_ARGS__)
+
+#define hfi1_early_info(dev, fmt, ...) \
+       dev_info(dev, fmt, ##__VA_ARGS__)
+
+/*
+ * The dd_dev_* wrappers log against &dd->pcidev->dev and prefix every
+ * message with the unit name returned by get_unit_name(dd->unit).
+ */
+#define dd_dev_emerg(dd, fmt, ...) \
+       dev_emerg(&(dd)->pcidev->dev, "%s: " fmt, \
+                 get_unit_name((dd)->unit), ##__VA_ARGS__)
+#define dd_dev_err(dd, fmt, ...) \
+       dev_err(&(dd)->pcidev->dev, "%s: " fmt, \
+                       get_unit_name((dd)->unit), ##__VA_ARGS__)
+#define dd_dev_warn(dd, fmt, ...) \
+       dev_warn(&(dd)->pcidev->dev, "%s: " fmt, \
+                       get_unit_name((dd)->unit), ##__VA_ARGS__)
+
+#define dd_dev_warn_ratelimited(dd, fmt, ...) \
+       dev_warn_ratelimited(&(dd)->pcidev->dev, "%s: " fmt, \
+                       get_unit_name((dd)->unit), ##__VA_ARGS__)
+
+#define dd_dev_info(dd, fmt, ...) \
+       dev_info(&(dd)->pcidev->dev, "%s: " fmt, \
+                       get_unit_name((dd)->unit), ##__VA_ARGS__)
+
+#define dd_dev_dbg(dd, fmt, ...) \
+       dev_dbg(&(dd)->pcidev->dev, "%s: " fmt, \
+               get_unit_name((dd)->unit), ##__VA_ARGS__)
+
+/* like dd_dev_err, with an additional "port %u: " prefix */
+#define hfi1_dev_porterr(dd, port, fmt, ...) \
+       dev_err(&(dd)->pcidev->dev, "%s: port %u: " fmt, \
+                       get_unit_name((dd)->unit), (port), ##__VA_ARGS__)
+
+/*
+ * this is used for formatting hw error messages...
+ *
+ * An array of these entries is passed to hfi1_format_hwerrors().
+ */
+struct hfi1_hwerror_msgs {
+       /* presumably the hardware error bit(s) this entry describes - confirm */
+       u64 mask;
+       /* presumably the text emitted when those bits are set - confirm */
+       const char *msg;
+       /* presumably the size of msg - TODO confirm what it counts */
+       size_t sz;
+};
+
+/* in intr.c... */
+void hfi1_format_hwerrors(u64 hwerrs,
+                         const struct hfi1_hwerror_msgs *hwerrmsgs,
+                         size_t nhwerrmsgs, char *msg, size_t lmsg);
+
+#define USER_OPCODE_CHECK_VAL 0xC0
+#define USER_OPCODE_CHECK_MASK 0xC0
+#define OPCODE_CHECK_VAL_DISABLED 0x0
+#define OPCODE_CHECK_MASK_DISABLED 0x0
+
+/*
+ * Snapshot the current per-cpu counter totals into the matching z_*
+ * fields, for the device and then for each of its ports.
+ */
+static inline void hfi1_reset_cpu_counters(struct hfi1_devdata *dd)
+{
+       /* the port data array is taken to start right after *dd */
+       struct hfi1_pportdata *pports = (struct hfi1_pportdata *)(dd + 1);
+       int port;
+
+       dd->z_int_counter = get_all_cpu_total(dd->int_counter);
+       dd->z_rcv_limit = get_all_cpu_total(dd->rcv_limit);
+       dd->z_send_schedule = get_all_cpu_total(dd->send_schedule);
+
+       for (port = 0; port < dd->num_pports; port++) {
+               struct hfi1_ibport *ibp = &pports[port].ibport_data;
+
+               ibp->rvp.z_rc_acks = get_all_cpu_total(ibp->rvp.rc_acks);
+               ibp->rvp.z_rc_qacks = get_all_cpu_total(ibp->rvp.rc_qacks);
+       }
+}
+
+/*
+ * Control LED state: write 0x1F to DCC_CFG_LED_CNTRL when 'on' is
+ * non-zero, 0x10 otherwise.
+ */
+static inline void setextled(struct hfi1_devdata *dd, u32 on)
+{
+       if (!on) {
+               write_csr(dd, DCC_CFG_LED_CNTRL, 0x10);
+               return;
+       }
+
+       write_csr(dd, DCC_CFG_LED_CNTRL, 0x1F);
+}
+
+/*
+ * return the i2c resource given the target: CR_I2C1 for target 0,
+ * CR_I2C2 for any other value
+ */
+static inline u32 i2c_target(u32 target)
+{
+       if (target)
+               return CR_I2C2;
+
+       return CR_I2C1;
+}
+
+/*
+ * return the i2c chain chip resource that this HFI uses for QSFP,
+ * selected by dd->hfi1_id
+ */
+static inline u32 qsfp_resource(struct hfi1_devdata *dd)
+{
+       u32 resource = i2c_target(dd->hfi1_id);
+
+       return resource;
+}
+
+int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp);
+
+#endif                          /* _HFI1_KERNEL_H */
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
new file mode 100644 (file)
index 0000000..5cc492e
--- /dev/null
@@ -0,0 +1,1818 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/delay.h>
+#include <linux/idr.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/hrtimer.h>
+#include <rdma/rdma_vt.h>
+
+#include "hfi.h"
+#include "device.h"
+#include "common.h"
+#include "trace.h"
+#include "mad.h"
+#include "sdma.h"
+#include "debugfs.h"
+#include "verbs.h"
+#include "aspm.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) DRIVER_NAME ": " fmt
+
+/*
+ * min buffers we want to have per context, after driver
+ */
+#define HFI1_MIN_USER_CTXT_BUFCNT 7
+
+#define HFI1_MIN_HDRQ_EGRBUF_CNT 2
+#define HFI1_MAX_HDRQ_EGRBUF_CNT 16352
+#define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */
+#define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */
+
+/*
+ * Number of user receive contexts we are configured to use (to allow for more
+ * pio buffers per ctxt, etc.)  Zero means use one user context per CPU.
+ *
+ * The variable is a signed int (declared "extern int" in hfi.h, default
+ * -1), so it is registered with the matching "int" parameter type; with
+ * "uint" the pointer types disagree and -1 cannot be given on the command
+ * line.
+ * NOTE(review): the default is -1 while the text above documents zero -
+ * confirm how negative values are treated where this is consumed.
+ */
+int num_user_contexts = -1;
+module_param_named(num_user_contexts, num_user_contexts, int, S_IRUGO);
+MODULE_PARM_DESC(
+       num_user_contexts, "Set max number of user contexts to use");
+
+uint krcvqs[RXE_NUM_DATA_VL];
+int krcvqsset;
+module_param_array(krcvqs, uint, &krcvqsset, S_IRUGO);
+MODULE_PARM_DESC(krcvqs, "Array of the number of non-control kernel receive queues by VL");
+
+/* computed based on above array */
+unsigned n_krcvqs;
+
+static unsigned hfi1_rcvarr_split = 25;
+module_param_named(rcvarr_split, hfi1_rcvarr_split, uint, S_IRUGO);
+MODULE_PARM_DESC(rcvarr_split, "Percent of context's RcvArray entries used for Eager buffers");
+
+static uint eager_buffer_size = (2 << 20); /* 2MB */
+module_param(eager_buffer_size, uint, S_IRUGO);
+MODULE_PARM_DESC(eager_buffer_size, "Size of the eager buffers, default: 2MB");
+
+static uint rcvhdrcnt = 2048; /* 2x the max eager buffer count */
+module_param_named(rcvhdrcnt, rcvhdrcnt, uint, S_IRUGO);
+MODULE_PARM_DESC(rcvhdrcnt, "Receive header queue count (default 2048)");
+
+/*
+ * Receive header queue entry size.  The built-in default is 32, so the
+ * parameter description marks that value as the default.
+ */
+static uint hfi1_hdrq_entsize = 32;
+module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, S_IRUGO);
+MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B, 32 - 128B (default)");
+
+unsigned int user_credit_return_threshold = 33;        /* default is 33% */
+module_param(user_credit_return_threshold, uint, S_IRUGO);
+MODULE_PARM_DESC(user_credit_return_threshold, "Credit return threshold for user send contexts, return when unreturned credits passes this many blocks (in percent of allocated blocks, 0 is off)");
+
+static inline u64 encode_rcv_header_entry_size(u16);
+
+static struct idr hfi1_unit_table;
+u32 hfi1_cpulist_count;
+unsigned long *hfi1_cpulist;
+
+/*
+ * Common code for creating the receive context array.
+ *
+ * Allocates dd->rcd and creates one kernel receive context, each with its
+ * own SC_ACK send context, for every index below dd->first_user_ctxt.
+ *
+ * Returns 0 on success.  On failure returns -ENOMEM (allocation failure) or
+ * -EFAULT (hfi1_init_ctxt() failure); every kernel context created so far
+ * is released, the array is freed and dd->rcd is set to NULL.
+ */
+int hfi1_create_ctxts(struct hfi1_devdata *dd)
+{
+       unsigned i;
+       int ret;
+
+       /* Control context has to be always 0 */
+       BUILD_BUG_ON(HFI1_CTRL_CTXT != 0);
+
+       dd->rcd = kzalloc_node(dd->num_rcv_contexts * sizeof(*dd->rcd),
+                              GFP_KERNEL, dd->node);
+       if (!dd->rcd)
+               goto nomem;
+
+       /* create one or more kernel contexts */
+       for (i = 0; i < dd->first_user_ctxt; ++i) {
+               struct hfi1_pportdata *ppd;
+               struct hfi1_ctxtdata *rcd;
+
+               ppd = dd->pport + (i % dd->num_pports);
+               rcd = hfi1_create_ctxtdata(ppd, i, dd->node);
+               if (!rcd) {
+                       dd_dev_err(dd,
+                                  "Unable to allocate kernel receive context, failing\n");
+                       /*
+                        * hfi1_create_ctxtdata() stores the context in
+                        * dd->rcd[] before it can fail and frees it on
+                        * failure; make sure the slot is empty so the
+                        * unwind below does not touch freed memory.
+                        */
+                       dd->rcd[i] = NULL;
+                       goto nomem;
+               }
+               /*
+                * Set up the kernel context flags here and now because they
+                * use default values for all receive side memories.  User
+                * contexts will be handled as they are created.
+                */
+               rcd->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) |
+                       HFI1_CAP_KGET(NODROP_RHQ_FULL) |
+                       HFI1_CAP_KGET(NODROP_EGR_FULL) |
+                       HFI1_CAP_KGET(DMA_RTAIL);
+
+               /* Control context must use DMA_RTAIL */
+               if (rcd->ctxt == HFI1_CTRL_CTXT)
+                       rcd->flags |= HFI1_CAP_DMA_RTAIL;
+               rcd->seq_cnt = 1;
+
+               rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node);
+               if (!rcd->sc) {
+                       dd_dev_err(dd,
+                                  "Unable to allocate kernel send context, failing\n");
+                       dd->rcd[rcd->ctxt] = NULL;
+                       hfi1_free_ctxtdata(dd, rcd);
+                       goto nomem;
+               }
+
+               ret = hfi1_init_ctxt(rcd->sc);
+               if (ret < 0) {
+                       dd_dev_err(dd,
+                                  "Failed to setup kernel receive context, failing\n");
+                       sc_free(rcd->sc);
+                       /*
+                        * Do not leave a pointer to the freed send context
+                        * behind.  NOTE(review): presumably
+                        * hfi1_free_ctxtdata() looks at rcd->sc - confirm;
+                        * it is already called with a NULL rcd->sc in the
+                        * sc_alloc() failure path above.
+                        */
+                       rcd->sc = NULL;
+                       dd->rcd[rcd->ctxt] = NULL;
+                       hfi1_free_ctxtdata(dd, rcd);
+                       ret = -EFAULT;
+                       goto bail;
+               }
+       }
+
+       /*
+        * Initialize aspm, to be done after gen3 transition and setting up
+        * contexts and before enabling interrupts
+        */
+       aspm_init(dd);
+
+       return 0;
+nomem:
+       ret = -ENOMEM;
+bail:
+       /*
+        * Release the kernel contexts created by earlier iterations; the
+        * failing iteration has already cleaned up after itself and cleared
+        * its slot.  Without this they become unreachable once the array
+        * is freed.
+        * NOTE(review): assumes hfi1_free_ctxtdata() also releases the
+        * context's send context (rcd->sc) - confirm.
+        */
+       if (dd->rcd) {
+               for (i = 0; i < dd->first_user_ctxt; ++i) {
+                       struct hfi1_ctxtdata *rcd = dd->rcd[i];
+
+                       if (!rcd)
+                               continue;
+                       dd->rcd[i] = NULL;
+                       hfi1_free_ctxtdata(dd, rcd);
+               }
+       }
+       kfree(dd->rcd);
+       dd->rcd = NULL;
+       return ret;
+}
+
+/*
+ * Common code for user and kernel context setup.
+ *
+ * Allocates the receive context data for context number ctxt on port ppd,
+ * stores it in dd->rcd[ctxt], computes its RcvArray base and sizes its
+ * eager buffer bookkeeping.  numa is recorded in rcd->numa_id.
+ *
+ * Returns the new context data, or NULL on failure.  When setup fails
+ * after the context was stored in dd->rcd[ctxt], the slot is reset to NULL
+ * before the memory is freed so the array never holds a stale pointer.
+ */
+struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt,
+                                          int numa)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       struct hfi1_ctxtdata *rcd;
+       unsigned kctxt_ngroups = 0;
+       u32 base;
+
+       /* extra groups left over once every user context has had one */
+       if (dd->rcv_entries.nctxt_extra >
+           dd->num_rcv_contexts - dd->first_user_ctxt)
+               kctxt_ngroups = (dd->rcv_entries.nctxt_extra -
+                                (dd->num_rcv_contexts - dd->first_user_ctxt));
+       rcd = kzalloc(sizeof(*rcd), GFP_KERNEL);
+       if (rcd) {
+               u32 rcvtids, max_entries;
+
+               hfi1_cdbg(PROC, "setting up context %u\n", ctxt);
+
+               INIT_LIST_HEAD(&rcd->qp_wait_list);
+               rcd->ppd = ppd;
+               rcd->dd = dd;
+               rcd->cnt = 1;
+               rcd->ctxt = ctxt;
+               dd->rcd[ctxt] = rcd;
+               rcd->numa_id = numa;
+               rcd->rcv_array_groups = dd->rcv_entries.ngroups;
+
+               mutex_init(&rcd->exp_lock);
+
+               /*
+                * Calculate the context's RcvArray entry starting point.
+                * We do this here because we have to take into account all
+                * the RcvArray entries that previous context would have
+                * taken and we have to account for any extra groups
+                * assigned to the kernel or user contexts.
+                */
+               if (ctxt < dd->first_user_ctxt) {
+                       if (ctxt < kctxt_ngroups) {
+                               base = ctxt * (dd->rcv_entries.ngroups + 1);
+                               rcd->rcv_array_groups++;
+                       } else
+                               base = kctxt_ngroups +
+                                       (ctxt * dd->rcv_entries.ngroups);
+               } else {
+                       u16 ct = ctxt - dd->first_user_ctxt;
+
+                       base = ((dd->n_krcv_queues * dd->rcv_entries.ngroups) +
+                               kctxt_ngroups);
+                       if (ct < dd->rcv_entries.nctxt_extra) {
+                               base += ct * (dd->rcv_entries.ngroups + 1);
+                               rcd->rcv_array_groups++;
+                       } else
+                               base += dd->rcv_entries.nctxt_extra +
+                                       (ct * dd->rcv_entries.ngroups);
+               }
+               rcd->eager_base = base * dd->rcv_entries.group_size;
+
+               /* Validate and initialize Rcv Hdr Q variables */
+               if (rcvhdrcnt % HDRQ_INCREMENT) {
+                       dd_dev_err(dd,
+                                  "ctxt%u: header queue count %d must be divisible by %lu\n",
+                                  rcd->ctxt, rcvhdrcnt, HDRQ_INCREMENT);
+                       goto bail;
+               }
+               rcd->rcvhdrq_cnt = rcvhdrcnt;
+               rcd->rcvhdrqentsize = hfi1_hdrq_entsize;
+               /*
+                * Simple Eager buffer allocation: we have already pre-allocated
+                * the number of RcvArray entry groups. Each ctxtdata structure
+                * holds the number of groups for that context.
+                *
+                * To follow CSR requirements and maintain cacheline alignment,
+                * make sure all sizes and bases are multiples of group_size.
+                *
+                * The expected entry count is what is left after assigning
+                * eager.
+                */
+               max_entries = rcd->rcv_array_groups *
+                       dd->rcv_entries.group_size;
+               rcvtids = ((max_entries * hfi1_rcvarr_split) / 100);
+               rcd->egrbufs.count = round_down(rcvtids,
+                                               dd->rcv_entries.group_size);
+               if (rcd->egrbufs.count > MAX_EAGER_ENTRIES) {
+                       dd_dev_err(dd, "ctxt%u: requested too many RcvArray entries.\n",
+                                  rcd->ctxt);
+                       rcd->egrbufs.count = MAX_EAGER_ENTRIES;
+               }
+               hfi1_cdbg(PROC,
+                         "ctxt%u: max Eager buffer RcvArray entries: %u\n",
+                         rcd->ctxt, rcd->egrbufs.count);
+
+               /*
+                * Allocate array that will hold the eager buffer accounting
+                * data.
+                * This will allocate the maximum possible buffer count based
+                * on the value of the RcvArray split parameter.
+                * The resulting value will be rounded down to the closest
+                * multiple of dd->rcv_entries.group_size.
+                */
+               rcd->egrbufs.buffers = kcalloc(rcd->egrbufs.count,
+                                              sizeof(*rcd->egrbufs.buffers),
+                                              GFP_KERNEL);
+               if (!rcd->egrbufs.buffers)
+                       goto bail;
+               rcd->egrbufs.rcvtids = kcalloc(rcd->egrbufs.count,
+                                              sizeof(*rcd->egrbufs.rcvtids),
+                                              GFP_KERNEL);
+               if (!rcd->egrbufs.rcvtids)
+                       goto bail;
+               rcd->egrbufs.size = eager_buffer_size;
+               /*
+                * The size of the buffers programmed into the RcvArray
+                * entries needs to be big enough to handle the highest
+                * MTU supported.
+                */
+               if (rcd->egrbufs.size < hfi1_max_mtu) {
+                       rcd->egrbufs.size = __roundup_pow_of_two(hfi1_max_mtu);
+                       hfi1_cdbg(PROC,
+                                 "ctxt%u: eager bufs size too small. Adjusting to %zu\n",
+                                   rcd->ctxt, rcd->egrbufs.size);
+               }
+               rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE;
+
+               if (ctxt < dd->first_user_ctxt) { /* N/A for PSM contexts */
+                       rcd->opstats = kzalloc(sizeof(*rcd->opstats),
+                               GFP_KERNEL);
+                       if (!rcd->opstats)
+                               goto bail;
+               }
+       }
+       return rcd;
+bail:
+       /*
+        * Only reached with rcd allocated and already stored in dd->rcd[].
+        * Clear the slot first so nobody can pick up the context that is
+        * about to be freed.
+        */
+       dd->rcd[ctxt] = NULL;
+       kfree(rcd->egrbufs.rcvtids);
+       kfree(rcd->egrbufs.buffers);
+       kfree(rcd);
+       return NULL;
+}
+
+/*
+ * Convert a receive header entry size to the encoding used in the CSR.
+ *
+ * Sizes 2, 16 and 32 encode as 1, 2 and 4 respectively.
+ * Return a zero if the given size is invalid.
+ */
+static inline u64 encode_rcv_header_entry_size(u16 size)
+{
+       /* there are only 3 valid receive header entry sizes */
+       switch (size) {
+       case 2:
+               return 1;
+       case 16:
+               return 2;
+       case 32:
+               return 4;
+       default:
+               return 0; /* invalid */
+       }
+}
+
+/*
+ * Select the largest ccti value over all SLs to determine the intra-
+ * packet gap for the link, and program the result into the
+ * SEND_STATIC_RATE_CONTROL CSR.
+ *
+ * called with cca_timer_lock held (to protect access to cca_timer
+ * array), and rcu_read_lock() (to protect access to cc_state).
+ */
+void set_link_ipg(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       struct cc_state *cc_state = get_cc_state(ppd);
+       u16 entry, limit, shift, mult;
+       u16 max_ccti = 0;
+       u32 egress_rate; /* Mbits /sec */
+       /*
+        * max_pkt_time is the maximum packet egress time in units
+        * of the fabric clock period 1/(805 MHz).
+        */
+       u32 max_pkt_time;
+       u64 src;
+       int sl;
+
+       /*
+        * This should _never_ happen - rcu_read_lock() is held,
+        * and set_link_ipg() should not be called if cc_state
+        * is NULL.
+        */
+       if (!cc_state)
+               return;
+
+       /* largest ccti over all SLs ... */
+       for (sl = 0; sl < OPA_MAX_SLS; sl++) {
+               u16 ccti = ppd->cca_timer[sl].ccti;
+
+               if (ccti > max_ccti)
+                       max_ccti = ccti;
+       }
+
+       /* ... capped at the table's ccti_limit */
+       limit = cc_state->cct.ccti_limit;
+       if (max_ccti > limit)
+               max_ccti = limit;
+
+       /* table entry: top two bits are the shift, low 14 the multiplier */
+       entry = cc_state->cct.entries[max_ccti].entry;
+       shift = (entry & 0xc000) >> 14;
+       mult = (entry & 0x3fff);
+
+       egress_rate = active_egress_rate(ppd);
+       max_pkt_time = egress_cycles(ppd->ibmaxlen, egress_rate);
+
+       src = (max_pkt_time >> shift) * mult;
+       src &= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK;
+       src <<= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT;
+
+       write_csr(dd, SEND_STATIC_RATE_CONTROL, src);
+}
+
+/*
+ * hrtimer callback for the congestion control timer of one SL.
+ *
+ * If the SL's ccti is above the configured minimum it is decremented and
+ * the link IPG is recomputed.  The timer is re-armed as long as ccti is
+ * still above the minimum; otherwise (or if there is no cc_state) it is
+ * left stopped.
+ */
+static enum hrtimer_restart cca_timer_fn(struct hrtimer *t)
+{
+       struct cca_timer *timer = container_of(t, struct cca_timer, hrtimer);
+       struct hfi1_pportdata *ppd = timer->ppd;
+       int sl = timer->sl;
+       enum hrtimer_restart rearm = HRTIMER_NORESTART;
+       struct cc_state *cc_state;
+       unsigned long flags;
+       u16 floor, period;
+
+       rcu_read_lock();
+
+       cc_state = get_cc_state(ppd);
+       if (!cc_state)
+               goto unlock;
+
+       /*
+        * 1) decrement ccti for SL
+        * 2) calculate IPG for link (set_link_ipg())
+        * 3) restart timer, unless ccti is at min value
+        */
+
+       floor = cc_state->cong_setting.entries[sl].ccti_min;
+       period = cc_state->cong_setting.entries[sl].ccti_timer;
+
+       spin_lock_irqsave(&ppd->cca_timer_lock, flags);
+
+       if (timer->ccti > floor) {
+               timer->ccti--;
+               set_link_ipg(ppd);
+       }
+
+       if (timer->ccti > floor) {
+               /* ccti_timer is in units of 1.024 usec */
+               unsigned long nsec = 1024 * period;
+
+               hrtimer_forward_now(t, ns_to_ktime(nsec));
+               rearm = HRTIMER_RESTART;
+       }
+
+       spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
+unlock:
+       rcu_read_unlock();
+       return rearm;
+}
+
+/*
+ * Common code for initializing the physical port structure.
+ *
+ * @pdev: PCI device, used only for early log messages
+ * @ppd: port data to initialize
+ * @dd: device data the port belongs to
+ * @hw_pidx: hardware port index
+ * @port: IB port number, not index
+ *
+ * Sets up the default pkey, the port's work items, locks and per-SL
+ * congestion control timers, and allocates the initial congestion control
+ * state.  Failure to allocate that state is not fatal: a message is logged
+ * and ppd->cc_state is left NULL.
+ */
+void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
+                        struct hfi1_devdata *dd, u8 hw_pidx, u8 port)
+{
+       int i;
+       uint default_pkey_idx;
+       struct cc_state *cc_state;
+
+       ppd->dd = dd;
+       ppd->hw_pidx = hw_pidx;
+       ppd->port = port; /* IB port number, not index */
+
+       default_pkey_idx = 1;
+
+       ppd->pkeys[default_pkey_idx] = DEFAULT_P_KEY;
+       if (loopback) {
+               hfi1_early_err(&pdev->dev,
+                              "Faking data partition 0x8001 in idx %u\n",
+                              !default_pkey_idx);
+               ppd->pkeys[!default_pkey_idx] = 0x8001;
+       }
+
+       INIT_WORK(&ppd->link_vc_work, handle_verify_cap);
+       INIT_WORK(&ppd->link_up_work, handle_link_up);
+       INIT_WORK(&ppd->link_down_work, handle_link_down);
+       INIT_WORK(&ppd->freeze_work, handle_freeze);
+       INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade);
+       INIT_WORK(&ppd->sma_message_work, handle_sma_message);
+       INIT_WORK(&ppd->link_bounce_work, handle_link_bounce);
+       INIT_WORK(&ppd->linkstate_active_work, receive_interrupt_work);
+       INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event);
+
+       mutex_init(&ppd->hls_lock);
+       spin_lock_init(&ppd->sdma_alllock);
+       spin_lock_init(&ppd->qsfp_info.qsfp_lock);
+
+       ppd->qsfp_info.ppd = ppd;
+       ppd->sm_trap_qp = 0x0;
+       ppd->sa_qp = 0x1;
+
+       ppd->hfi1_wq = NULL;
+
+       spin_lock_init(&ppd->cca_timer_lock);
+
+       for (i = 0; i < OPA_MAX_SLS; i++) {
+               hrtimer_init(&ppd->cca_timer[i].hrtimer, CLOCK_MONOTONIC,
+                            HRTIMER_MODE_REL);
+               ppd->cca_timer[i].ppd = ppd;
+               ppd->cca_timer[i].sl = i;
+               ppd->cca_timer[i].ccti = 0;
+               ppd->cca_timer[i].hrtimer.function = cca_timer_fn;
+       }
+
+       ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT;
+
+       spin_lock_init(&ppd->cc_state_lock);
+       spin_lock_init(&ppd->cc_log_lock);
+
+       /*
+        * Publish the (possibly NULL) allocation and test the local
+        * pointer.  Reading ppd->cc_state back with rcu_dereference()
+        * would be wrong here: no RCU read-side critical section is held.
+        */
+       cc_state = kzalloc(sizeof(*cc_state), GFP_KERNEL);
+       RCU_INIT_POINTER(ppd->cc_state, cc_state);
+       if (!cc_state)
+               hfi1_early_err(&pdev->dev,
+                              "Congestion Control Agent disabled for port %d\n",
+                              port);
+}
+
+/*
+ * Do initialization for device that is only needed on
+ * first detect, not on resets.
+ *
+ * Currently there is nothing to do here, so this always returns 0.
+ */
+static int loadtime_init(struct hfi1_devdata *dd)
+{
+       return 0;
+}
+
+/**
+ * init_after_reset - re-initialize after a reset
+ * @dd: the hfi1_ib device
+ *
+ * sanity check at least some of the values after reset, and
+ * ensure no receive or transmit (explicitly, in case reset
+ * failed)
+ *
+ * Return: always 0.
+ */
+static int init_after_reset(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /*
+        * Ensure chip does no sends or receives, tail updates, or
+        * pioavail updates while we re-initialize.  This is mostly
+        * for the driver data structures, not chip registers.
+        */
+       for (i = 0; i < dd->num_rcv_contexts; i++)
+               hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
+                                 HFI1_RCVCTRL_INTRAVAIL_DIS |
+                                 HFI1_RCVCTRL_TAILUPD_DIS, i);
+       pio_send_control(dd, PSC_GLOBAL_DISABLE);
+       for (i = 0; i < dd->num_send_contexts; i++)
+               sc_disable(dd->send_contexts[i].sc);
+
+       return 0;
+}
+
+/*
+ * Enable PIO send globally, then enable receive, the receive-available
+ * interrupt and the send context of every allocated kernel context.
+ * The remaining receive control bits are derived from each context's
+ * capability flags.
+ */
+static void enable_chip(struct hfi1_devdata *dd)
+{
+       struct hfi1_ctxtdata *rcd;
+       u32 rcvmask;
+       u32 i;
+
+       /* enable PIO send */
+       pio_send_control(dd, PSC_GLOBAL_ENABLE);
+
+       /*
+        * Enable kernel ctxts' receive and receive interrupt.
+        * Other ctxts done as user opens and initializes them.
+        *
+        * hfi1_init() calls this even when it has hit an error, and
+        * dd->rcd (or individual entries) can be NULL if early
+        * initialization failed, so skip anything that was not allocated.
+        */
+       for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) {
+               rcd = dd->rcd[i];
+               if (!rcd)
+                       continue;
+
+               rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB;
+               rcvmask |= HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ?
+                       HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS;
+               if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR))
+                       rcvmask |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
+               if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_RHQ_FULL))
+                       rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
+               if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_EGR_FULL))
+                       rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
+               hfi1_rcvctrl(dd, rcvmask, i);
+               sc_enable(rcd->sc);
+       }
+}
+
+/**
+ * create_workqueues - create per port workqueues
+ * @dd: the hfi1_ib device
+ *
+ * Ports that already have a workqueue are left untouched.  If any
+ * allocation fails, the workqueues of all ports are destroyed again.
+ *
+ * Return: 0 on success, -ENOMEM if a workqueue could not be allocated.
+ */
+static int create_workqueues(struct hfi1_devdata *dd)
+{
+       struct hfi1_pportdata *port;
+       int n;
+
+       for (n = 0; n < dd->num_pports; ++n) {
+               port = &dd->pport[n];
+               if (port->hfi1_wq)
+                       continue;
+
+               port->hfi1_wq = alloc_workqueue("hfi%d_%d",
+                                               WQ_SYSFS | WQ_HIGHPRI |
+                                               WQ_CPU_INTENSIVE,
+                                               dd->num_sdma,
+                                               dd->unit, n);
+               if (!port->hfi1_wq)
+                       goto wq_error;
+       }
+       return 0;
+
+wq_error:
+       pr_err("alloc_workqueue failed for port %d\n", n + 1);
+       /* tear down every workqueue, including pre-existing ones */
+       for (n = 0; n < dd->num_pports; ++n) {
+               port = &dd->pport[n];
+               if (!port->hfi1_wq)
+                       continue;
+               destroy_workqueue(port->hfi1_wq);
+               port->hfi1_wq = NULL;
+       }
+       return -ENOMEM;
+}
+
+/**
+ * hfi1_init - do the actual initialization sequence on the chip
+ * @dd: the hfi1_ib device
+ * @reinit: re-initializing, so don't allocate new memory
+ *
+ * Do the actual initialization sequence on the chip.  This is done
+ * both from the init routine called from the PCI infrastructure, and
+ * when we reset the chip, or detect that it was reset internally,
+ * or it's administratively re-enabled.
+ *
+ * Memory allocation here and in called routines is only done in
+ * the first case (reinit == 0).  We have to be careful, because even
+ * without memory allocation, we need to re-write all the chip registers
+ * TIDs, etc. after the reset or enable has completed.
+ *
+ * NOTE(review): the dummy tail, events and status allocations in the body
+ * are not conditioned on @reinit - confirm against the paragraph above.
+ *
+ * Return: 0 on success; otherwise the error from init_after_reset() or
+ * loadtime_init(), -ENOMEM if the dummy tail allocation fails, or the last
+ * failure reported while setting up a kernel context's rcvhdrq or eager
+ * buffers.  Failures to allocate the events/status pages or to bring up a
+ * port are only logged.
+ */
+int hfi1_init(struct hfi1_devdata *dd, int reinit)
+{
+       int ret = 0, pidx, lastfail = 0;
+       unsigned i, len;
+       struct hfi1_ctxtdata *rcd;
+       struct hfi1_pportdata *ppd;
+
+       /* Set up recv low level handlers */
+       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EXPECTED] =
+                                               kdeth_process_expected;
+       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EAGER] =
+                                               kdeth_process_eager;
+       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_IB] = process_receive_ib;
+       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_ERROR] =
+                                               process_receive_error;
+       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_BYPASS] =
+                                               process_receive_bypass;
+       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID5] =
+                                               process_receive_invalid;
+       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID6] =
+                                               process_receive_invalid;
+       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID7] =
+                                               process_receive_invalid;
+       dd->rhf_rcv_function_map = dd->normal_rhf_rcv_functions;
+
+       /* Set up send low level handlers */
+       dd->process_pio_send = hfi1_verbs_send_pio;
+       dd->process_dma_send = hfi1_verbs_send_dma;
+       dd->pio_inline_send = pio_copy;
+
+       if (is_ax(dd)) {
+               atomic_set(&dd->drop_packet, DROP_PACKET_ON);
+               dd->do_drop = 1;
+       } else {
+               atomic_set(&dd->drop_packet, DROP_PACKET_OFF);
+               dd->do_drop = 0;
+       }
+
+       /* make sure the link is not "up" */
+       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+               ppd = dd->pport + pidx;
+               ppd->linkup = 0;
+       }
+
+       if (reinit)
+               ret = init_after_reset(dd);
+       else
+               ret = loadtime_init(dd);
+       if (ret)
+               goto done;
+
+       /* allocate dummy tail memory for all receive contexts */
+       dd->rcvhdrtail_dummy_kvaddr = dma_zalloc_coherent(
+               &dd->pcidev->dev, sizeof(u64),
+               &dd->rcvhdrtail_dummy_physaddr,
+               GFP_KERNEL);
+
+       if (!dd->rcvhdrtail_dummy_kvaddr) {
+               dd_dev_err(dd, "cannot allocate dummy tail memory\n");
+               ret = -ENOMEM;
+               goto done;
+       }
+
+       /* dd->rcd can be NULL if early initialization failed */
+       for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) {
+               /*
+                * Set up the (kernel) rcvhdr queue and egr TIDs.  If doing
+                * re-init, the simplest way to handle this is to free
+                * existing, and re-allocate.
+                * Need to re-create rest of ctxt 0 ctxtdata as well.
+                */
+               rcd = dd->rcd[i];
+               if (!rcd)
+                       continue;
+
+               rcd->do_interrupt = &handle_receive_interrupt;
+
+               lastfail = hfi1_create_rcvhdrq(dd, rcd);
+               if (!lastfail)
+                       lastfail = hfi1_setup_eagerbufs(rcd);
+               if (lastfail) {
+                       dd_dev_err(dd,
+                                  "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
+                       ret = lastfail; /* remembered, but the loop keeps going */
+               }
+       }
+
+       /* Allocate enough memory for user event notification. */
+       len = PAGE_ALIGN(dd->chip_rcv_contexts * HFI1_MAX_SHARED_CTXTS *
+                        sizeof(*dd->events));
+       dd->events = vmalloc_user(len);
+       if (!dd->events)
+               dd_dev_err(dd, "Failed to allocate user events page\n");
+       /*
+        * Allocate a page for device and port status.
+        * Page will be shared amongst all user processes.
+        */
+       dd->status = vmalloc_user(PAGE_SIZE);
+       if (!dd->status)
+               dd_dev_err(dd, "Failed to allocate dev status page\n");
+       else
+               dd->freezelen = PAGE_SIZE - (sizeof(*dd->status) -
+                                            sizeof(dd->status->freezemsg));
+       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+               ppd = dd->pport + pidx;
+               if (dd->status)
+                       /* Currently, we only have one port */
+                       ppd->statusp = &dd->status->port;
+
+               set_mtu(ppd);
+       }
+
+       /* enable chip even if we have an error, so we can debug cause */
+       enable_chip(dd);
+
+done:
+       /*
+        * Set status even if port serdes is not initialized
+        * so that diags will work.
+        */
+       if (dd->status)
+               dd->status->dev |= HFI1_STATUS_CHIP_PRESENT |
+                       HFI1_STATUS_INITTED;
+       if (!ret) {
+               /* enable all interrupts from the chip */
+               set_intr_state(dd, 1);
+
+               /* chip is OK for user apps; mark it as initialized */
+               for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+                       ppd = dd->pport + pidx;
+
+                       /*
+                        * start the serdes - must be after interrupts are
+                        * enabled so we are notified when the link goes up
+                        */
+                       lastfail = bringup_serdes(ppd);
+                       if (lastfail)
+                               dd_dev_info(dd,
+                                           "Failed to bring up port %u\n",
+                                           ppd->port);
+
+                       /*
+                        * Set status even if port serdes is not initialized
+                        * so that diags will work.
+                        */
+                       if (ppd->statusp)
+                               *ppd->statusp |= HFI1_STATUS_CHIP_PRESENT |
+                                                       HFI1_STATUS_INITTED;
+                       if (!ppd->link_speed_enabled)
+                               continue; /* no-op: nothing follows in the loop body */
+               }
+       }
+
+       /* if ret is non-zero, we probably should do some cleanup here... */
+       return ret;
+}
+
+/*
+ * Look up a unit number in hfi1_unit_table.  No locking is done here;
+ * hfi1_lookup() wraps this call with hfi1_devs_lock held.
+ */
+static inline struct hfi1_devdata *__hfi1_lookup(int unit)
+{
+       struct hfi1_devdata *dd = idr_find(&hfi1_unit_table, unit);
+
+       return dd;
+}
+
+/*
+ * Find the device data registered for @unit.  The lookup itself is done
+ * by __hfi1_lookup() with hfi1_devs_lock held and interrupts saved; its
+ * result is returned unchanged.
+ */
+struct hfi1_devdata *hfi1_lookup(int unit)
+{
+       unsigned long irqflags;
+       struct hfi1_devdata *found;
+
+       spin_lock_irqsave(&hfi1_devs_lock, irqflags);
+       found = __hfi1_lookup(unit);
+       spin_unlock_irqrestore(&hfi1_devs_lock, irqflags);
+
+       return found;
+}
+
+/*
+ * Stop the timers during unit shutdown, or after an error late
+ * in initialization.
+ *
+ * For each port whose LED override timer has its data field set, the
+ * timer is deleted synchronously and marked inactive.
+ */
+static void stop_timers(struct hfi1_devdata *dd)
+{
+       int n;
+
+       for (n = 0; n < dd->num_pports; ++n) {
+               struct hfi1_pportdata *port = &dd->pport[n];
+
+               if (!port->led_override_timer.data)
+                       continue;
+
+               del_timer_sync(&port->led_override_timer);
+               atomic_set(&port->led_override_timer_active, 0);
+       }
+}
+
+/**
+ * shutdown_device - shut down a device
+ * @dd: the hfi1_ib device
+ *
+ * This is called to make the device quiet when we are about to
+ * unload the driver, and also when the device is administratively
+ * disabled.   It does not free any data structures.
+ * Everything it does has to be setup again by hfi1_init(dd, 1)
+ *
+ * Order of operations: clear link-up and the IB status bits, clear
+ * HFI1_INITTED, mask interrupts, disable every receive context and flush
+ * every send context, wait 20us, then disable the send contexts and PIO,
+ * shut down the LED override, quiet the serdes, destroy each port's
+ * workqueue and finally call sdma_exit().
+ */
+static void shutdown_device(struct hfi1_devdata *dd)
+{
+       struct hfi1_pportdata *ppd;
+       unsigned pidx;
+       int i;
+
+       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+               ppd = dd->pport + pidx;
+
+               ppd->linkup = 0;
+               if (ppd->statusp)
+                       *ppd->statusp &= ~(HFI1_STATUS_IB_CONF |
+                                          HFI1_STATUS_IB_READY);
+       }
+       dd->flags &= ~HFI1_INITTED;
+
+       /* mask interrupts, but not errors */
+       set_intr_state(dd, 0);
+
+       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+               ppd = dd->pport + pidx;
+               for (i = 0; i < dd->num_rcv_contexts; i++)
+                       hfi1_rcvctrl(dd, HFI1_RCVCTRL_TAILUPD_DIS |
+                                         HFI1_RCVCTRL_CTXT_DIS |
+                                         HFI1_RCVCTRL_INTRAVAIL_DIS |
+                                         HFI1_RCVCTRL_PKEY_DIS |
+                                         HFI1_RCVCTRL_ONE_PKT_EGR_DIS, i);
+               /*
+                * Gracefully stop all sends allowing any in progress to
+                * trickle out first.
+                */
+               for (i = 0; i < dd->num_send_contexts; i++)
+                       sc_flush(dd->send_contexts[i].sc);
+       }
+
+       /*
+        * Enough for anything that's going to trickle out to have actually
+        * done so.
+        */
+       udelay(20);
+
+       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+               ppd = dd->pport + pidx;
+
+               /* disable all contexts */
+               for (i = 0; i < dd->num_send_contexts; i++)
+                       sc_disable(dd->send_contexts[i].sc);
+               /* disable the send device */
+               pio_send_control(dd, PSC_GLOBAL_DISABLE);
+
+               shutdown_led_override(ppd);
+
+               /*
+                * Clear SerdesEnable.
+                * We can't count on interrupts since we are stopping.
+                */
+               hfi1_quiet_serdes(ppd);
+
+               if (ppd->hfi1_wq) {
+                       destroy_workqueue(ppd->hfi1_wq);
+                       ppd->hfi1_wq = NULL; /* create_workqueues() only allocates when this is NULL */
+               }
+       }
+       sdma_exit(dd);
+}
+
+/**
+ * hfi1_free_ctxtdata - free a context's allocated data
+ * @dd: the hfi1_ib device
+ * @rcd: the ctxtdata structure
+ *
+ * free up any allocated data for a context
+ * This should not touch anything that would affect a simultaneous
+ * re-allocation of context data, because it is called after hfi1_mutex
+ * is released (and can be called from reinit as well).
+ * It should never change any chip state, or global driver state.
+ *
+ * A NULL @rcd is ignored.  Otherwise this frees the receive header queue
+ * (and its tail page, if present), the eager buffer DMA allocations and
+ * their bookkeeping arrays, the send context via sc_free(), the vmalloc'ed
+ * sub-context areas, opstats and finally @rcd itself.
+ */
+void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
+{
+       unsigned e;
+
+       if (!rcd)
+               return;
+
+       if (rcd->rcvhdrq) {
+               dma_free_coherent(&dd->pcidev->dev, rcd->rcvhdrq_size,
+                                 rcd->rcvhdrq, rcd->rcvhdrq_phys);
+               rcd->rcvhdrq = NULL;
+               if (rcd->rcvhdrtail_kvaddr) { /* only examined when rcvhdrq was set */
+                       dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
+                                         (void *)rcd->rcvhdrtail_kvaddr,
+                                         rcd->rcvhdrqtailaddr_phys);
+                       rcd->rcvhdrtail_kvaddr = NULL;
+               }
+       }
+
+       /* all the RcvArray entries should have been cleared by now */
+       kfree(rcd->egrbufs.rcvtids);
+
+       for (e = 0; e < rcd->egrbufs.alloced; e++) {
+               if (rcd->egrbufs.buffers[e].phys) /* skip slots with no DMA address recorded */
+                       dma_free_coherent(&dd->pcidev->dev,
+                                         rcd->egrbufs.buffers[e].len,
+                                         rcd->egrbufs.buffers[e].addr,
+                                         rcd->egrbufs.buffers[e].phys);
+       }
+       kfree(rcd->egrbufs.buffers);
+
+       sc_free(rcd->sc);
+       vfree(rcd->user_event_mask);
+       vfree(rcd->subctxt_uregbase);
+       vfree(rcd->subctxt_rcvegrbuf);
+       vfree(rcd->subctxt_rcvhdr_base);
+       kfree(rcd->opstats);
+       kfree(rcd);
+}
+
+/*
+ * Release our hold on the shared asic data.  If we are the last one,
+ * free the structure.  Must be holding hfi1_devs_lock.
+ *
+ * Our slot in the dds[] array is cleared first; the structure is freed
+ * only if the other slot (index 0 or 1) is empty as well.
+ */
+static void release_asic_data(struct hfi1_devdata *dd)
+{
+       int peer;
+
+       if (!dd->asic_data)
+               return;
+
+       /* the other holder lives in slot 1 if we are 0, else in slot 0 */
+       peer = !dd->hfi1_id;
+
+       dd->asic_data->dds[dd->hfi1_id] = NULL;
+       /* we are the last holder, free it */
+       if (!dd->asic_data->dds[peer])
+               kfree(dd->asic_data);
+       dd->asic_data = NULL;
+}
+
+/*
+ * kobject release callback for a device (see hfi1_devdata_type).
+ *
+ * Removes the device from the unit table and device list and drops its
+ * hold on the shared asic data under hfi1_devs_lock, then frees the
+ * platform config, per-cpu counters and affinity data before handing
+ * the structure back to rvt_dealloc_device().
+ */
+static void __hfi1_free_devdata(struct kobject *kobj)
+{
+       struct hfi1_devdata *dd;
+       unsigned long irqflags;
+
+       dd = container_of(kobj, struct hfi1_devdata, kobj);
+
+       spin_lock_irqsave(&hfi1_devs_lock, irqflags);
+       idr_remove(&hfi1_unit_table, dd->unit);
+       list_del(&dd->list);
+       release_asic_data(dd);
+       spin_unlock_irqrestore(&hfi1_devs_lock, irqflags);
+
+       free_platform_config(dd);
+       rcu_barrier(); /* wait for rcu callbacks to complete */
+
+       free_percpu(dd->int_counter);
+       free_percpu(dd->rcv_limit);
+       hfi1_dev_affinity_free(dd);
+       free_percpu(dd->send_schedule);
+
+       rvt_dealloc_device(&dd->verbs_dev.rdi);
+}
+
+static struct kobj_type hfi1_devdata_type = {
+       .release = __hfi1_free_devdata, /* frees the devdata embedding the kobject */
+};
+
+void hfi1_free_devdata(struct hfi1_devdata *dd)
+{
+       kobject_put(&dd->kobj); /* drop a reference; hfi1_devdata_type's release is __hfi1_free_devdata() */
+}
+
+/*
+ * Allocate our primary per-unit data structure.  Must be done via verbs
+ * allocator, because the verbs cleanup process both does cleanup and
+ * free of the data structure.
+ * "extra" is for chip-specific data.
+ *
+ * Use the idr mechanism to get a unit number for this unit.
+ *
+ * @pdev: PCI device, used only for early log messages
+ * @extra: additional bytes to allocate after the devdata; the number of
+ *         ports is derived from it as extra / sizeof(struct hfi1_pportdata)
+ *
+ * Return: the new devdata, or an ERR_PTR() on failure.  On failure
+ * everything acquired so far is released again: per-cpu counters are
+ * freed and the unit is removed from hfi1_unit_table and hfi1_dev_list
+ * before the structure itself is deallocated.
+ */
+struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra)
+{
+       unsigned long flags;
+       struct hfi1_devdata *dd;
+       int ret, nports;
+
+       /* extra is * number of ports */
+       nports = extra / sizeof(struct hfi1_pportdata);
+
+       dd = (struct hfi1_devdata *)rvt_alloc_device(sizeof(*dd) + extra,
+                                                    nports);
+       if (!dd)
+               return ERR_PTR(-ENOMEM);
+       dd->num_pports = nports;
+       dd->pport = (struct hfi1_pportdata *)(dd + 1);
+
+       INIT_LIST_HEAD(&dd->list);
+       idr_preload(GFP_KERNEL);
+       spin_lock_irqsave(&hfi1_devs_lock, flags);
+
+       ret = idr_alloc(&hfi1_unit_table, dd, 0, 0, GFP_NOWAIT);
+       if (ret >= 0) {
+               dd->unit = ret;
+               list_add(&dd->list, &hfi1_dev_list);
+       }
+
+       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+       idr_preload_end();
+
+       if (ret < 0) {
+               hfi1_early_err(&pdev->dev,
+                              "Could not allocate unit ID: error %d\n", -ret);
+               goto bail;
+       }
+       /*
+        * Initialize all locks for the device. This needs to be as early as
+        * possible so locks are usable.
+        */
+       spin_lock_init(&dd->sc_lock);
+       spin_lock_init(&dd->sendctrl_lock);
+       spin_lock_init(&dd->rcvctrl_lock);
+       spin_lock_init(&dd->uctxt_lock);
+       spin_lock_init(&dd->hfi1_diag_trans_lock);
+       spin_lock_init(&dd->sc_init_lock);
+       spin_lock_init(&dd->dc8051_lock);
+       spin_lock_init(&dd->dc8051_memlock);
+       seqlock_init(&dd->sc2vl_lock);
+       spin_lock_init(&dd->sde_map_lock);
+       spin_lock_init(&dd->pio_map_lock);
+       init_waitqueue_head(&dd->event_queue);
+
+       dd->int_counter = alloc_percpu(u64);
+       if (!dd->int_counter) {
+               ret = -ENOMEM;
+               hfi1_early_err(&pdev->dev,
+                              "Could not allocate per-cpu int_counter\n");
+               goto bail_unit;
+       }
+
+       dd->rcv_limit = alloc_percpu(u64);
+       if (!dd->rcv_limit) {
+               ret = -ENOMEM;
+               hfi1_early_err(&pdev->dev,
+                              "Could not allocate per-cpu rcv_limit\n");
+               goto bail_int_counter;
+       }
+
+       dd->send_schedule = alloc_percpu(u64);
+       if (!dd->send_schedule) {
+               ret = -ENOMEM;
+               hfi1_early_err(&pdev->dev,
+                              "Could not allocate per-cpu send_schedule\n");
+               goto bail_rcv_limit;
+       }
+
+       if (!hfi1_cpulist_count) {
+               u32 count = num_online_cpus();
+
+               hfi1_cpulist = kcalloc(BITS_TO_LONGS(count), sizeof(long),
+                                      GFP_KERNEL);
+               if (hfi1_cpulist)
+                       hfi1_cpulist_count = count;
+               else
+                       hfi1_early_err(
+                       &pdev->dev,
+                       "Could not alloc cpulist info, cpu affinity might be wrong\n");
+       }
+       kobject_init(&dd->kobj, &hfi1_devdata_type);
+       return dd;
+
+bail_rcv_limit:
+       free_percpu(dd->rcv_limit);
+bail_int_counter:
+       free_percpu(dd->int_counter);
+bail_unit:
+       /*
+        * The unit was registered above; take it out of the idr and the
+        * device list (under the lock) so neither keeps a pointer to the
+        * structure we are about to free.
+        */
+       spin_lock_irqsave(&hfi1_devs_lock, flags);
+       idr_remove(&hfi1_unit_table, dd->unit);
+       list_del_init(&dd->list);
+       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+bail:
+       rvt_dealloc_device(&dd->verbs_dev.rdi);
+       return ERR_PTR(ret);
+}
+
+/*
+ * Called from freeze mode handlers, and from PCI error
+ * reporting code.  Should be paranoid about state of
+ * system and data structures.
+ *
+ * If the device was marked initialized, that mark is cleared and, for
+ * each port, the link is set to HLS_DN_DISABLE (when the device is
+ * present) and the IB-ready status bit is cleared.  The hardware error
+ * status bit is then set whenever a status page exists.
+ */
+void hfi1_disable_after_error(struct hfi1_devdata *dd)
+{
+       struct hfi1_pportdata *port;
+       u32 n;
+
+       if (dd->flags & HFI1_INITTED) {
+               dd->flags &= ~HFI1_INITTED;
+               if (dd->pport) {
+                       for (n = 0; n < dd->num_pports; ++n) {
+                               port = &dd->pport[n];
+
+                               if (dd->flags & HFI1_PRESENT)
+                                       set_link_state(port, HLS_DN_DISABLE);
+
+                               if (port->statusp)
+                                       *port->statusp &= ~HFI1_STATUS_IB_READY;
+                       }
+               }
+       }
+
+       /*
+        * Mark as having had an error for driver, and also
+        * for /sys and status word mapped to user programs.
+        * This marks unit as not usable, until reset.
+        */
+       if (dd->status)
+               dd->status->dev |= HFI1_STATUS_HWERROR;
+}
+
+static void remove_one(struct pci_dev *); /* forward declaration, used as hfi1_pci_driver.remove */
+static int init_one(struct pci_dev *, const struct pci_device_id *); /* forward declaration, used as hfi1_pci_driver.probe */
+
+#define DRIVER_LOAD_MSG "Intel " DRIVER_NAME " loaded: "
+#define PFX DRIVER_NAME ": "
+
+static const struct pci_device_id hfi1_pci_tbl[] = {
+       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL0) },
+       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL1) },
+       { 0, } /* zeroed terminating entry */
+};
+
+MODULE_DEVICE_TABLE(pci, hfi1_pci_tbl);
+
+static struct pci_driver hfi1_pci_driver = {
+       .name = DRIVER_NAME,
+       .probe = init_one,
+       .remove = remove_one,
+       .id_table = hfi1_pci_tbl, /* the two Intel device IDs listed in hfi1_pci_tbl */
+       .err_handler = &hfi1_pci_err_handler,
+};
+
+/* Add the first krcvqsset entries of krcvqs[] to n_krcvqs. */
+static void __init compute_krcvqs(void)
+{
+       int idx = 0;
+
+       while (idx < krcvqsset) {
+               n_krcvqs += krcvqs[idx];
+               idx++;
+       }
+}
+
+/*
+ * Do all the generic driver unit- and chip-independent memory
+ * allocation and initialization.
+ *
+ * After dev_init(), the global settings (max MTU, CU, credit return
+ * threshold, receive interrupt count/timeout/dynamic mode, link CRC mask)
+ * are sanitized; then the unit idr, debug and wss support are set up and
+ * the PCI driver is registered.
+ *
+ * Returns 0 on success, otherwise the error from the failing step
+ * (dev_init(), hfi1_wss_init() or pci_register_driver()).
+ */
+static int __init hfi1_mod_init(void)
+{
+       int ret;
+
+       ret = dev_init();
+       if (ret)
+               goto bail;
+
+       /* validate max MTU before any devices start */
+       if (!valid_opa_max_mtu(hfi1_max_mtu)) {
+               pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n",
+                      hfi1_max_mtu, HFI1_DEFAULT_MAX_MTU);
+               hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
+       }
+       /* valid CUs run from 1-128 in powers of 2 */
+       if (hfi1_cu > 128 || !is_power_of_2(hfi1_cu))
+               hfi1_cu = 1;
+       /* valid credit return threshold is 0-100, variable is unsigned */
+       if (user_credit_return_threshold > 100)
+               user_credit_return_threshold = 100;
+
+       compute_krcvqs();
+       /*
+        * sanitize receive interrupt count, time must wait until after
+        * the hardware type is known
+        */
+       if (rcv_intr_count > RCV_HDR_HEAD_COUNTER_MASK)
+               rcv_intr_count = RCV_HDR_HEAD_COUNTER_MASK;
+       /* reject invalid combinations */
+       if (rcv_intr_count == 0 && rcv_intr_timeout == 0) {
+               pr_err("Invalid mode: both receive interrupt count and available timeout are zero - setting interrupt count to 1\n");
+               rcv_intr_count = 1;
+       }
+       if (rcv_intr_count > 1 && rcv_intr_timeout == 0) {
+               /*
+                * Avoid indefinite packet delivery by requiring a timeout
+                * if count is > 1.
+                */
+               pr_err("Invalid mode: receive interrupt count greater than 1 and available timeout is zero - setting available timeout to 1\n");
+               rcv_intr_timeout = 1;
+       }
+       if (rcv_intr_dynamic && !(rcv_intr_count > 1 && rcv_intr_timeout > 0)) {
+               /*
+                * The dynamic algorithm expects a non-zero timeout
+                * and a count > 1.
+                */
+               pr_err("Invalid mode: dynamic receive interrupt mitigation with invalid count and timeout - turning dynamic off\n");
+               rcv_intr_dynamic = 0;
+       }
+
+       /* sanitize link CRC options */
+       link_crc_mask &= SUPPORTED_CRCS;
+
+       /*
+        * These must be called before the driver is registered with
+        * the PCI subsystem.
+        */
+       idr_init(&hfi1_unit_table);
+
+       hfi1_dbg_init();
+       ret = hfi1_wss_init();
+       if (ret < 0)
+               goto bail_wss;
+       ret = pci_register_driver(&hfi1_pci_driver);
+       if (ret < 0) {
+               pr_err("Unable to register driver: error %d\n", -ret);
+               goto bail_dev;
+       }
+       goto bail; /* all OK */
+
+bail_dev:
+       hfi1_wss_exit(); /* only reached after hfi1_wss_init() succeeded */
+bail_wss:
+       hfi1_dbg_exit();
+       idr_destroy(&hfi1_unit_table);
+       dev_cleanup();
+bail:
+       return ret;
+}
+
+module_init(hfi1_mod_init);
+
+/*
+ * Do the non-unit driver cleanup, memory free, etc. at unload.
+ *
+ * The PCI driver is unregistered first; after that the wss and debug
+ * support, the cpulist, the unit idr, the firmware and the device
+ * support set up by dev_init() are released.
+ */
+static void __exit hfi1_mod_cleanup(void)
+{
+       pci_unregister_driver(&hfi1_pci_driver);
+       hfi1_wss_exit();
+       hfi1_dbg_exit();
+       hfi1_cpulist_count = 0;
+       kfree(hfi1_cpulist);
+
+       idr_destroy(&hfi1_unit_table);
+       dispose_firmware();     /* asymmetric with obtain_firmware() */
+       dev_cleanup();
+}
+
+module_exit(hfi1_mod_cleanup);
+
+/*
+ * Release the per-device data hanging off dd: congestion control state,
+ * credit return memory, receive contexts, send contexts and the
+ * bookkeeping arrays.  Only valid after a successful initialization.
+ */
+static void cleanup_device_data(struct hfi1_devdata *dd)
+{
+       struct hfi1_ctxtdata **rcd_table;
+       unsigned long irq_flags;
+       int port;
+       int idx;
+
+       /* from here on user space must consider the chip gone */
+       for (port = 0; port < dd->num_pports; port++) {
+               struct hfi1_pportdata *ppd = dd->pport + port;
+               struct cc_state *old_cc;
+               int sl;
+
+               if (ppd->statusp)
+                       *ppd->statusp &= ~HFI1_STATUS_CHIP_PRESENT;
+
+               for (sl = 0; sl < OPA_MAX_SLS; sl++)
+                       hrtimer_cancel(&ppd->cca_timer[sl].hrtimer);
+
+               /* unpublish the congestion control state under its lock */
+               spin_lock(&ppd->cc_state_lock);
+               old_cc = get_cc_state(ppd);
+               RCU_INIT_POINTER(ppd->cc_state, NULL);
+               spin_unlock(&ppd->cc_state_lock);
+
+               /* the old state is handed to cc_state_reclaim() via RCU */
+               if (old_cc)
+                       call_rcu(&old_cc->rcu, cc_state_reclaim);
+       }
+
+       free_credit_return(dd);
+
+       /*
+        * Whatever receive contexts are still present at unload (usually
+        * only the kernel ones) are released below.  The table pointer is
+        * taken out of dd while holding uctxt_lock, out of paranoia that
+        * interrupt-related code could still be looking at dd->rcd.
+        */
+       spin_lock_irqsave(&dd->uctxt_lock, irq_flags);
+       rcd_table = dd->rcd;
+       dd->rcd = NULL;
+       spin_unlock_irqrestore(&dd->uctxt_lock, irq_flags);
+
+       if (dd->rcvhdrtail_dummy_kvaddr) {
+               dma_free_coherent(&dd->pcidev->dev, sizeof(u64),
+                                 (void *)dd->rcvhdrtail_dummy_kvaddr,
+                                 dd->rcvhdrtail_dummy_physaddr);
+               dd->rcvhdrtail_dummy_kvaddr = NULL;
+       }
+
+       if (rcd_table) {
+               for (idx = 0; idx < dd->num_rcv_contexts; idx++) {
+                       struct hfi1_ctxtdata *rcd = rcd_table[idx];
+
+                       rcd_table[idx] = NULL; /* debugging paranoia */
+                       if (!rcd)
+                               continue;
+                       hfi1_clear_tids(rcd);
+                       hfi1_free_ctxtdata(dd, rcd);
+               }
+       }
+       kfree(rcd_table);
+       free_pio_map(dd);
+       /* send contexts go after the rcv contexts - rcv's hooks are gone */
+       for (idx = 0; idx < dd->num_send_contexts; idx++)
+               sc_free(dd->send_contexts[idx].sc);
+       dd->num_send_contexts = 0;
+       kfree(dd->send_contexts);
+       dd->send_contexts = NULL;
+       kfree(dd->hw_to_sw);
+       dd->hw_to_sw = NULL;
+       kfree(dd->boardname);
+       vfree(dd->events);
+       vfree(dd->status);
+}
+
+/*
+ * Clean up on unit shutdown, or error during unit load after
+ * successful initialization.
+ *
+ * Calls, in this order: hfi1_start_cleanup(), the two PCIe cleanup
+ * helpers, cleanup_device_data() and finally hfi1_free_devdata().
+ * Both callers in this file (init_one() on failure and remove_one())
+ * do not touch dd after this returns.
+ */
+static void postinit_cleanup(struct hfi1_devdata *dd)
+{
+       hfi1_start_cleanup(dd);
+
+       hfi1_pcie_ddcleanup(dd);
+       hfi1_pcie_cleanup(dd->pcidev);
+
+       cleanup_device_data(dd);
+
+       hfi1_free_devdata(dd);
+}
+
+static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+       int ret = 0, j, pidx, initfail;
+       struct hfi1_devdata *dd = NULL;
+       struct hfi1_pportdata *ppd;
+
+       /*
+        * Bring up one device: validate the global module parameters,
+        * initialize PCIe, allocate and initialize dd, register with
+        * the IB core and create the /dev devices.  Returns 0 on
+        * success, otherwise the error code of the step that failed.
+        *
+        * First, lock the non-writable module parameters
+        */
+       HFI1_CAP_LOCK();
+
+       /* Validate some global module parameters */
+       if (rcvhdrcnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) {
+               hfi1_early_err(&pdev->dev, "Header queue  count too small\n");
+               ret = -EINVAL;
+               goto bail;
+       }
+       if (rcvhdrcnt > HFI1_MAX_HDRQ_EGRBUF_CNT) {
+               hfi1_early_err(&pdev->dev,
+                              "Receive header queue count cannot be greater than %u\n",
+                              HFI1_MAX_HDRQ_EGRBUF_CNT);
+               ret = -EINVAL;
+               goto bail;
+       }
+       /* use the encoding function as a sanitization check */
+       if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) {
+               hfi1_early_err(&pdev->dev, "Invalid HdrQ Entry size %u\n",
+                              hfi1_hdrq_entsize);
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       /* The receive eager buffer size must be set before the receive
+        * contexts are created.
+        *
+        * Set the eager buffer size.  Validate that it falls in a range
+        * allowed by the hardware - all powers of 2 between the min and
+        * max.  The maximum valid MTU is within the eager buffer range
+        * so we do not need to cap the max_mtu by an eager buffer size
+        * setting.
+        *
+        * A non-power-of-2 value is rounded up and then clamped rather
+        * than rejected; only a value of 0 fails the load.
+        */
+       if (eager_buffer_size) {
+               if (!is_power_of_2(eager_buffer_size))
+                       eager_buffer_size =
+                               roundup_pow_of_two(eager_buffer_size);
+               eager_buffer_size =
+                       clamp_val(eager_buffer_size,
+                                 MIN_EAGER_BUFFER * 8,
+                                 MAX_EAGER_BUFFER_TOTAL);
+               hfi1_early_info(&pdev->dev, "Eager buffer size %u\n",
+                               eager_buffer_size);
+       } else {
+               hfi1_early_err(&pdev->dev, "Invalid Eager buffer size of 0\n");
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       /* restrict value of hfi1_rcvarr_split */
+       hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100);
+
+       ret = hfi1_pcie_init(pdev, ent);
+       if (ret)
+               goto bail;
+
+       /*
+        * Do device-specific initialization, function table setup, dd
+        * allocation, etc.
+        */
+       switch (ent->device) {
+       case PCI_DEVICE_ID_INTEL0:
+       case PCI_DEVICE_ID_INTEL1:
+               dd = hfi1_init_dd(pdev, ent);
+               break;
+       default:
+               hfi1_early_err(&pdev->dev,
+                              "Failing on unknown Intel deviceid 0x%x\n",
+                              ent->device);
+               ret = -ENODEV;
+       }
+
+       if (IS_ERR(dd))
+               ret = PTR_ERR(dd);
+       if (ret)
+               goto clean_bail; /* error already printed */
+
+       ret = create_workqueues(dd);
+       if (ret)
+               goto clean_bail; /* NOTE(review): dd not freed on this path? */
+
+       /* do the generic initialization */
+       initfail = hfi1_init(dd, 0);
+
+       ret = hfi1_register_ib_device(dd);
+
+       /*
+        * Now ready for use.  this should be cleared whenever we
+        * detect a reset, or initiate one.  If earlier failure,
+        * we still create devices, so diags, etc. can be used
+        * to determine cause of problem.
+        */
+       if (!initfail && !ret) {
+               dd->flags |= HFI1_INITTED;
+               /* create debugfs files after init and ib register */
+               hfi1_dbg_ibdev_init(&dd->verbs_dev);
+       }
+
+       j = hfi1_device_create(dd);
+       if (j)
+               dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j);
+
+       if (initfail || ret) {
+               stop_timers(dd);
+               flush_workqueue(ib_wq);
+               for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+                       hfi1_quiet_serdes(dd->pport + pidx);
+                       ppd = dd->pport + pidx;
+                       if (ppd->hfi1_wq) {
+                               destroy_workqueue(ppd->hfi1_wq);
+                               ppd->hfi1_wq = NULL;
+                       }
+               }
+               if (!j) /* the /dev devices were created above */
+                       hfi1_device_remove(dd);
+               if (!ret) /* IB registration succeeded above */
+                       hfi1_unregister_ib_device(dd);
+               postinit_cleanup(dd);
+               if (initfail) /* hfi1_init() error takes precedence */
+                       ret = initfail;
+               goto bail;      /* everything already cleaned */
+       }
+
+       sdma_start(dd);
+
+       return 0;
+
+clean_bail:
+       hfi1_pcie_cleanup(pdev);
+bail:
+       return ret;
+}
+
+static void remove_one(struct pci_dev *pdev)
+{
+       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+
+       /*
+        * Tear down one device, dd being the PCI driver data of pdev:
+        * unregister from the IB core, shut the hardware down, stop
+        * timers and pending work, remove the /dev devices and release
+        * everything else through postinit_cleanup().
+        *
+        * close debugfs files before ib unregister
+        */
+       hfi1_dbg_ibdev_exit(&dd->verbs_dev);
+       /* unregister from IB core */
+       hfi1_unregister_ib_device(dd);
+
+       /*
+        * Disable the IB link, disable interrupts on the device,
+        * clear dma engines, etc.
+        */
+       shutdown_device(dd);
+
+       stop_timers(dd);
+
+       /* wait until all of our (qsfp) queue_work() calls complete */
+       flush_workqueue(ib_wq);
+
+       hfi1_device_remove(dd);
+
+       postinit_cleanup(dd);
+}
+
+/**
+ * hfi1_create_rcvhdrq - create a receive header queue
+ * @dd: the hfi1_ib device
+ * @rcd: the context data
+ *
+ * This must be contiguous memory (from an i/o perspective), and must be
+ * DMA'able (which means for some systems, it will go through an IOMMU,
+ * or be forced into a low address range).
+ *
+ * The queue (and, when DMA_RTAIL is set for the context, the page used
+ * for the tail) is allocated only if @rcd does not have a queue yet; the
+ * per-context header queue CSRs are written on every call.
+ *
+ * Return: 0 on success, -ENOMEM if one of the allocations fails.
+ */
+int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
+{
+       unsigned amt;
+       u64 reg;
+
+       if (!rcd->rcvhdrq) {
+               dma_addr_t phys_hdrqtail;
+               gfp_t gfp_flags;
+
+               /*
+                * rcvhdrqentsize is in DWs, so we have to convert to bytes
+                * (* sizeof(u32)).
+                */
+               amt = PAGE_ALIGN(rcd->rcvhdrq_cnt * rcd->rcvhdrqentsize *
+                                sizeof(u32));
+
+               /* contexts at or above first_user_ctxt get GFP_USER */
+               gfp_flags = (rcd->ctxt >= dd->first_user_ctxt) ?
+                       GFP_USER : GFP_KERNEL;
+               rcd->rcvhdrq = dma_zalloc_coherent(
+                       &dd->pcidev->dev, amt, &rcd->rcvhdrq_phys,
+                       gfp_flags | __GFP_COMP);
+
+               if (!rcd->rcvhdrq) {
+                       /* amt is unsigned, so it is printed with %u */
+                       dd_dev_err(dd,
+                                  "attempt to allocate %u bytes for ctxt %u rcvhdrq failed\n",
+                                  amt, rcd->ctxt);
+                       goto bail;
+               }
+
+               if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
+                       rcd->rcvhdrtail_kvaddr = dma_zalloc_coherent(
+                               &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail,
+                               gfp_flags);
+                       if (!rcd->rcvhdrtail_kvaddr)
+                               goto bail_free;
+                       rcd->rcvhdrqtailaddr_phys = phys_hdrqtail;
+               }
+
+               rcd->rcvhdrq_size = amt;
+       }
+       /*
+        * These values are per-context:
+        *      RcvHdrCnt
+        *      RcvHdrEntSize
+        *      RcvHdrSize
+        */
+       reg = ((u64)(rcd->rcvhdrq_cnt >> HDRQ_SIZE_SHIFT)
+                       & RCV_HDR_CNT_CNT_MASK)
+               << RCV_HDR_CNT_CNT_SHIFT;
+       write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_CNT, reg);
+       reg = (encode_rcv_header_entry_size(rcd->rcvhdrqentsize)
+                       & RCV_HDR_ENT_SIZE_ENT_SIZE_MASK)
+               << RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT;
+       write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_ENT_SIZE, reg);
+       reg = (dd->rcvhdrsize & RCV_HDR_SIZE_HDR_SIZE_MASK)
+               << RCV_HDR_SIZE_HDR_SIZE_SHIFT;
+       write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_SIZE, reg);
+
+       /*
+        * Program dummy tail address for every receive context
+        * before enabling any receive context
+        */
+       write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_TAIL_ADDR,
+                       dd->rcvhdrtail_dummy_physaddr);
+
+       return 0;
+
+bail_free:
+       /* the tail page allocation failed: give the header queue back */
+       dd_dev_err(dd,
+                  "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n",
+                  rcd->ctxt);
+       vfree(rcd->user_event_mask);
+       rcd->user_event_mask = NULL;
+       dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq,
+                         rcd->rcvhdrq_phys);
+       rcd->rcvhdrq = NULL;
+bail:
+       return -ENOMEM;
+}
+
+/**
+ * hfi1_setup_eagerbufs - allocate eager buffers, both kernel and user contexts.
+ * @rcd: the context we are setting up.
+ *
+ * Allocate the eager TID buffers and program them into the chip.
+ * They are no longer completely contiguous, we do multiple allocation
+ * calls.  Otherwise we get the OOM code involved, by asking for too
+ * much per call, with disastrous results on some kernels.
+ *
+ * Return: 0 on success, -ENOMEM if the buffers cannot be allocated even
+ * at the smallest acceptable size (everything allocated so far is freed
+ * in that case), -EINVAL if the resulting buffer size is rejected by
+ * hfi1_rcvbuf_validate().
+ */
+int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd)
+{
+       struct hfi1_devdata *dd = rcd->dd;
+       u32 max_entries, egrtop, alloced_bytes = 0, idx = 0;
+       gfp_t gfp_flags;
+       u16 order;
+       int ret = 0;
+       u16 round_mtu = roundup_pow_of_two(hfi1_max_mtu);
+
+       /*
+        * GFP_USER, but without GFP_FS, so buffer cache can be
+        * coalesced (we hope); otherwise, even at order 4,
+        * heavy filesystem activity makes these fail, and we can
+        * use compound pages.
+        */
+       gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP;
+
+       /*
+        * The minimum size of the eager buffers is a group of MTU-sized
+        * buffers.
+        * The global eager_buffer_size parameter is checked against the
+        * theoretical lower limit of the value. Here, we check against the
+        * MTU.
+        */
+       if (rcd->egrbufs.size < (round_mtu * dd->rcv_entries.group_size))
+               rcd->egrbufs.size = round_mtu * dd->rcv_entries.group_size;
+       /*
+        * If using one-pkt-per-egr-buffer, lower the eager buffer
+        * size to the max MTU (page-aligned).
+        */
+       if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR))
+               rcd->egrbufs.rcvtid_size = round_mtu;
+
+       /*
+        * Eager buffers sizes of 1MB or less require smaller TID sizes
+        * to satisfy the "multiple of 8 RcvArray entries" requirement.
+        */
+       if (rcd->egrbufs.size <= (1 << 20))
+               rcd->egrbufs.rcvtid_size = max((unsigned long)round_mtu,
+                       rounddown_pow_of_two(rcd->egrbufs.size / 8));
+
+       while (alloced_bytes < rcd->egrbufs.size &&
+              rcd->egrbufs.alloced < rcd->egrbufs.count) {
+               rcd->egrbufs.buffers[idx].addr =
+                       dma_zalloc_coherent(&dd->pcidev->dev,
+                                           rcd->egrbufs.rcvtid_size,
+                                           &rcd->egrbufs.buffers[idx].phys,
+                                           gfp_flags);
+               if (rcd->egrbufs.buffers[idx].addr) {
+                       rcd->egrbufs.buffers[idx].len =
+                               rcd->egrbufs.rcvtid_size;
+                       rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].addr =
+                               rcd->egrbufs.buffers[idx].addr;
+                       rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].phys =
+                               rcd->egrbufs.buffers[idx].phys;
+                       rcd->egrbufs.alloced++;
+                       alloced_bytes += rcd->egrbufs.rcvtid_size;
+                       idx++;
+               } else {
+                       u32 new_size, i, j;
+                       u64 offset = 0;
+
+                       /*
+                        * Fail the eager buffer allocation if:
+                        *   - we are already using the lowest acceptable size
+                        *   - we are using one-pkt-per-egr-buffer (this implies
+                        *     that we are accepting only one size)
+                        */
+                       if (rcd->egrbufs.rcvtid_size == round_mtu ||
+                           !HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) {
+                               dd_dev_err(dd, "ctxt%u: Failed to allocate eager buffers\n",
+                                          rcd->ctxt);
+                               /* without this the caller would see success */
+                               ret = -ENOMEM;
+                               goto bail_rcvegrbuf_phys;
+                       }
+
+                       new_size = rcd->egrbufs.rcvtid_size / 2;
+
+                       /*
+                        * If the first attempt to allocate memory failed, don't
+                        * fail everything but continue with the next lower
+                        * size.
+                        */
+                       if (idx == 0) {
+                               rcd->egrbufs.rcvtid_size = new_size;
+                               continue;
+                       }
+
+                       /*
+                        * Re-partition already allocated buffers to a smaller
+                        * size.
+                        */
+                       rcd->egrbufs.alloced = 0;
+                       for (i = 0, j = 0, offset = 0; j < idx; i++) {
+                               if (i >= rcd->egrbufs.count)
+                                       break;
+                               rcd->egrbufs.rcvtids[i].phys =
+                                       rcd->egrbufs.buffers[j].phys + offset;
+                               rcd->egrbufs.rcvtids[i].addr =
+                                       rcd->egrbufs.buffers[j].addr + offset;
+                               rcd->egrbufs.alloced++;
+                               /* end of buffer j reached: move to the next */
+                               if ((rcd->egrbufs.buffers[j].phys + offset +
+                                    new_size) ==
+                                   (rcd->egrbufs.buffers[j].phys +
+                                    rcd->egrbufs.buffers[j].len)) {
+                                       j++;
+                                       offset = 0;
+                               } else {
+                                       offset += new_size;
+                               }
+                       }
+                       rcd->egrbufs.rcvtid_size = new_size;
+               }
+       }
+       rcd->egrbufs.numbufs = idx;
+       rcd->egrbufs.size = alloced_bytes;
+
+       /* both sizes are kept in bytes; the message reports KB */
+       hfi1_cdbg(PROC,
+                 "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %zuKB\n",
+                 rcd->ctxt, rcd->egrbufs.alloced,
+                 rcd->egrbufs.rcvtid_size / 1024, rcd->egrbufs.size / 1024);
+
+       /*
+        * Set the contexts rcv array head update threshold to the closest
+        * power of 2 (so we can use a mask instead of modulo) below half
+        * the allocated entries.
+        */
+       rcd->egrbufs.threshold =
+               rounddown_pow_of_two(rcd->egrbufs.alloced / 2);
+       /*
+        * Compute the expected RcvArray entry base. This is done after
+        * allocating the eager buffers in order to maximize the
+        * expected RcvArray entries for the context.
+        */
+       max_entries = rcd->rcv_array_groups * dd->rcv_entries.group_size;
+       egrtop = roundup(rcd->egrbufs.alloced, dd->rcv_entries.group_size);
+       rcd->expected_count = max_entries - egrtop;
+       if (rcd->expected_count > MAX_TID_PAIR_ENTRIES * 2)
+               rcd->expected_count = MAX_TID_PAIR_ENTRIES * 2;
+
+       rcd->expected_base = rcd->eager_base + egrtop;
+       hfi1_cdbg(PROC, "ctxt%u: eager:%u, exp:%u, egrbase:%u, expbase:%u\n",
+                 rcd->ctxt, rcd->egrbufs.alloced, rcd->expected_count,
+                 rcd->eager_base, rcd->expected_base);
+
+       if (!hfi1_rcvbuf_validate(rcd->egrbufs.rcvtid_size, PT_EAGER, &order)) {
+               hfi1_cdbg(PROC,
+                         "ctxt%u: current Eager buffer size is invalid %u\n",
+                         rcd->ctxt, rcd->egrbufs.rcvtid_size);
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       for (idx = 0; idx < rcd->egrbufs.alloced; idx++) {
+               hfi1_put_tid(dd, rcd->eager_base + idx, PT_EAGER,
+                            rcd->egrbufs.rcvtids[idx].phys, order);
+               cond_resched();
+       }
+       goto bail;
+
+bail_rcvegrbuf_phys:
+       /* give back every buffer obtained before the failure */
+       for (idx = 0; idx < rcd->egrbufs.alloced &&
+            rcd->egrbufs.buffers[idx].addr;
+            idx++) {
+               dma_free_coherent(&dd->pcidev->dev,
+                                 rcd->egrbufs.buffers[idx].len,
+                                 rcd->egrbufs.buffers[idx].addr,
+                                 rcd->egrbufs.buffers[idx].phys);
+               rcd->egrbufs.buffers[idx].addr = NULL;
+               rcd->egrbufs.buffers[idx].phys = 0;
+               rcd->egrbufs.buffers[idx].len = 0;
+       }
+bail:
+       return ret;
+}
diff --git a/drivers/infiniband/hw/hfi1/intr.c b/drivers/infiniband/hw/hfi1/intr.c
new file mode 100644 (file)
index 0000000..65348d1
--- /dev/null
@@ -0,0 +1,200 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/delay.h>
+
+#include "hfi.h"
+#include "common.h"
+#include "sdma.h"
+
+/**
+ * format_hwmsg - append one bracketed hwerror message
+ * @msg: message buffer; must already hold a NUL-terminated string
+ * @msgl: length of message buffer, used as the strlcat() bound
+ * @hwmsg: message to add to message buffer, wrapped as "[hwmsg]"
+ */
+static void format_hwmsg(char *msg, size_t msgl, const char *hwmsg)
+{
+       const char *pieces[] = { "[", hwmsg, "]" };
+       size_t n;
+
+       for (n = 0; n < sizeof(pieces) / sizeof(pieces[0]); n++)
+               strlcat(msg, pieces[n], msgl);
+}
+
+/**
+ * hfi1_format_hwerrors - format hardware error messages for display
+ * @hwerrs: hardware errors bit vector
+ * @hwerrmsgs: hardware error descriptions
+ * @nhwerrmsgs: number of hwerrmsgs
+ * @msg: message buffer
+ * @msgl: message buffer length
+ *
+ * For every table entry whose mask has a bit in common with @hwerrs the
+ * entry's text is appended to @msg through format_hwmsg(), in table
+ * order.  @msg is only appended to, never reset here.
+ */
+void hfi1_format_hwerrors(u64 hwerrs, const struct hfi1_hwerror_msgs *hwerrmsgs,
+                         size_t nhwerrmsgs, char *msg, size_t msgl)
+{
+       size_t i;       /* same type as nhwerrmsgs: no signed/unsigned compare */
+
+       for (i = 0; i < nhwerrmsgs; i++) {
+               if (hwerrs & hwerrmsgs[i].mask)
+                       format_hwmsg(msg, msgl, hwerrmsgs[i].msg);
+       }
+}
+
+/* Dispatch IB event @ev for port @ppd, provided the device is INITTED. */
+static void signal_ib_event(struct hfi1_pportdata *ppd, enum ib_event_type ev)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+
+       /*
+        * ib_dispatch_event() may only be used once the IB device has been
+        * registered.  HFI1_INITTED is the flag that records this, so
+        * without it the event is silently dropped.
+        */
+       if (dd->flags & HFI1_INITTED) {
+               struct ib_event ibev;
+
+               ibev.device = &dd->verbs_dev.rdi.ibdev;
+               ibev.element.port_num = ppd->port;
+               ibev.event = ev;
+               ib_dispatch_event(&ibev);
+       }
+}
+
+/*
+ * React to a physical link up/down notification for port 0 of @dd.
+ * A non-zero @linkup means the link came up.  Nothing is done when the
+ * reported state equals the recorded one.
+ * This is called outside an interrupt.
+ */
+void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup)
+{
+       struct hfi1_pportdata *ppd = &dd->pport[0];
+
+       /* no change, nothing to do */
+       if (ppd->linkup == !!linkup)
+               return;
+
+       if (!linkup) {
+               /* physical link went down */
+               ppd->linkup = 0;
+
+               /* forget the HW details of the connection that just ended */
+               reset_link_credits(dd);
+
+               /* a freeze after link down guarantees a clean egress */
+               start_freeze_handling(ppd, FREEZE_SELF | FREEZE_LINK_DOWN);
+
+               hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LINKDOWN_BIT);
+
+               /* with our side down, the neighbor counts as down too */
+               ppd->neighbor_normal = 0;
+
+               /* tell IB about the link change */
+               signal_ib_event(ppd, IB_EVENT_PORT_ERR);
+               return;
+       }
+
+       /*
+        * Quick linkup, and every link up on the simulator, goes straight
+        * to LinkUp without triggering or implementing:
+        *      - VerifyCap interrupt
+        *      - VerifyCap frames
+        *
+        * So the work of the VerifyCap interrupt handler,
+        * handle_verify_cap(), is done here instead, minus the move to
+        * LinkUp, since that state has already been reached.
+        *
+        * NOTE: This device's own vAU, vCU, and vl15_init stand in for
+        * the remote values.  Both sides must be using the values.
+        */
+       if (quick_linkup || dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+               set_up_vl15(dd, dd->vau, dd->vl15_init);
+               assign_remote_cm_au_table(dd, dd->vcu);
+               ppd->neighbor_guid = read_csr(dd, DC_DC8051_STS_REMOTE_GUID);
+               ppd->neighbor_type =
+                       read_csr(dd, DC_DC8051_STS_REMOTE_NODE_TYPE) &
+                       DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK;
+               ppd->neighbor_port_number =
+                       read_csr(dd, DC_DC8051_STS_REMOTE_PORT_NO) &
+                       DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK;
+               dd_dev_info(dd, "Neighbor GUID: %llx Neighbor type %d\n",
+                           ppd->neighbor_guid, ppd->neighbor_type);
+       }
+
+       /* physical link went up */
+       ppd->linkup = 1;
+       ppd->offline_disabled_reason = HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE);
+
+       /* the link widths only become available once the link is fully up */
+       get_linkup_link_widths(ppd);
+}
+
+/*
+ * Receive or urgent interrupt for a user context, i.e. a user process
+ * chose to sleep until a packet arrives rather than poll.  Under
+ * uctxt_lock, and only while rcd->cnt is non-zero, the waiter is woken
+ * for whichever of the two WAITING flags was set (RCV is checked first).
+ */
+void handle_user_interrupt(struct hfi1_ctxtdata *rcd)
+{
+       struct hfi1_devdata *dd = rcd->dd;
+       unsigned long irq_flags;
+
+       spin_lock_irqsave(&dd->uctxt_lock, irq_flags);
+       if (rcd->cnt) {
+               if (test_and_clear_bit(HFI1_CTXT_WAITING_RCV,
+                                      &rcd->event_flags)) {
+                       wake_up_interruptible(&rcd->wait);
+                       hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_DIS,
+                                    rcd->ctxt);
+               } else if (test_and_clear_bit(HFI1_CTXT_WAITING_URG,
+                                             &rcd->event_flags)) {
+                       rcd->urgent++;
+                       wake_up_interruptible(&rcd->wait);
+               }
+       }
+       spin_unlock_irqrestore(&dd->uctxt_lock, irq_flags);
+}
diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h
new file mode 100644 (file)
index 0000000..2ec6ef3
--- /dev/null
@@ -0,0 +1,300 @@
+#ifndef _HFI1_IOWAIT_H
+#define _HFI1_IOWAIT_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+
+#include "sdma_txreq.h"
+
+/*
+ * typedef (*restart_t)() - restart callback
+ * @work: pointer to work structure
+ *
+ * Has the same signature as the @func work handler taken by iowait_init().
+ */
+typedef void (*restart_t)(struct work_struct *work);
+
+struct sdma_txreq;
+struct sdma_engine;
+/**
+ * struct iowait - linkage for delayed progress/waiting
+ * @list: used to add/insert into QP/PQ wait lists
+ * @tx_head: overflow list of sdma_txreq's
+ * @sleep: no space callback
+ * @wakeup: space callback wakeup
+ * @sdma_drained: sdma count drained
+ * @iowork: workqueue overhead
+ * @wait_dma: wait for sdma_busy == 0
+ * @wait_pio: wait for pio_busy == 0
+ * @sdma_busy: # of packets in flight
+ * @pio_busy: # of pio operations pending
+ * @count: total number of descriptors in tx_head'ed list
+ * @tx_limit: limit for overflow queuing
+ * @tx_count: number of tx entries in tx_head'ed list
+ *
+ * This is to be embedded in user's state structure
+ * (QP or PQ).
+ *
+ * The sleep and wakeup members are a
+ * bit misnamed.  They do not, strictly
+ * speaking, sleep or wake up, but they
+ * are callbacks for the ULP to implement
+ * whatever queuing/dequeuing of
+ * the embedded iowait and its containing struct
+ * is needed when a resource shortage like SDMA ring space is seen.
+ *
+ * Both potentially have locks held
+ * so sleeping is not allowed.
+ *
+ * The wait_dma member along with the sdma_busy counter is what
+ * iowait_sdma_drain() waits on; wait_pio and pio_busy play the
+ * same role for iowait_pio_drain().
+ */
+
+struct iowait {
+       struct list_head list;
+       struct list_head tx_head;
+       int (*sleep)(
+               struct sdma_engine *sde,
+               struct iowait *wait,
+               struct sdma_txreq *tx,
+               unsigned seq);
+       void (*wakeup)(struct iowait *wait, int reason);
+       void (*sdma_drained)(struct iowait *wait);
+       struct work_struct iowork;
+       wait_queue_head_t wait_dma;
+       wait_queue_head_t wait_pio;
+       atomic_t sdma_busy;
+       atomic_t pio_busy;
+       u32 count;
+       u32 tx_limit;
+       u32 tx_count;
+};
+
+#define SDMA_AVAIL_REASON 0
+
+/**
+ * iowait_init() - initialize wait structure
+ * @wait: wait struct to initialize
+ * @tx_limit: limit for overflow queuing
+ * @func: restart function for workqueue
+ * @sleep: sleep function for no space
+ * @wakeup: wakeup function for no space
+ * @sdma_drained: callback recorded in the structure's sdma_drained member
+ *
+ * Prepares the iowait embedded in a QP or PQ: the list heads, work item
+ * and wait queues are initialized, both busy counters start at zero, and
+ * the limit and callbacks are recorded.
+ */
+static inline void iowait_init(
+       struct iowait *wait,
+       u32 tx_limit,
+       void (*func)(struct work_struct *work),
+       int (*sleep)(
+               struct sdma_engine *sde,
+               struct iowait *wait,
+               struct sdma_txreq *tx,
+               unsigned seq),
+       void (*wakeup)(struct iowait *wait, int reason),
+       void (*sdma_drained)(struct iowait *wait))
+{
+       /* list linkage and the deferred-work item */
+       INIT_LIST_HEAD(&wait->list);
+       INIT_LIST_HEAD(&wait->tx_head);
+       INIT_WORK(&wait->iowork, func);
+
+       /* drain wait queues and their in-flight counters */
+       init_waitqueue_head(&wait->wait_dma);
+       atomic_set(&wait->sdma_busy, 0);
+       init_waitqueue_head(&wait->wait_pio);
+       atomic_set(&wait->pio_busy, 0);
+
+       /* queue accounting; tx_count is not written here */
+       wait->count = 0;
+       wait->tx_limit = tx_limit;
+
+       /* caller-supplied callbacks */
+       wait->sleep = sleep;
+       wait->wakeup = wakeup;
+       wait->sdma_drained = sdma_drained;
+}
+
+/**
+ * iowait_schedule() - queue the iowait work item
+ * @wait: wait struct to schedule
+ * @wq: workqueue for schedule
+ * @cpu: cpu
+ *
+ * Queues the iowork member of @wait on @wq for @cpu via queue_work_on().
+ */
+static inline void iowait_schedule(
+       struct iowait *wait,
+       struct workqueue_struct *wq,
+       int cpu)
+{
+       queue_work_on(cpu, wq, &wait->iowork);
+}
+
+/**
+ * iowait_sdma_drain() - wait for DMAs to drain
+ *
+ * @wait: iowait structure
+ *
+ * This will delay until the iowait sdmas have
+ * completed, i.e. it waits on wait_dma with wait_event()
+ * until sdma_busy reads zero.  There is no timeout.
+ */
+static inline void iowait_sdma_drain(struct iowait *wait)
+{
+       wait_event(wait->wait_dma, !atomic_read(&wait->sdma_busy));
+}
+
+/**
+ * iowait_sdma_pending() - return sdma pending count
+ *
+ * @wait: iowait structure
+ *
+ * Return: the current value of the sdma_busy counter.
+ */
+static inline int iowait_sdma_pending(struct iowait *wait)
+{
+       return atomic_read(&wait->sdma_busy);
+}
+
+/**
+ * iowait_sdma_inc - note sdma io pending
+ * @wait: iowait structure
+ *
+ * Atomically increments sdma_busy by one.
+ */
+static inline void iowait_sdma_inc(struct iowait *wait)
+{
+       atomic_inc(&wait->sdma_busy);
+}
+
+/**
+ * iowait_sdma_add - add count to pending
+ * @wait: iowait structure
+ * @count: amount to add to sdma_busy
+ */
+static inline void iowait_sdma_add(struct iowait *wait, int count)
+{
+       atomic_add(count, &wait->sdma_busy);
+}
+
+/**
+ * iowait_sdma_dec - note sdma complete
+ * @wait: iowait structure
+ *
+ * Return: the result of atomic_dec_and_test() on sdma_busy, i.e. non-zero
+ * when this decrement brought the counter to zero.
+ */
+static inline int iowait_sdma_dec(struct iowait *wait)
+{
+       return atomic_dec_and_test(&wait->sdma_busy);
+}
+
+/**
+ * iowait_pio_drain() - wait for pios to drain
+ *
+ * @wait: iowait structure
+ *
+ * This will delay until the iowait pios have
+ * completed or a timeout of HZ jiffies has elapsed,
+ * whichever comes first.  The wait result is not returned,
+ * so the caller cannot tell which of the two happened.
+ */
+static inline void iowait_pio_drain(struct iowait *wait)
+{
+       wait_event_timeout(wait->wait_pio,
+                          !atomic_read(&wait->pio_busy),
+                          HZ);
+}
+
+/**
+ * iowait_pio_pending() - return pio pending count
+ *
+ * @wait: iowait structure
+ *
+ * Return: the current value of the pio_busy counter.
+ */
+static inline int iowait_pio_pending(struct iowait *wait)
+{
+       return atomic_read(&wait->pio_busy);
+}
+
+/**
+ * iowait_pio_inc - note pio pending
+ * @wait: iowait structure
+ *
+ * Atomically increments pio_busy by one.
+ */
+static inline void iowait_pio_inc(struct iowait *wait)
+{
+       atomic_inc(&wait->pio_busy);
+}
+
+/**
+ * iowait_pio_dec - note pio complete
+ * @wait: iowait structure
+ *
+ * Return: the result of atomic_dec_and_test() on pio_busy, i.e. non-zero
+ * when this decrement brought the counter to zero.
+ */
+static inline int iowait_pio_dec(struct iowait *wait)
+{
+       return atomic_dec_and_test(&wait->pio_busy);
+}
+
+/**
+ * iowait_drain_wakeup() - trigger drain waiters
+ *
+ * @wait: iowait structure
+ *
+ * This will wake any waiters on both wait_dma and wait_pio, then
+ * invoke the sdma_drained callback if one was registered.
+ */
+static inline void iowait_drain_wakeup(struct iowait *wait)
+{
+       wake_up(&wait->wait_dma);
+       wake_up(&wait->wait_pio);
+       /* the callback is optional */
+       if (wait->sdma_drained)
+               wait->sdma_drained(wait);
+}
+
+/**
+ * iowait_get_txhead() - get packet off of iowait list
+ *
+ * @wait: wait structure
+ *
+ * Return: the first sdma_txreq on the tx_head list, already unlinked
+ * from it, or NULL when the list is empty.
+ */
+static inline struct sdma_txreq *iowait_get_txhead(struct iowait *wait)
+{
+       struct sdma_txreq *first;
+
+       if (list_empty(&wait->tx_head))
+               return NULL;
+
+       first = list_first_entry(&wait->tx_head, struct sdma_txreq, list);
+       list_del_init(&first->list);
+       return first;
+}
+
+#endif
diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c
new file mode 100644 (file)
index 0000000..2190295
--- /dev/null
@@ -0,0 +1,4449 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/net.h>
+/*
+ * Number of pkey table blocks that fit in OPA_SMP_DR_DATA_SIZE bytes of
+ * SMP data, where one block is OPA_PARTITION_TABLE_BLK_SIZE u16 entries.
+ */
+#define OPA_NUM_PKEY_BLOCKS_PER_SMP (OPA_SMP_DR_DATA_SIZE \
+                       / (OPA_PARTITION_TABLE_BLK_SIZE * sizeof(u16)))
+
+#include "hfi.h"
+#include "mad.h"
+#include "trace.h"
+#include "qp.h"
+
+/*
+ * The reset value from the FM is supposed to be 0xffff; the older 0x0fff
+ * value is defined as well so that both can be handled.
+ */
+#define OPA_LINK_WIDTH_RESET_OLD 0x0fff
+#define OPA_LINK_WIDTH_RESET 0xffff
+
+/*
+ * Turn the MAD header into a GetResp in place and report to the caller
+ * that a reply should be sent.  The verbs framework will handle the
+ * directed/LID route packet changes; only the direction status bit is
+ * set here, for directed-route SMPs.
+ */
+static int reply(struct ib_mad_hdr *smp)
+{
+       const int directed =
+               (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE);
+
+       smp->method = IB_MGMT_METHOD_GET_RESP;
+       if (directed)
+               smp->status |= IB_SMP_DIRECTION;
+
+       return IB_MAD_RESULT_REPLY | IB_MAD_RESULT_SUCCESS;
+}
+
+/* Zero the data area of @smp, as sized by opa_get_smp_data_size(). */
+static inline void clear_opa_smp_data(struct opa_smp *smp)
+{
+       memset(opa_get_smp_data(smp), 0, opa_get_smp_data_size(smp));
+}
+
+/*
+ * Build a LID-routed Trap(Notice) SMP carrying @data and post it to the SM.
+ * @ibp: port the trap is sent from
+ * @data: notice payload, copied into the SMP data area
+ * @len: number of bytes to copy from @data
+ *
+ * Nothing is sent when there is no send agent, the port is not ACTIVE,
+ * the previous trap's timeout has not yet expired, the send buffer cannot
+ * be allocated, or no address handle for the SM is available.
+ */
+static void send_trap(struct hfi1_ibport *ibp, void *data, unsigned len)
+{
+       struct ib_mad_send_buf *send_buf;
+       struct ib_mad_agent *agent;
+       struct opa_smp *smp;
+       int ret;
+       unsigned long flags;
+       unsigned long timeout;
+       int pkey_idx;
+       u32 qpn = ppd_from_ibp(ibp)->sm_trap_qp;
+
+       agent = ibp->rvp.send_agent;
+       if (!agent)
+               return;
+
+       /* o14-3.2.1 */
+       if (ppd_from_ibp(ibp)->lstate != IB_PORT_ACTIVE)
+               return;
+
+       /* o14-2 */
+       if (ibp->rvp.trap_timeout && time_before(jiffies,
+                                                ibp->rvp.trap_timeout))
+               return;
+
+       /* fall back to pkey index 1 when the limited mgmt pkey is missing */
+       pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY);
+       if (pkey_idx < 0) {
+               pr_warn("%s: failed to find limited mgmt pkey, defaulting 0x%x\n",
+                       __func__, hfi1_get_pkey(ibp, 1));
+               pkey_idx = 1;
+       }
+
+       send_buf = ib_create_send_mad(agent, qpn, pkey_idx, 0,
+                                     IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+                                     GFP_ATOMIC, IB_MGMT_BASE_VERSION);
+       if (IS_ERR(send_buf))
+               return;
+
+       smp = send_buf->mad;
+       smp->base_version = OPA_MGMT_BASE_VERSION;
+       smp->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+       smp->class_version = OPA_SMI_CLASS_VERSION;
+       smp->method = IB_MGMT_METHOD_TRAP;
+       ibp->rvp.tid++;
+       smp->tid = cpu_to_be64(ibp->rvp.tid);
+       smp->attr_id = IB_SMP_ATTR_NOTICE;
+       /* o14-1: smp->mkey = 0; */
+       memcpy(smp->route.lid.data, data, len);
+
+       /*
+        * Pick the address handle for the SM under the port lock: reuse the
+        * cached sm_ah, or create and cache one unless the SM LID is still
+        * the permissive LID.
+        */
+       spin_lock_irqsave(&ibp->rvp.lock, flags);
+       if (!ibp->rvp.sm_ah) {
+               if (ibp->rvp.sm_lid != be16_to_cpu(IB_LID_PERMISSIVE)) {
+                       struct ib_ah *ah;
+
+                       ah = hfi1_create_qp0_ah(ibp, ibp->rvp.sm_lid);
+                       if (IS_ERR(ah)) {
+                               ret = PTR_ERR(ah);
+                       } else {
+                               send_buf->ah = ah;
+                               ibp->rvp.sm_ah = ibah_to_rvtah(ah);
+                               ret = 0;
+                       }
+               } else {
+                       ret = -EINVAL;
+               }
+       } else {
+               send_buf->ah = &ibp->rvp.sm_ah->ibah;
+               ret = 0;
+       }
+       spin_unlock_irqrestore(&ibp->rvp.lock, flags);
+
+       if (!ret)
+               ret = ib_post_send_mad(send_buf, NULL);
+       if (!ret) {
+               /* 4.096 usec. * 2^subnet_timeout, expressed in usecs */
+               timeout = (4096 * (1UL << ibp->rvp.subnet_timeout)) / 1000;
+               ibp->rvp.trap_timeout = jiffies + usecs_to_jiffies(timeout);
+       } else {
+               /* nothing was posted: release the buffer, clear the timeout */
+               ib_free_send_mad(send_buf);
+               ibp->rvp.trap_timeout = 0;
+       }
+}
+
+/*
+ * Send a bad [PQ]_Key trap (ch. 14.3.8).
+ *
+ * Bumps the matching violation counter and the packet drop counter, then
+ * reports the offending key, SL, QPs and LIDs in a security notice.
+ */
+void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl,
+                   u32 qp1, u32 qp2, u16 lid1, u16 lid2)
+{
+       struct opa_mad_notice_attr notice;
+       const u32 own_lid = ppd_from_ibp(ibp)->lid;
+
+       /* account for the violation and the dropped packet */
+       ibp->rvp.n_pkt_drops++;
+       if (trap_num != OPA_TRAP_BAD_P_KEY)
+               ibp->rvp.qkey_violations++;
+       else
+               ibp->rvp.pkey_violations++;
+
+       /* build the violation trap */
+       memset(&notice, 0, sizeof(notice));
+       notice.generic_type = IB_NOTICE_TYPE_SECURITY;
+       notice.prod_type_lsb = IB_NOTICE_PROD_CA;
+       notice.trap_num = trap_num;
+       notice.issuer_lid = cpu_to_be32(own_lid);
+
+       notice.ntc_257_258.key = cpu_to_be32(key);
+       notice.ntc_257_258.sl = sl << 3;
+       notice.ntc_257_258.lid1 = cpu_to_be32((u32)lid1);
+       notice.ntc_257_258.lid2 = cpu_to_be32((u32)lid2);
+       notice.ntc_257_258.qp1 = cpu_to_be32(qp1);
+       notice.ntc_257_258.qp2 = cpu_to_be32(qp2);
+
+       send_trap(ibp, &notice, sizeof(notice));
+}
+
+/*
+ * Send a bad M_Key trap (ch. 14.3.9).
+ *
+ * For directed-route MADs the return path is included as well; it is
+ * truncated, and flagged as such, when hop_cnt exceeds the space the
+ * notice has for it.
+ */
+static void bad_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad,
+                    __be64 mkey, __be32 dr_slid, u8 return_path[], u8 hop_cnt)
+{
+       struct opa_mad_notice_attr notice;
+       const u32 own_lid = ppd_from_ibp(ibp)->lid;
+
+       memset(&notice, 0, sizeof(notice));
+
+       /* common notice header */
+       notice.generic_type = IB_NOTICE_TYPE_SECURITY;
+       notice.prod_type_lsb = IB_NOTICE_PROD_CA;
+       notice.trap_num = OPA_TRAP_BAD_M_KEY;
+       notice.issuer_lid = cpu_to_be32(own_lid);
+
+       /* details of the offending MAD */
+       notice.ntc_256.lid = notice.issuer_lid;
+       notice.ntc_256.mkey = mkey;
+       notice.ntc_256.method = mad->method;
+       notice.ntc_256.attr_id = mad->attr_id;
+       notice.ntc_256.attr_mod = mad->attr_mod;
+
+       if (mad->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+               unsigned int dr_flags = IB_NOTICE_TRAP_DR_NOTICE;
+
+               /* clamp the hop count and flag the truncation */
+               if (hop_cnt > ARRAY_SIZE(notice.ntc_256.dr_rtn_path)) {
+                       dr_flags |= IB_NOTICE_TRAP_DR_TRUNC;
+                       hop_cnt = ARRAY_SIZE(notice.ntc_256.dr_rtn_path);
+               }
+
+               notice.ntc_256.dr_slid = dr_slid;
+               notice.ntc_256.dr_trunc_hop = dr_flags | hop_cnt;
+               memcpy(notice.ntc_256.dr_rtn_path, return_path, hop_cnt);
+       }
+
+       send_trap(ibp, &notice, sizeof(notice));
+}
+
+/*
+ * Send a Port Capability Mask Changed trap (ch. 14.3.11).
+ *
+ * The notice carries the port's current port_cap_flags as the new mask.
+ */
+void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num)
+{
+       struct hfi1_devdata *dd = dd_from_dev(dev_from_rdi(rdi));
+       /* port_num is 1-based, the pport array is 0-based */
+       struct hfi1_ibport *ibp = &dd->pport[port_num - 1].ibport_data;
+       const u32 own_lid = ppd_from_ibp(ibp)->lid;
+       struct opa_mad_notice_attr notice;
+
+       memset(&notice, 0, sizeof(notice));
+
+       notice.trap_num = OPA_TRAP_CHANGE_CAPABILITY;
+       notice.generic_type = IB_NOTICE_TYPE_INFO;
+       notice.prod_type_lsb = IB_NOTICE_PROD_CA;
+       notice.issuer_lid = cpu_to_be32(own_lid);
+
+       notice.ntc_144.new_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags);
+       notice.ntc_144.lid = notice.issuer_lid;
+
+       send_trap(ibp, &notice, sizeof(notice));
+}
+
+/*
+ * Send a System Image GUID Changed trap (ch. 14.3.12).
+ *
+ * The notice carries ib_hfi1_sys_image_guid as the new GUID.
+ */
+void hfi1_sys_guid_chg(struct hfi1_ibport *ibp)
+{
+       const u32 own_lid = ppd_from_ibp(ibp)->lid;
+       struct opa_mad_notice_attr notice;
+
+       memset(&notice, 0, sizeof(notice));
+
+       notice.trap_num = OPA_TRAP_CHANGE_SYSGUID;
+       notice.generic_type = IB_NOTICE_TYPE_INFO;
+       notice.prod_type_lsb = IB_NOTICE_PROD_CA;
+       notice.issuer_lid = cpu_to_be32(own_lid);
+
+       notice.ntc_145.lid = notice.issuer_lid;
+       notice.ntc_145.new_sys_guid = ib_hfi1_sys_image_guid;
+
+       send_trap(ibp, &notice, sizeof(notice));
+}
+
+/*
+ * Send a Node Description Changed trap (ch. 14.3.13).
+ *
+ * This is sent as an OPA_TRAP_CHANGE_CAPABILITY notice with the
+ * OPA_NOTICE_TRAP_NODE_DESC_CHG change flag set.
+ */
+void hfi1_node_desc_chg(struct hfi1_ibport *ibp)
+{
+       const u32 own_lid = ppd_from_ibp(ibp)->lid;
+       struct opa_mad_notice_attr notice;
+
+       memset(&notice, 0, sizeof(notice));
+
+       notice.trap_num = OPA_TRAP_CHANGE_CAPABILITY;
+       notice.generic_type = IB_NOTICE_TYPE_INFO;
+       notice.prod_type_lsb = IB_NOTICE_PROD_CA;
+       notice.issuer_lid = cpu_to_be32(own_lid);
+
+       notice.ntc_144.change_flags =
+               cpu_to_be16(OPA_NOTICE_TRAP_NODE_DESC_CHG);
+       notice.ntc_144.lid = notice.issuer_lid;
+
+       send_trap(ibp, &notice, sizeof(notice));
+}
+
+/*
+ * Get(NodeDescription): copy the device's node_desc into the response
+ * data.  A non-zero attribute modifier is rejected with
+ * IB_SMP_INVALID_FIELD.  When @resp_len is supplied it is advanced by the
+ * size of the attribute.
+ */
+static int __subn_get_opa_nodedesc(struct opa_smp *smp, u32 am,
+                                  u8 *data, struct ib_device *ibdev,
+                                  u8 port, u32 *resp_len)
+{
+       struct opa_node_description *desc =
+               (struct opa_node_description *)data;
+
+       if (!am) {
+               memcpy(desc->data, ibdev->node_desc, sizeof(desc->data));
+               if (resp_len)
+                       *resp_len += sizeof(*desc);
+       } else {
+               smp->status |= IB_SMP_INVALID_FIELD;
+       }
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+/*
+ * Get(NodeInfo), OPA format: fill the response data with the node and port
+ * identification of @port.  A non-zero attribute modifier, an out-of-range
+ * port or a zero port GUID is rejected with IB_SMP_INVALID_FIELD and
+ * nothing is filled in.  When @resp_len is supplied it is advanced by the
+ * size of the attribute.
+ */
+static int __subn_get_opa_nodeinfo(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct opa_node_info *info = (struct opa_node_info *)data;
+       unsigned idx = port - 1; /* IB number port from 1, hw from 0 */
+
+       /* GUID 0 is illegal */
+       if (am || idx >= dd->num_pports || dd->pport[idx].guid == 0) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               goto out;
+       }
+
+       /* versions and node type */
+       info->base_version = OPA_MGMT_BASE_VERSION;
+       info->class_version = OPA_SMI_CLASS_VERSION;
+       info->node_type = 1;     /* channel adapter */
+       info->num_ports = ibdev->phys_port_cnt;
+
+       /* GUIDs: the system image GUID is already in network order */
+       info->system_image_guid = ib_hfi1_sys_image_guid;
+       /* Use first-port GUID as node */
+       info->node_guid = cpu_to_be64(dd->pport->guid);
+       info->port_guid = cpu_to_be64(dd->pport[idx].guid);
+
+       /* device identification */
+       info->partition_cap = cpu_to_be16(hfi1_get_npkeys(dd));
+       info->device_id = cpu_to_be16(dd->pcidev->device);
+       info->revision = cpu_to_be32(dd->minrev);
+       info->local_port_num = port;
+       info->vendor_id[0] = dd->oui1;
+       info->vendor_id[1] = dd->oui2;
+       info->vendor_id[2] = dd->oui3;
+
+       if (resp_len)
+               *resp_len += sizeof(*info);
+out:
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+/*
+ * Get(NodeInfo), IB format.  Unlike the OPA variant, an invalid request
+ * (non-zero attribute modifier, out-of-range port or zero port GUID) only
+ * sets IB_SMP_INVALID_FIELD and skips the port GUID; every other field is
+ * still filled in.
+ */
+static int subn_get_nodeinfo(struct ib_smp *smp, struct ib_device *ibdev,
+                            u8 port)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct ib_node_info *info = (struct ib_node_info *)&smp->data;
+       unsigned idx = port - 1; /* IB number port from 1, hw from 0 */
+
+       /* GUID 0 is illegal */
+       if (!smp->attr_mod && idx < dd->num_pports &&
+           dd->pport[idx].guid != 0)
+               info->port_guid = cpu_to_be64(dd->pport[idx].guid);
+       else
+               smp->status |= IB_SMP_INVALID_FIELD;
+
+       /* versions and node type */
+       info->base_version = OPA_MGMT_BASE_VERSION;
+       info->class_version = OPA_SMI_CLASS_VERSION;
+       info->node_type = 1;     /* channel adapter */
+       info->num_ports = ibdev->phys_port_cnt;
+
+       /* GUIDs: the system image GUID is already in network order */
+       info->sys_guid = ib_hfi1_sys_image_guid;
+       /* Use first-port GUID as node */
+       info->node_guid = cpu_to_be64(dd->pport->guid);
+
+       /* device identification */
+       info->partition_cap = cpu_to_be16(hfi1_get_npkeys(dd));
+       info->device_id = cpu_to_be16(dd->pcidev->device);
+       info->revision = cpu_to_be32(dd->minrev);
+       info->local_port_num = port;
+       info->vendor_id[0] = dd->oui1;
+       info->vendor_id[1] = dd->oui2;
+       info->vendor_id[2] = dd->oui3;
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+/*
+ * Pass @w to hfi1_set_ib_cfg() as the HFI1_IB_CFG_LWID_ENB setting; the
+ * return value is deliberately discarded.
+ */
+static void set_link_width_enabled(struct hfi1_pportdata *ppd, u32 w)
+{
+       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LWID_ENB, w);
+}
+
+/*
+ * Pass @w to hfi1_set_ib_cfg() as the HFI1_IB_CFG_LWID_DG_ENB setting; the
+ * return value is deliberately discarded.
+ */
+static void set_link_width_downgrade_enabled(struct hfi1_pportdata *ppd, u32 w)
+{
+       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LWID_DG_ENB, w);
+}
+
+/*
+ * Pass @s to hfi1_set_ib_cfg() as the HFI1_IB_CFG_SPD_ENB setting; the
+ * return value is deliberately discarded.
+ */
+static void set_link_speed_enabled(struct hfi1_pportdata *ppd, u32 s)
+{
+       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_SPD_ENB, s);
+}
+
+/*
+ * Validate the M_Key of an incoming MAD against the port's M_Key.
+ *
+ * The key is accepted when IB_MAD_IGNORE_MKEY is set in @mad_flags, when
+ * the port's M_Key is zero, or when @mkey matches it.  A rejected key is
+ * counted as a violation for Set and TrapRepress, and for Get only when
+ * the protection level is 2 or higher; in those cases the lease timer is
+ * started if needed and a bad M_Key trap is sent.
+ *
+ * Return: 1 when a violation was recorded, 0 otherwise.
+ */
+static int check_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad,
+                     int mad_flags, __be64 mkey, __be32 dr_slid,
+                     u8 return_path[], u8 hop_cnt)
+{
+       int valid_mkey = 0;
+       int ret = 0;
+
+       /* Is the mkey in the process of expiring? */
+       if (ibp->rvp.mkey_lease_timeout &&
+           time_after_eq(jiffies, ibp->rvp.mkey_lease_timeout)) {
+               /* Clear timeout and mkey protection field. */
+               ibp->rvp.mkey_lease_timeout = 0;
+               ibp->rvp.mkeyprot = 0;
+       }
+
+       if ((mad_flags & IB_MAD_IGNORE_MKEY) ||  ibp->rvp.mkey == 0 ||
+           ibp->rvp.mkey == mkey)
+               valid_mkey = 1;
+
+       /* Unset lease timeout on any valid Get/Set/TrapRepress */
+       if (valid_mkey && ibp->rvp.mkey_lease_timeout &&
+           (mad->method == IB_MGMT_METHOD_GET ||
+            mad->method == IB_MGMT_METHOD_SET ||
+            mad->method == IB_MGMT_METHOD_TRAP_REPRESS))
+               ibp->rvp.mkey_lease_timeout = 0;
+
+       if (!valid_mkey) {
+               switch (mad->method) {
+               case IB_MGMT_METHOD_GET:
+                       /* Bad mkey not a violation below level 2 */
+                       if (ibp->rvp.mkeyprot < 2)
+                               break;
+                       /* fall through */
+               case IB_MGMT_METHOD_SET:
+               case IB_MGMT_METHOD_TRAP_REPRESS:
+                       /* the violation counter saturates at 0xFFFF */
+                       if (ibp->rvp.mkey_violations != 0xFFFF)
+                               ++ibp->rvp.mkey_violations;
+                       /* start the lease timer only if it is not running */
+                       if (!ibp->rvp.mkey_lease_timeout &&
+                           ibp->rvp.mkey_lease_period)
+                               ibp->rvp.mkey_lease_timeout = jiffies +
+                                       ibp->rvp.mkey_lease_period * HZ;
+                       /* Generate a trap notice. */
+                       bad_mkey(ibp, mad, mkey, dr_slid, return_path,
+                                hop_cnt);
+                       ret = 1;
+               }
+       }
+
+       return ret;
+}
+
+/*
+ * The SMA caches reads from LCB registers in case the LCB is unavailable.
+ * (The LCB is unavailable in certain link states, for example.)
+ *
+ * Each lcb_datum pairs a register offset (off) with its last cached
+ * value (val).  Only offsets listed in lcb_cache[] can be cached; the
+ * read/write helpers warn and fail for any other offset.
+ */
+struct lcb_datum {
+       u32 off;
+       u64 val;
+};
+
+static struct lcb_datum lcb_cache[] = {
+       { DC_LCB_STS_ROUND_TRIP_LTP_CNT, 0 },
+};
+
+/*
+ * Store @val in the lcb_cache[] entry whose offset is @off.
+ *
+ * Return: 0 on success, -1 (after a warning) when @off is not a cached
+ * offset.
+ */
+static int write_lcb_cache(u32 off, u64 val)
+{
+       struct lcb_datum *entry = lcb_cache;
+       struct lcb_datum *end = lcb_cache + ARRAY_SIZE(lcb_cache);
+
+       for (; entry < end; entry++) {
+               if (entry->off != off)
+                       continue;
+               entry->val = val;
+               return 0;
+       }
+
+       pr_warn("%s bad offset 0x%x\n", __func__, off);
+       return -1;
+}
+
+/*
+ * Fetch into @val the value cached in the lcb_cache[] entry whose offset
+ * is @off.
+ *
+ * Return: 0 on success, -1 (after a warning, with @val untouched) when
+ * @off is not a cached offset.
+ */
+static int read_lcb_cache(u32 off, u64 *val)
+{
+       struct lcb_datum *entry = lcb_cache;
+       struct lcb_datum *end = lcb_cache + ARRAY_SIZE(lcb_cache);
+
+       for (; entry < end; entry++) {
+               if (entry->off != off)
+                       continue;
+               *val = entry->val;
+               return 0;
+       }
+
+       pr_warn("%s bad offset 0x%x\n", __func__, off);
+       return -1;
+}
+
+/*
+ * Read DC_LCB_STS_ROUND_TRIP_LTP_CNT from the LCB and store it in the LCB
+ * cache.  When the register cannot be read, an error is logged and the
+ * cached value is left as it was.
+ */
+void read_ltp_rtt(struct hfi1_devdata *dd)
+{
+       u64 rtt;
+
+       if (!read_lcb_csr(dd, DC_LCB_STS_ROUND_TRIP_LTP_CNT, &rtt)) {
+               write_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, rtt);
+               return;
+       }
+
+       dd_dev_err(dd, "%s: unable to read LTP RTT\n", __func__);
+}
+
+static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       int i;
+       struct hfi1_devdata *dd;
+       struct hfi1_pportdata *ppd;
+       struct hfi1_ibport *ibp;
+       struct opa_port_info *pi = (struct opa_port_info *)data;
+       u8 mtu;
+       u8 credit_rate;
+       u8 is_beaconing_active;
+       u32 state;
+       u32 num_ports = OPA_AM_NPORT(am);
+       u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
+       u32 buffer_units;
+       u64 tmp = 0;
+
+       if (num_ports != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       dd = dd_from_ibdev(ibdev);
+       /* IB numbers ports from 1, hw from 0 */
+       ppd = dd->pport + (port - 1);
+       ibp = &ppd->ibport_data;
+
+       if (ppd->vls_supported / 2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) ||
+           ppd->vls_supported > ARRAY_SIZE(dd->vld)) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       pi->lid = cpu_to_be32(ppd->lid);
+
+       /* Only return the mkey if the protection field allows it. */
+       if (!(smp->method == IB_MGMT_METHOD_GET &&
+             ibp->rvp.mkey != smp->mkey &&
+             ibp->rvp.mkeyprot == 1))
+               pi->mkey = ibp->rvp.mkey;
+
+       pi->subnet_prefix = ibp->rvp.gid_prefix;
+       pi->sm_lid = cpu_to_be32(ibp->rvp.sm_lid);
+       pi->ib_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags);
+       pi->mkey_lease_period = cpu_to_be16(ibp->rvp.mkey_lease_period);
+       pi->sm_trap_qp = cpu_to_be32(ppd->sm_trap_qp);
+       pi->sa_qp = cpu_to_be32(ppd->sa_qp);
+
+       pi->link_width.enabled = cpu_to_be16(ppd->link_width_enabled);
+       pi->link_width.supported = cpu_to_be16(ppd->link_width_supported);
+       pi->link_width.active = cpu_to_be16(ppd->link_width_active);
+
+       pi->link_width_downgrade.supported =
+                       cpu_to_be16(ppd->link_width_downgrade_supported);
+       pi->link_width_downgrade.enabled =
+                       cpu_to_be16(ppd->link_width_downgrade_enabled);
+       pi->link_width_downgrade.tx_active =
+                       cpu_to_be16(ppd->link_width_downgrade_tx_active);
+       pi->link_width_downgrade.rx_active =
+                       cpu_to_be16(ppd->link_width_downgrade_rx_active);
+
+       pi->link_speed.supported = cpu_to_be16(ppd->link_speed_supported);
+       pi->link_speed.active = cpu_to_be16(ppd->link_speed_active);
+       pi->link_speed.enabled = cpu_to_be16(ppd->link_speed_enabled);
+
+       state = driver_lstate(ppd);
+
+       if (start_of_sm_config && (state == IB_PORT_INIT))
+               ppd->is_sm_config_started = 1;
+
+       pi->port_phys_conf = (ppd->port_type & 0xf);
+
+#if PI_LED_ENABLE_SUP
+       pi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4;
+       pi->port_states.ledenable_offlinereason |=
+               ppd->is_sm_config_started << 5;
+       /*
+        * This pairs with the memory barrier in hfi1_start_led_override to
+        * ensure that we read the correct state of LED beaconing represented
+        * by led_override_timer_active
+        */
+       smp_rmb();
+       is_beaconing_active = !!atomic_read(&ppd->led_override_timer_active);
+       pi->port_states.ledenable_offlinereason |= is_beaconing_active << 6;
+       pi->port_states.ledenable_offlinereason |=
+               ppd->offline_disabled_reason;
+#else
+       pi->port_states.offline_reason = ppd->neighbor_normal << 4;
+       pi->port_states.offline_reason |= ppd->is_sm_config_started << 5;
+       pi->port_states.offline_reason |= ppd->offline_disabled_reason;
+#endif /* PI_LED_ENABLE_SUP */
+
+       pi->port_states.portphysstate_portstate =
+               (hfi1_ibphys_portstate(ppd) << 4) | state;
+
+       pi->mkeyprotect_lmc = (ibp->rvp.mkeyprot << 6) | ppd->lmc;
+
+       memset(pi->neigh_mtu.pvlx_to_mtu, 0, sizeof(pi->neigh_mtu.pvlx_to_mtu));
+       for (i = 0; i < ppd->vls_supported; i++) {
+               mtu = mtu_to_enum(dd->vld[i].mtu, HFI1_DEFAULT_ACTIVE_MTU);
+               if ((i % 2) == 0)
+                       pi->neigh_mtu.pvlx_to_mtu[i / 2] |= (mtu << 4);
+               else
+                       pi->neigh_mtu.pvlx_to_mtu[i / 2] |= mtu;
+       }
+       /* don't forget VL 15 */
+       mtu = mtu_to_enum(dd->vld[15].mtu, 2048);
+       pi->neigh_mtu.pvlx_to_mtu[15 / 2] |= mtu;
+       pi->smsl = ibp->rvp.sm_sl & OPA_PI_MASK_SMSL;
+       pi->operational_vls = hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS);
+       pi->partenforce_filterraw |=
+               (ppd->linkinit_reason & OPA_PI_MASK_LINKINIT_REASON);
+       if (ppd->part_enforce & HFI1_PART_ENFORCE_IN)
+               pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_IN;
+       if (ppd->part_enforce & HFI1_PART_ENFORCE_OUT)
+               pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_OUT;
+       pi->mkey_violations = cpu_to_be16(ibp->rvp.mkey_violations);
+       /* P_KeyViolations are counted by hardware. */
+       pi->pkey_violations = cpu_to_be16(ibp->rvp.pkey_violations);
+       pi->qkey_violations = cpu_to_be16(ibp->rvp.qkey_violations);
+
+       pi->vl.cap = ppd->vls_supported;
+       pi->vl.high_limit = cpu_to_be16(ibp->rvp.vl_high_limit);
+       pi->vl.arb_high_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_HIGH_CAP);
+       pi->vl.arb_low_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_LOW_CAP);
+
+       pi->clientrereg_subnettimeout = ibp->rvp.subnet_timeout;
+
+       pi->port_link_mode  = cpu_to_be16(OPA_PORT_LINK_MODE_OPA << 10 |
+                                         OPA_PORT_LINK_MODE_OPA << 5 |
+                                         OPA_PORT_LINK_MODE_OPA);
+
+       pi->port_ltp_crc_mode = cpu_to_be16(ppd->port_ltp_crc_mode);
+
+       pi->port_mode = cpu_to_be16(
+                               ppd->is_active_optimize_enabled ?
+                                       OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE : 0);
+
+       pi->port_packet_format.supported =
+               cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B);
+       pi->port_packet_format.enabled =
+               cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B);
+
+       /* flit_control.interleave is (OPA V1, version .76):
+        * bits         use
+        * ----         ---
+        * 2            res
+        * 2            DistanceSupported
+        * 2            DistanceEnabled
+        * 5            MaxNextLevelTxEnabled
+        * 5            MaxNestLevelRxSupported
+        *
+        * HFI supports only "distance mode 1" (see OPA V1, version .76,
+        * section 9.6.2), so set DistanceSupported, DistanceEnabled
+        * to 0x1.
+        */
+       pi->flit_control.interleave = cpu_to_be16(0x1400);
+
+       pi->link_down_reason = ppd->local_link_down_reason.sma;
+       pi->neigh_link_down_reason = ppd->neigh_link_down_reason.sma;
+       pi->port_error_action = cpu_to_be32(ppd->port_error_action);
+       pi->mtucap = mtu_to_enum(hfi1_max_mtu, IB_MTU_4096);
+
+       /* 32.768 usec. response time (guessing) */
+       pi->resptimevalue = 3;
+
+       pi->local_port_num = port;
+
+       /* buffer info for FM */
+       pi->overall_buffer_space = cpu_to_be16(dd->link_credits);
+
+       pi->neigh_node_guid = cpu_to_be64(ppd->neighbor_guid);
+       pi->neigh_port_num = ppd->neighbor_port_number;
+       pi->port_neigh_mode =
+               (ppd->neighbor_type & OPA_PI_MASK_NEIGH_NODE_TYPE) |
+               (ppd->mgmt_allowed ? OPA_PI_MASK_NEIGH_MGMT_ALLOWED : 0) |
+               (ppd->neighbor_fm_security ?
+                       OPA_PI_MASK_NEIGH_FW_AUTH_BYPASS : 0);
+
+       /* HFIs shall always return VL15 credits to their
+        * neighbor in a timely manner, without any credit return pacing.
+        */
+       credit_rate = 0;
+       buffer_units  = (dd->vau) & OPA_PI_MASK_BUF_UNIT_BUF_ALLOC;
+       buffer_units |= (dd->vcu << 3) & OPA_PI_MASK_BUF_UNIT_CREDIT_ACK;
+       buffer_units |= (credit_rate << 6) &
+                               OPA_PI_MASK_BUF_UNIT_VL15_CREDIT_RATE;
+       buffer_units |= (dd->vl15_init << 11) & OPA_PI_MASK_BUF_UNIT_VL15_INIT;
+       pi->buffer_units = cpu_to_be32(buffer_units);
+
+       pi->opa_cap_mask = cpu_to_be16(OPA_CAP_MASK3_IsSharedSpaceSupported);
+
+       /* HFI supports a replay buffer 128 LTPs in size */
+       pi->replay_depth.buffer = 0x80;
+       /* read the cached value of DC_LCB_STS_ROUND_TRIP_LTP_CNT */
+       read_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, &tmp);
+
+       /*
+        * this counter is 16 bits wide, but the replay_depth.wire
+        * variable is only 8 bits
+        */
+       if (tmp > 0xff)
+               tmp = 0xff;
+       pi->replay_depth.wire = tmp;
+
+       if (resp_len)
+               *resp_len += sizeof(struct opa_port_info);
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+/**
+ * get_pkeys - copy a port's PKEY table to the caller
+ * @dd: the hfi1 device data
+ * @port: the IB port number (1-based; port N is entry N - 1 of dd->pport)
+ * @pkeys: destination buffer; receives sizeof(ppd->pkeys) bytes
+ *
+ * Return: always 0.
+ */
+static int get_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys)
+{
+       struct hfi1_pportdata *port_data = &dd->pport[port - 1];
+
+       memcpy(pkeys, port_data->pkeys, sizeof(port_data->pkeys));
+       return 0;
+}
+
+/*
+ * __subn_get_opa_pkeytable - answer a Get of the OPA PKEY table
+ * @smp: the SMP being answered; IB_SMP_INVALID_FIELD is OR'ed into its
+ *       status when the request is rejected
+ * @am: attribute modifier; OPA_AM_NBLK() gives the block count and the
+ *      low 11 bits give the starting block
+ * @data: attribute payload; the table is written here in big-endian
+ * @ibdev: the device the request arrived on
+ * @port: the IB port number
+ * @resp_len: if non-NULL, incremented by the size of the requested blocks
+ *
+ * Only a request starting at block 0 returns data; any other starting
+ * block is flagged as an invalid field.  Returns the result of reply().
+ */
+static int __subn_get_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data,
+                                   struct ib_device *ibdev, u8 port,
+                                   u32 *resp_len)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       u32 n_blocks_req = OPA_AM_NBLK(am);
+       u32 start_block = am & 0x7ff;
+       unsigned npkeys = hfi1_get_npkeys(dd);
+       __be16 *wire = (__be16 *)data;
+       u16 *host = (u16 *)data;
+       u16 n_blocks_avail;
+       size_t size;
+       int i;
+
+       if (!n_blocks_req) {
+               pr_warn("OPA Get PKey AM Invalid : P = %d; B = 0x%x; N = 0x%x\n",
+                       port, start_block, n_blocks_req);
+               goto invalid;
+       }
+
+       n_blocks_avail = (u16)(npkeys / OPA_PARTITION_TABLE_BLK_SIZE) + 1;
+
+       if (n_blocks_req > OPA_NUM_PKEY_BLOCKS_PER_SMP ||
+           start_block + n_blocks_req > n_blocks_avail) {
+               pr_warn("OPA Get PKey AM Invalid : s 0x%x; req 0x%x; "
+                       "avail 0x%x; blk/smp 0x%lx\n",
+                       start_block, n_blocks_req, n_blocks_avail,
+                       OPA_NUM_PKEY_BLOCKS_PER_SMP);
+               goto invalid;
+       }
+
+       /* the real pkeys are only returned for the first block */
+       if (start_block != 0)
+               goto invalid;
+
+       /* fetch in CPU order, then convert in place to wire order */
+       get_pkeys(dd, port, host);
+       for (i = 0; i < npkeys; i++)
+               wire[i] = cpu_to_be16(host[i]);
+
+       if (resp_len) {
+               size = (n_blocks_req * OPA_PARTITION_TABLE_BLK_SIZE) *
+                       sizeof(u16);
+               *resp_len += size;
+       }
+       return reply((struct ib_mad_hdr *)smp);
+
+invalid:
+       smp->status |= IB_SMP_INVALID_FIELD;
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+/*
+ * Verdicts for a requested port state change.  These are the values
+ * stored in the {logical,physical}_state_transitions tables and returned
+ * by the *_transition_allowed() helpers.
+ */
+enum {
+       HFI_TRANSITION_DISALLOWED,
+       HFI_TRANSITION_IGNORED,
+       HFI_TRANSITION_ALLOWED,
+       HFI_TRANSITION_UNDEFINED,
+};
+
+/*
+ * Use shortened names to improve readability of the
+ * {logical,physical}_state_transitions tables:
+ * D = disallowed, I = ignored, A = allowed, U = undefined.
+ */
+enum {
+       __D = HFI_TRANSITION_DISALLOWED,
+       __I = HFI_TRANSITION_IGNORED,
+       __A = HFI_TRANSITION_ALLOWED,
+       __U = HFI_TRANSITION_UNDEFINED,
+};
+
+/*
+ * IB_PORTPHYSSTATE_POLLING (2) through OPA_PORTPHYSSTATE_MAX (11) are
+ * represented in physical_state_transitions, i.e. this is the number of
+ * rows and columns of that table.
+ */
+#define __N_PHYSTATES (OPA_PORTPHYSSTATE_MAX - IB_PORTPHYSSTATE_POLLING + 1)
+
+/*
+ * Within physical_state_transitions, rows represent "old" states,
+ * columns "new" states, and physical_state_transitions.allowed[old][new]
+ * indicates if the transition from old state to new state is legal (see
+ * OPAg1v1, Table 6-4).
+ *
+ * Both indices are the state value minus IB_PORTPHYSSTATE_POLLING (see
+ * physical_transition_allowed()); the numbers in the row and column
+ * comments below are the unadjusted state values.
+ */
+static const struct {
+       u8 allowed[__N_PHYSTATES][__N_PHYSTATES];
+} physical_state_transitions = {
+       {
+               /* 2    3    4    5    6    7    8    9   10   11 */
+       /* 2 */ { __A, __A, __D, __D, __D, __D, __D, __D, __D, __D },
+       /* 3 */ { __A, __I, __D, __D, __D, __D, __D, __D, __D, __A },
+       /* 4 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
+       /* 5 */ { __A, __A, __D, __I, __D, __D, __D, __D, __D, __D },
+       /* 6 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
+       /* 7 */ { __D, __A, __D, __D, __D, __I, __D, __D, __D, __D },
+       /* 8 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
+       /* 9 */ { __I, __A, __D, __D, __D, __D, __D, __I, __D, __D },
+       /*10 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
+       /*11 */ { __D, __A, __D, __D, __D, __D, __D, __D, __D, __I },
+       }
+};
+
+/*
+ * IB_PORT_DOWN (1) through IB_PORT_ACTIVE_DEFER (5) are represented in
+ * logical_state_transitions, i.e. this is the number of rows and columns
+ * of that table.
+ */
+
+#define __N_LOGICAL_STATES (IB_PORT_ACTIVE_DEFER - IB_PORT_DOWN + 1)
+
+/*
+ * Within logical_state_transitions rows represent "old" states,
+ * columns "new" states, and logical_state_transitions.allowed[old][new]
+ * indicates if the transition from old state to new state is legal (see
+ * OPAg1v1, Table 9-12).
+ *
+ * Both indices are the state value minus IB_PORT_DOWN (see
+ * logical_transition_allowed()); the numbers in the row and column
+ * comments below are the unadjusted state values.
+ */
+static const struct {
+       u8 allowed[__N_LOGICAL_STATES][__N_LOGICAL_STATES];
+} logical_state_transitions = {
+       {
+               /* 1    2    3    4    5 */
+       /* 1 */ { __I, __D, __D, __D, __U},
+       /* 2 */ { __D, __I, __A, __D, __U},
+       /* 3 */ { __D, __D, __I, __A, __U},
+       /* 4 */ { __D, __D, __I, __I, __U},
+       /* 5 */ { __U, __U, __U, __U, __U},
+       }
+};
+
+/*
+ * logical_transition_allowed - classify a logical port state change
+ * @old: current logical state
+ * @new: requested logical state
+ *
+ * Returns one of the HFI_TRANSITION_* verdicts.  A request for
+ * IB_PORT_NOP is always allowed; states outside
+ * IB_PORT_NOP..IB_PORT_ACTIVE_DEFER, or below IB_PORT_DOWN once NOP has
+ * been handled, yield HFI_TRANSITION_UNDEFINED.
+ */
+static int logical_transition_allowed(int old, int new)
+{
+       int old_valid = (old >= IB_PORT_NOP && old <= IB_PORT_ACTIVE_DEFER);
+       int new_valid = (new >= IB_PORT_NOP && new <= IB_PORT_ACTIVE_DEFER);
+       int row, col;
+
+       if (!old_valid || !new_valid) {
+               pr_warn("invalid logical state(s) (old %d new %d)\n",
+                       old, new);
+               return HFI_TRANSITION_UNDEFINED;
+       }
+
+       /* a request for "no change" is always acceptable */
+       if (new == IB_PORT_NOP)
+               return HFI_TRANSITION_ALLOWED;
+
+       /* the table starts at IB_PORT_DOWN */
+       row = old - IB_PORT_DOWN;
+       col = new - IB_PORT_DOWN;
+       if (row < 0 || col < 0)
+               return HFI_TRANSITION_UNDEFINED;
+
+       return logical_state_transitions.allowed[row][col];
+}
+
+/*
+ * physical_transition_allowed - classify a physical port state change
+ * @old: current physical state
+ * @new: requested physical state
+ *
+ * Returns one of the HFI_TRANSITION_* verdicts.  A request for
+ * IB_PORTPHYSSTATE_NOP is always allowed; states outside
+ * IB_PORTPHYSSTATE_NOP..OPA_PORTPHYSSTATE_MAX, or below
+ * IB_PORTPHYSSTATE_POLLING once NOP has been handled, yield
+ * HFI_TRANSITION_UNDEFINED.
+ */
+static int physical_transition_allowed(int old, int new)
+{
+       int old_valid = (old >= IB_PORTPHYSSTATE_NOP &&
+                        old <= OPA_PORTPHYSSTATE_MAX);
+       int new_valid = (new >= IB_PORTPHYSSTATE_NOP &&
+                        new <= OPA_PORTPHYSSTATE_MAX);
+       int row, col;
+
+       if (!old_valid || !new_valid) {
+               pr_warn("invalid physical state(s) (old %d new %d)\n",
+                       old, new);
+               return HFI_TRANSITION_UNDEFINED;
+       }
+
+       /* a request for "no change" is always acceptable */
+       if (new == IB_PORTPHYSSTATE_NOP)
+               return HFI_TRANSITION_ALLOWED;
+
+       /* the table starts at IB_PORTPHYSSTATE_POLLING */
+       row = old - IB_PORTPHYSSTATE_POLLING;
+       col = new - IB_PORTPHYSSTATE_POLLING;
+       if (row < 0 || col < 0)
+               return HFI_TRANSITION_UNDEFINED;
+
+       return physical_state_transitions.allowed[row][col];
+}
+
+/*
+ * port_states_transition_allowed - vet a combined logical/physical request
+ * @ppd: the port whose current states are the starting point
+ * @logical_new: requested logical state
+ * @physical_new: requested physical state
+ *
+ * The logical request is checked first, then the physical one; the first
+ * disallowed or undefined verdict is logged and returned as is.
+ * Otherwise the result is HFI_TRANSITION_IGNORED when both halves are
+ * ignored or when the physical request is Offline -> Polling, and
+ * HFI_TRANSITION_ALLOWED in every other case.
+ */
+static int port_states_transition_allowed(struct hfi1_pportdata *ppd,
+                                         u32 logical_new, u32 physical_new)
+{
+       u32 physical_old = driver_physical_state(ppd);
+       u32 logical_old = driver_logical_state(ppd);
+       int logical_verdict, physical_verdict;
+
+       logical_verdict = logical_transition_allowed(logical_old, logical_new);
+       switch (logical_verdict) {
+       case HFI_TRANSITION_DISALLOWED:
+       case HFI_TRANSITION_UNDEFINED:
+               pr_warn("invalid logical state transition %s -> %s\n",
+                       opa_lstate_name(logical_old),
+                       opa_lstate_name(logical_new));
+               return logical_verdict;
+       default:
+               break;
+       }
+
+       physical_verdict = physical_transition_allowed(physical_old,
+                                                      physical_new);
+       switch (physical_verdict) {
+       case HFI_TRANSITION_DISALLOWED:
+       case HFI_TRANSITION_UNDEFINED:
+               pr_warn("invalid physical state transition %s -> %s\n",
+                       opa_pstate_name(physical_old),
+                       opa_pstate_name(physical_new));
+               return physical_verdict;
+       default:
+               break;
+       }
+
+       /*
+        * Nothing to do if neither half asks for a change, or if the
+        * request is a Physical Port State change from 'Offline' to
+        * 'Polling', which should be ignored.
+        */
+       if ((logical_verdict == HFI_TRANSITION_IGNORED &&
+            physical_verdict == HFI_TRANSITION_IGNORED) ||
+           (physical_old == OPA_PORTPHYSSTATE_OFFLINE &&
+            physical_new == IB_PORTPHYSSTATE_POLLING))
+               return HFI_TRANSITION_IGNORED;
+
+       /* at least one of the two verdicts is HFI_TRANSITION_ALLOWED */
+       return HFI_TRANSITION_ALLOWED;
+}
+
+/*
+ * set_port_states - apply a requested logical/physical port state change
+ * @ppd: the port to change
+ * @smp: the SMP being processed; IB_SMP_INVALID_FIELD is OR'ed into its
+ *       status when the request is rejected
+ * @logical_state: requested logical port state
+ * @phys_state: requested physical port state
+ * @suppress_idle_sma: when non-zero, do not send the SMA_IDLE_ARM idle
+ *                     message after a successful move to Armed
+ *
+ * Returns IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED when the port
+ * was asked to go to Disabled and smp->hop_cnt is non-zero (no reply is
+ * to be sent); returns 0 in every other case, including rejected
+ * requests, which are reported through smp->status only.
+ */
+static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp,
+                          u32 logical_state, u32 phys_state,
+                          int suppress_idle_sma)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u32 link_state;
+       int ret;
+
+       ret = port_states_transition_allowed(ppd, logical_state, phys_state);
+       if (ret == HFI_TRANSITION_DISALLOWED ||
+           ret == HFI_TRANSITION_UNDEFINED) {
+               /* error message emitted above */
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return 0;
+       }
+
+       /* an ignored request leaves the link untouched and is not an error */
+       if (ret == HFI_TRANSITION_IGNORED)
+               return 0;
+
+       /*
+        * A physical state may only be requested together with logical
+        * Down or NOP.
+        * NOTE(review): the request is flagged invalid here but processing
+        * still continues into the switch below - confirm this is intended.
+        */
+       if ((phys_state != IB_PORTPHYSSTATE_NOP) &&
+           !(logical_state == IB_PORT_DOWN ||
+             logical_state == IB_PORT_NOP)){
+               pr_warn("SubnSet(OPA_PortInfo) port state invalid: logical_state 0x%x physical_state 0x%x\n",
+                       logical_state, phys_state);
+               smp->status |= IB_SMP_INVALID_FIELD;
+       }
+
+       /*
+        * Logical state changes are summarized in OPAv1g1 spec.,
+        * Table 9-12; physical state changes are summarized in
+        * OPAv1g1 spec., Table 6.4.
+        */
+       switch (logical_state) {
+       case IB_PORT_NOP:
+               /* NOP/NOP changes nothing; NOP with a physical state is
+                * handled like Down
+                */
+               if (phys_state == IB_PORTPHYSSTATE_NOP)
+                       break;
+               /* FALLTHROUGH */
+       case IB_PORT_DOWN:
+               /* map the requested physical state to a driver link state */
+               if (phys_state == IB_PORTPHYSSTATE_NOP) {
+                       link_state = HLS_DN_DOWNDEF;
+               } else if (phys_state == IB_PORTPHYSSTATE_POLLING) {
+                       link_state = HLS_DN_POLL;
+                       set_link_down_reason(ppd, OPA_LINKDOWN_REASON_FM_BOUNCE,
+                                            0, OPA_LINKDOWN_REASON_FM_BOUNCE);
+               } else if (phys_state == IB_PORTPHYSSTATE_DISABLED) {
+                       link_state = HLS_DN_DISABLE;
+               } else {
+                       pr_warn("SubnSet(OPA_PortInfo) invalid physical state 0x%x\n",
+                               phys_state);
+                       smp->status |= IB_SMP_INVALID_FIELD;
+                       break;
+               }
+
+               if ((link_state == HLS_DN_POLL ||
+                    link_state == HLS_DN_DOWNDEF)) {
+                       /*
+                        * Going to poll.  No matter what the current state,
+                        * always move offline first, then tune and start the
+                        * link.  This correctly handles a FM link bounce and
+                        * a link enable.  Going offline is a no-op if already
+                        * offline.
+                        */
+                       set_link_state(ppd, HLS_DN_OFFLINE);
+                       tune_serdes(ppd);
+                       start_link(ppd);
+               } else {
+                       set_link_state(ppd, link_state);
+               }
+               /*
+                * When disabling, record "SMA disabled" as the reason unless
+                * a reason that compares lower than it (other than "none")
+                * is already recorded.
+                */
+               if (link_state == HLS_DN_DISABLE &&
+                   (ppd->offline_disabled_reason >
+                    HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED) ||
+                    ppd->offline_disabled_reason ==
+                    HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE)))
+                       ppd->offline_disabled_reason =
+                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED);
+               /*
+                * Don't send a reply if the response would be sent
+                * through the disabled port.
+                */
+               if (link_state == HLS_DN_DISABLE && smp->hop_cnt)
+                       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+               break;
+       case IB_PORT_ARMED:
+               ret = set_link_state(ppd, HLS_UP_ARMED);
+               /* idle SMA only on success and only if not suppressed */
+               if ((ret == 0) && (suppress_idle_sma == 0))
+                       send_idle_sma(dd, SMA_IDLE_ARM);
+               break;
+       case IB_PORT_ACTIVE:
+               /* Active is only reachable once neighbor_normal is set */
+               if (ppd->neighbor_normal) {
+                       ret = set_link_state(ppd, HLS_UP_ACTIVE);
+                       if (ret == 0)
+                               send_idle_sma(dd, SMA_IDLE_ACTIVE);
+               } else {
+                       pr_warn("SubnSet(OPA_PortInfo) Cannot move to Active with NeighborNormal 0\n");
+                       smp->status |= IB_SMP_INVALID_FIELD;
+               }
+               break;
+       default:
+               pr_warn("SubnSet(OPA_PortInfo) invalid logical state 0x%x\n",
+                       logical_state);
+               smp->status |= IB_SMP_INVALID_FIELD;
+       }
+
+       return 0;
+}
+
+/**
+ * __subn_set_opa_portinfo - set port information
+ * @smp: the incoming SM packet
+ * @am: attribute modifier; OPA_AM_NPORT() must be 1 and
+ *      OPA_AM_START_SM_CFG() flags the start of SM configuration
+ * @data: attribute payload, a struct opa_port_info, overwritten with the
+ *        resulting port information for the reply
+ * @ibdev: the infiniband device
+ * @port: the port on the device
+ * @resp_len: passed through to __subn_get_opa_portinfo()
+ *
+ * Fields are applied one by one; an unacceptable field sets
+ * IB_SMP_INVALID_FIELD in smp->status and, in most cases, processing
+ * continues with the remaining fields.  The port state change is done
+ * last, after the other link parameters have been set.
+ *
+ * Return: the result of reply() or __subn_get_opa_portinfo(), or the
+ * non-zero value returned by set_port_states().
+ */
+static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct opa_port_info *pi = (struct opa_port_info *)data;
+       struct ib_event event;
+       struct hfi1_devdata *dd;
+       struct hfi1_pportdata *ppd;
+       struct hfi1_ibport *ibp;
+       u8 clientrereg;
+       unsigned long flags;
+       u32 smlid, opa_lid; /* tmp vars to hold LID values */
+       u16 lid;
+       u8 ls_old, ls_new, ps_new;
+       u8 vls;
+       u8 msl;
+       u8 crc_enabled;
+       u16 lse, lwe, mtu;
+       u32 num_ports = OPA_AM_NPORT(am);
+       u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
+       int ret, i, invalid = 0, call_set_mtu = 0;
+       int call_link_downgrade_policy = 0;
+
+       if (num_ports != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       /* only LIDs that fit in 16 bits are accepted */
+       opa_lid = be32_to_cpu(pi->lid);
+       if (opa_lid & 0xFFFF0000) {
+               pr_warn("OPA_PortInfo lid out of range: %X\n", opa_lid);
+               smp->status |= IB_SMP_INVALID_FIELD;
+               goto get_only;
+       }
+
+       lid = (u16)(opa_lid & 0x0000FFFF);
+
+       smlid = be32_to_cpu(pi->sm_lid);
+       if (smlid & 0xFFFF0000) {
+               pr_warn("OPA_PortInfo SM lid out of range: %X\n", smlid);
+               smp->status |= IB_SMP_INVALID_FIELD;
+               goto get_only;
+       }
+       smlid &= 0x0000FFFF;
+
+       /* remembered now; the payload is rewritten by the Get below */
+       clientrereg = (pi->clientrereg_subnettimeout &
+                       OPA_PI_MASK_CLIENT_REREGISTER);
+
+       dd = dd_from_ibdev(ibdev);
+       /* IB numbers ports from 1, hw from 0 */
+       ppd = dd->pport + (port - 1);
+       ibp = &ppd->ibport_data;
+       event.device = ibdev;
+       event.element.port_num = port;
+
+       ls_old = driver_lstate(ppd);
+
+       ibp->rvp.mkey = pi->mkey;
+       ibp->rvp.gid_prefix = pi->subnet_prefix;
+       ibp->rvp.mkey_lease_period = be16_to_cpu(pi->mkey_lease_period);
+
+       /* Must be a valid unicast LID address. */
+       if ((lid == 0 && ls_old > IB_PORT_INIT) ||
+           lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               pr_warn("SubnSet(OPA_PortInfo) lid invalid 0x%x\n",
+                       lid);
+       } else if (ppd->lid != lid ||
+                ppd->lmc != (pi->mkeyprotect_lmc & OPA_PI_MASK_LMC)) {
+               if (ppd->lid != lid)
+                       hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LID_CHANGE_BIT);
+               if (ppd->lmc != (pi->mkeyprotect_lmc & OPA_PI_MASK_LMC))
+                       hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LMC_CHANGE_BIT);
+               hfi1_set_lid(ppd, lid, pi->mkeyprotect_lmc & OPA_PI_MASK_LMC);
+               event.event = IB_EVENT_LID_CHANGE;
+               ib_dispatch_event(&event);
+       }
+
+       msl = pi->smsl & OPA_PI_MASK_SMSL;
+       /* a zero link-init reason leaves the stored one unchanged */
+       if (pi->partenforce_filterraw & OPA_PI_MASK_LINKINIT_REASON)
+               ppd->linkinit_reason =
+                       (pi->partenforce_filterraw &
+                        OPA_PI_MASK_LINKINIT_REASON);
+       /* enable/disable SW pkey checking as per FM control */
+       if (pi->partenforce_filterraw & OPA_PI_MASK_PARTITION_ENFORCE_IN)
+               ppd->part_enforce |= HFI1_PART_ENFORCE_IN;
+       else
+               ppd->part_enforce &= ~HFI1_PART_ENFORCE_IN;
+
+       if (pi->partenforce_filterraw & OPA_PI_MASK_PARTITION_ENFORCE_OUT)
+               ppd->part_enforce |= HFI1_PART_ENFORCE_OUT;
+       else
+               ppd->part_enforce &= ~HFI1_PART_ENFORCE_OUT;
+
+       /* Must be a valid unicast LID address. */
+       if ((smlid == 0 && ls_old > IB_PORT_INIT) ||
+           smlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               pr_warn("SubnSet(OPA_PortInfo) smlid invalid 0x%x\n", smlid);
+       } else if (smlid != ibp->rvp.sm_lid || msl != ibp->rvp.sm_sl) {
+               pr_warn("SubnSet(OPA_PortInfo) smlid 0x%x\n", smlid);
+               /* keep the cached SM address handle in step, under the lock */
+               spin_lock_irqsave(&ibp->rvp.lock, flags);
+               if (ibp->rvp.sm_ah) {
+                       if (smlid != ibp->rvp.sm_lid)
+                               ibp->rvp.sm_ah->attr.dlid = smlid;
+                       if (msl != ibp->rvp.sm_sl)
+                               ibp->rvp.sm_ah->attr.sl = msl;
+               }
+               spin_unlock_irqrestore(&ibp->rvp.lock, flags);
+               if (smlid != ibp->rvp.sm_lid)
+                       ibp->rvp.sm_lid = smlid;
+               if (msl != ibp->rvp.sm_sl)
+                       ibp->rvp.sm_sl = msl;
+               event.event = IB_EVENT_SM_CHANGE;
+               ib_dispatch_event(&event);
+       }
+
+       /* writing 0 clears the recorded link down reasons */
+       if (pi->link_down_reason == 0) {
+               ppd->local_link_down_reason.sma = 0;
+               ppd->local_link_down_reason.latest = 0;
+       }
+
+       if (pi->neigh_link_down_reason == 0) {
+               ppd->neigh_link_down_reason.sma = 0;
+               ppd->neigh_link_down_reason.latest = 0;
+       }
+
+       ppd->sm_trap_qp = be32_to_cpu(pi->sm_trap_qp);
+       ppd->sa_qp = be32_to_cpu(pi->sa_qp);
+
+       ppd->port_error_action = be32_to_cpu(pi->port_error_action);
+       /* link width enabled: 0 means "no change" */
+       lwe = be16_to_cpu(pi->link_width.enabled);
+       if (lwe) {
+               if (lwe == OPA_LINK_WIDTH_RESET ||
+                   lwe == OPA_LINK_WIDTH_RESET_OLD)
+                       set_link_width_enabled(ppd, ppd->link_width_supported);
+               else if ((lwe & ~ppd->link_width_supported) == 0)
+                       set_link_width_enabled(ppd, lwe);
+               else
+                       smp->status |= IB_SMP_INVALID_FIELD;
+       }
+       lwe = be16_to_cpu(pi->link_width_downgrade.enabled);
+       /* LWD.E is always applied - 0 means "disabled" */
+       if (lwe == OPA_LINK_WIDTH_RESET ||
+           lwe == OPA_LINK_WIDTH_RESET_OLD) {
+               set_link_width_downgrade_enabled(ppd,
+                                                ppd->
+                                                link_width_downgrade_supported
+                                                );
+       } else if ((lwe & ~ppd->link_width_downgrade_supported) == 0) {
+               /* only set and apply if something changed */
+               if (lwe != ppd->link_width_downgrade_enabled) {
+                       set_link_width_downgrade_enabled(ppd, lwe);
+                       call_link_downgrade_policy = 1;
+               }
+       } else {
+               smp->status |= IB_SMP_INVALID_FIELD;
+       }
+       /*
+        * NOTE(review): the enabled speeds are checked against the
+        * "supported" field of the incoming packet, not against a value
+        * held by the driver - confirm this is intended.
+        */
+       lse = be16_to_cpu(pi->link_speed.enabled);
+       if (lse) {
+               if (lse & be16_to_cpu(pi->link_speed.supported))
+                       set_link_speed_enabled(ppd, lse);
+               else
+                       smp->status |= IB_SMP_INVALID_FIELD;
+       }
+
+       ibp->rvp.mkeyprot =
+               (pi->mkeyprotect_lmc & OPA_PI_MASK_MKEY_PROT_BIT) >> 6;
+       ibp->rvp.vl_high_limit = be16_to_cpu(pi->vl.high_limit) & 0xFF;
+       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_VL_HIGH_LIMIT,
+                                   ibp->rvp.vl_high_limit);
+
+       /* make sure the per-VL loop below stays inside both arrays */
+       if (ppd->vls_supported / 2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) ||
+           ppd->vls_supported > ARRAY_SIZE(dd->vld)) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+       for (i = 0; i < ppd->vls_supported; i++) {
+               u8 mtu_enum = pi->neigh_mtu.pvlx_to_mtu[i / 2];
+
+               /* even VLs use the high nibble, odd VLs the low nibble */
+               if ((i % 2) == 0)
+                       mtu_enum >>= 4;
+               mtu_enum &= 0xF;
+               mtu = enum_to_mtu(mtu_enum);
+               if (mtu == 0xffff) {
+                       /* report the enum rejected for this VL, not VL 0's */
+                       pr_warn("SubnSet(OPA_PortInfo) mtu invalid %d (0x%x)\n",
+                               mtu, mtu_enum);
+                       smp->status |= IB_SMP_INVALID_FIELD;
+                       mtu = hfi1_max_mtu; /* use a valid MTU */
+               }
+               if (dd->vld[i].mtu != mtu) {
+                       dd_dev_info(dd,
+                                   "MTU change on vl %d from %d to %d\n",
+                                   i, dd->vld[i].mtu, mtu);
+                       dd->vld[i].mtu = mtu;
+                       call_set_mtu++;
+               }
+       }
+       /* As per OPAV1 spec: VL15 must support and be configured
+        * for operation with a 2048 or larger MTU.
+        */
+       mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[15 / 2] & 0xF);
+       if (mtu < 2048 || mtu == 0xffff)
+               mtu = 2048;
+       if (dd->vld[15].mtu != mtu) {
+               dd_dev_info(dd,
+                           "MTU change on vl 15 from %d to %d\n",
+                           dd->vld[15].mtu, mtu);
+               dd->vld[15].mtu = mtu;
+               call_set_mtu++;
+       }
+       if (call_set_mtu)
+               set_mtu(ppd);
+
+       /* Set operational VLs */
+       vls = pi->operational_vls & OPA_PI_MASK_OPERATIONAL_VL;
+       if (vls) {
+               if (vls > ppd->vls_supported) {
+                       pr_warn("SubnSet(OPA_PortInfo) VL's supported invalid %d\n",
+                               pi->operational_vls);
+                       smp->status |= IB_SMP_INVALID_FIELD;
+               } else {
+                       if (hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS,
+                                           vls) == -EINVAL)
+                               smp->status |= IB_SMP_INVALID_FIELD;
+               }
+       }
+
+       /* writing 0 resets a violation counter; other values are ignored */
+       if (pi->mkey_violations == 0)
+               ibp->rvp.mkey_violations = 0;
+
+       if (pi->pkey_violations == 0)
+               ibp->rvp.pkey_violations = 0;
+
+       if (pi->qkey_violations == 0)
+               ibp->rvp.qkey_violations = 0;
+
+       ibp->rvp.subnet_timeout =
+               pi->clientrereg_subnettimeout & OPA_PI_MASK_SUBNET_TIMEOUT;
+
+       /* bits 7:4 of port_ltp_crc_mode; 0 leaves the setting unchanged */
+       crc_enabled = be16_to_cpu(pi->port_ltp_crc_mode);
+       crc_enabled >>= 4;
+       crc_enabled &= 0xf;
+
+       if (crc_enabled != 0)
+               ppd->port_crc_mode_enabled = port_ltp_to_cap(crc_enabled);
+
+       ppd->is_active_optimize_enabled =
+                       !!(be16_to_cpu(pi->port_mode)
+                                       & OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE);
+
+       ls_new = pi->port_states.portphysstate_portstate &
+                       OPA_PI_MASK_PORT_STATE;
+       ps_new = (pi->port_states.portphysstate_portstate &
+                       OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4;
+
+       /*
+        * From Init: note the start of SM configuration, and flag a move to
+        * Armed that was not preceded by one ("invalid" is passed to
+        * set_port_states() as its suppress_idle_sma argument).
+        */
+       if (ls_old == IB_PORT_INIT) {
+               if (start_of_sm_config) {
+                       if (ls_new == ls_old || (ls_new == IB_PORT_ARMED))
+                               ppd->is_sm_config_started = 1;
+               } else if (ls_new == IB_PORT_ARMED) {
+                       if (ppd->is_sm_config_started == 0)
+                               invalid = 1;
+               }
+       }
+
+       /* Handle CLIENT_REREGISTER event b/c SM asked us for it */
+       if (clientrereg) {
+               event.event = IB_EVENT_CLIENT_REREGISTER;
+               ib_dispatch_event(&event);
+       }
+
+       /*
+        * Do the port state change now that the other link parameters
+        * have been set.
+        * Changing the port physical state only makes sense if the link
+        * is down or is being set to down.
+        */
+
+       ret = set_port_states(ppd, smp, ls_new, ps_new, invalid);
+       if (ret)
+               return ret;
+
+       ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len);
+
+       /* restore re-reg bit per o14-12.2.1 */
+       pi->clientrereg_subnettimeout |= clientrereg;
+
+       /*
+        * Apply the new link downgrade policy.  This may result in a link
+        * bounce.  Do this after everything else so things are settled.
+        * Possible problem: if setting the port state above fails, then
+        * the policy change is not applied.
+        */
+       if (call_link_downgrade_policy)
+               apply_link_downgrade_policy(ppd, 0);
+
+       return ret;
+
+get_only:
+       return __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len);
+}
+
+/**
+ * set_pkeys - set the PKEY table for ctxt 0
+ * @dd: the hfi1 device data
+ * @port: the IB port number (1-based)
+ * @pkeys: the new PKEY table, in CPU byte order; ARRAY_SIZE(ppd->pkeys)
+ *         entries are read
+ *
+ * A table that does not contain LIM_MGMT_P_KEY is rejected untouched.
+ * Otherwise each differing slot is updated and, if anything changed, the
+ * new table is pushed with hfi1_set_ib_cfg() and an IB_EVENT_PKEY_CHANGE
+ * event is dispatched.
+ *
+ * Return: 0 if the table was accepted, 1 if it was rejected.
+ */
+static int set_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys)
+{
+       /*
+        * IB port one/two always maps to context zero/one,
+        * always a kernel context, no locking needed
+        * If we get here with ppd setup, no need to check
+        * that rcd is valid.
+        */
+       struct hfi1_pportdata *ppd = &dd->pport[port - 1];
+       struct ib_event event;
+       int slot;
+       int changed = 0;
+
+       /* If the update does not include the management pkey, don't do it. */
+       for (slot = 0; slot < ARRAY_SIZE(ppd->pkeys); slot++)
+               if (pkeys[slot] == LIM_MGMT_P_KEY)
+                       break;
+       if (slot == ARRAY_SIZE(ppd->pkeys))
+               return 1;
+
+       /*
+        * The SM gives us the complete PKey table. We have
+        * to ensure that we put the PKeys in the matching
+        * slots.
+        */
+       for (slot = 0; slot < ARRAY_SIZE(ppd->pkeys); slot++) {
+               if (pkeys[slot] == ppd->pkeys[slot])
+                       continue;
+               /*
+                * Don't update pkeys[2], if an HFI port without MgmtAllowed
+                * by neighbor is a switch.
+                */
+               if (slot == 2 && !ppd->mgmt_allowed &&
+                   ppd->neighbor_type == 1)
+                       continue;
+               ppd->pkeys[slot] = pkeys[slot];
+               changed = 1;
+       }
+
+       if (!changed)
+               return 0;
+
+       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0);
+
+       event.event = IB_EVENT_PKEY_CHANGE;
+       event.device = &dd->verbs_dev.rdi.ibdev;
+       event.element.port_num = port;
+       ib_dispatch_event(&event);
+
+       return 0;
+}
+
+/*
+ * __subn_set_opa_pkeytable - handle a Set of the OPA PKEY table
+ * @smp: the SMP being processed; IB_SMP_INVALID_FIELD is OR'ed into its
+ *       status when the request is rejected
+ * @am: attribute modifier; OPA_AM_NBLK() gives the number of blocks sent
+ *      and the low 11 bits give the starting block
+ * @data: attribute payload holding the big-endian table; converted in
+ *        place to CPU order
+ * @ibdev: the device the request arrived on
+ * @port: the IB port number
+ * @resp_len: passed through to __subn_get_opa_pkeytable()
+ *
+ * The table is only installed (via set_pkeys()) when the request starts
+ * at block 0.  On success the reply is built by
+ * __subn_get_opa_pkeytable(); otherwise the result of reply() is
+ * returned with the invalid-field status set.
+ */
+static int __subn_set_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data,
+                                   struct ib_device *ibdev, u8 port,
+                                   u32 *resp_len)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       u32 n_blocks_sent = OPA_AM_NBLK(am);
+       u32 start_block = am & 0x7ff;
+       u16 *p = (u16 *)data;
+       __be16 *q = (__be16 *)data;
+       int i;
+       u16 n_blocks_avail;
+       unsigned npkeys = hfi1_get_npkeys(dd);
+
+       if (n_blocks_sent == 0) {
+               /* this is the Set path: say so in the log */
+               pr_warn("OPA Set PKey AM Invalid : P = %d; B = 0x%x; N = 0x%x\n",
+                       port, start_block, n_blocks_sent);
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       n_blocks_avail = (u16)(npkeys / OPA_PARTITION_TABLE_BLK_SIZE) + 1;
+
+       if (start_block + n_blocks_sent > n_blocks_avail ||
+           n_blocks_sent > OPA_NUM_PKEY_BLOCKS_PER_SMP) {
+               pr_warn("OPA Set PKey AM Invalid : s 0x%x; req 0x%x; avail 0x%x; blk/smp 0x%lx\n",
+                       start_block, n_blocks_sent, n_blocks_avail,
+                       OPA_NUM_PKEY_BLOCKS_PER_SMP);
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       /* convert every block that was sent from wire order to CPU order */
+       for (i = 0; i < n_blocks_sent * OPA_PARTITION_TABLE_BLK_SIZE; i++)
+               p[i] = be16_to_cpu(q[i]);
+
+       /* set_pkeys() returns non-zero when it rejects the table */
+       if (start_block == 0 && set_pkeys(dd, port, p) != 0) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       return __subn_get_opa_pkeytable(smp, am, data, ibdev, port, resp_len);
+}
+
+/*
+ * Read the four SEND_SC2VLT CSRs, in order, into the caller's buffer,
+ * which is treated as an array of four u64 values.  Always returns 0.
+ */
+static int get_sc2vlt_tables(struct hfi1_devdata *dd, void *data)
+{
+       u64 *regs = data;
+
+       regs[0] = read_csr(dd, SEND_SC2VLT0);
+       regs[1] = read_csr(dd, SEND_SC2VLT1);
+       regs[2] = read_csr(dd, SEND_SC2VLT2);
+       regs[3] = read_csr(dd, SEND_SC2VLT3);
+
+       return 0;
+}
+
+#define ILLEGAL_VL 12
+/*
+ * filter_sc2vlt rewrites, in place, every table entry that maps to VL15
+ * so that it maps to ILLEGAL_VL instead.  The entry for SC15 is left
+ * alone, since SC15 must map to VL15.  Without this remapping the VL15
+ * counters could increment when we try to send on a SC which is mapped
+ * to an invalid VL.
+ */
+static void filter_sc2vlt(void *data)
+{
+       u8 *table = data;
+       int sc;
+
+       for (sc = 0; sc < OPA_MAX_SCS; sc++) {
+               /* only the low five bits of an entry are compared against 15 */
+               if (sc != 15 && (table[sc] & 0x1f) == 0xf)
+                       table[sc] = ILLEGAL_VL;
+       }
+}
+
+/*
+ * Program a new SC-to-VLt mapping.
+ *
+ * The caller's table is first passed through filter_sc2vlt(), which edits
+ * it in place, then written to the four SEND_SC2VLT CSRs as four u64
+ * values, and finally copied into dd->sc2vl while holding sc2vl_lock for
+ * writing.  Always returns 0.
+ */
+static int set_sc2vlt_tables(struct hfi1_devdata *dd, void *data)
+{
+       u64 *regs = data;
+
+       filter_sc2vlt(data);
+
+       write_csr(dd, SEND_SC2VLT0, regs[0]);
+       write_csr(dd, SEND_SC2VLT1, regs[1]);
+       write_csr(dd, SEND_SC2VLT2, regs[2]);
+       write_csr(dd, SEND_SC2VLT3, regs[3]);
+
+       /* refresh the software copy of the table under the sequence lock */
+       write_seqlock_irq(&dd->sc2vl_lock);
+       memcpy(dd->sc2vl, data, sizeof(dd->sc2vl));
+       write_sequnlock_irq(&dd->sc2vl_lock);
+
+       return 0;
+}
+
+/*
+ * Handle SubnGet(SLtoSC): copy the port's sl_to_sc table into the
+ * response payload, one byte per entry, and add the number of entries to
+ * *resp_len when resp_len is supplied.  A non-zero attribute modifier is
+ * rejected with IB_SMP_INVALID_FIELD.
+ */
+static int __subn_get_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       size_t entries = ARRAY_SIZE(ibp->sl_to_sc); /* == 32 */
+       unsigned idx;
+
+       if (am) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       for (idx = 0; idx < ARRAY_SIZE(ibp->sl_to_sc); idx++)
+               data[idx] = ibp->sl_to_sc[idx];
+
+       if (resp_len)
+               *resp_len += entries;
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+/*
+ * Handle SubnSet(SLtoSC): install the SL-to-SC table from the request
+ * payload.  For every SL whose mapping actually changes,
+ * hfi1_error_port_qps() is called for that SL.  The response is built by
+ * the matching Get handler.  A non-zero attribute modifier is rejected
+ * with IB_SMP_INVALID_FIELD.
+ */
+static int __subn_set_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       int sl;
+
+       if (am) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       for (sl = 0; sl < ARRAY_SIZE(ibp->sl_to_sc); sl++) {
+               u8 new_sc = data[sl];
+
+               if (ibp->sl_to_sc[sl] == new_sc)
+                       continue;
+
+               ibp->sl_to_sc[sl] = new_sc;
+
+               /* Put all stale qps into error state */
+               hfi1_error_port_qps(ibp, sl);
+       }
+
+       return __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port, resp_len);
+}
+
+/*
+ * Handle SubnGet(SCtoSL): copy the port's sc_to_sl table into the
+ * response payload, one byte per entry, and add the number of entries to
+ * *resp_len when resp_len is supplied.  A non-zero attribute modifier is
+ * rejected with IB_SMP_INVALID_FIELD.
+ */
+static int __subn_get_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       size_t entries = ARRAY_SIZE(ibp->sc_to_sl); /* == 32 */
+       unsigned idx;
+
+       if (am) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       for (idx = 0; idx < ARRAY_SIZE(ibp->sc_to_sl); idx++)
+               data[idx] = ibp->sc_to_sl[idx];
+
+       if (resp_len)
+               *resp_len += entries;
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+/*
+ * Handle SubnSet(SCtoSL): overwrite the port's sc_to_sl table with the
+ * request payload, one byte per entry, then let the matching Get handler
+ * build the response.  A non-zero attribute modifier is rejected with
+ * IB_SMP_INVALID_FIELD.
+ */
+static int __subn_set_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       int sc;
+
+       if (am) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       for (sc = 0; sc < ARRAY_SIZE(ibp->sc_to_sl); sc++)
+               ibp->sc_to_sl[sc] = data[sc];
+
+       return __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port, resp_len);
+}
+
+/*
+ * Handle SubnGet(SCtoVLt): fill the response payload from the device's
+ * SEND_SC2VLT CSRs via get_sc2vlt_tables() and add the size of four u64
+ * values to *resp_len when resp_len is supplied.  The block count in the
+ * attribute modifier must be exactly 1, otherwise IB_SMP_INVALID_FIELD is
+ * returned.
+ */
+static int __subn_get_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data,
+                                   struct ib_device *ibdev, u8 port,
+                                   u32 *resp_len)
+{
+       u32 blocks = OPA_AM_NBLK(am);
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       size_t table_bytes = 4 * sizeof(u64);
+
+       if (blocks != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       get_sc2vlt_tables(dd, (void *)data);
+
+       if (resp_len)
+               *resp_len += table_bytes;
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+/*
+ * Handle SubnSet(SCtoVLt).
+ *
+ * The request is rejected with IB_SMP_INVALID_FIELD unless exactly one
+ * block is sent and the async-update bit of the attribute modifier is
+ * clear, and also when the port's logical state is ARMED or ACTIVE.
+ * Otherwise the payload is programmed through set_sc2vlt_tables() and the
+ * response is built by the matching Get handler.
+ */
+static int __subn_set_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data,
+                                   struct ib_device *ibdev, u8 port,
+                                   u32 *resp_len)
+{
+       u32 blocks = OPA_AM_NBLK(am);
+       int async_update = OPA_AM_ASYNC(am);
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct hfi1_pportdata *ppd;
+       int lstate;
+
+       if (async_update || blocks != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       /* IB numbers ports from 1, hw from 0 */
+       ppd = &dd->pport[port - 1];
+       lstate = driver_lstate(ppd);
+
+       /*
+        * async_update is necessarily 0 once we get here; the test is
+        * spelled out anyway to make the rule explicit
+        */
+       if (!async_update) {
+               if (lstate == IB_PORT_ARMED || lstate == IB_PORT_ACTIVE) {
+                       smp->status |= IB_SMP_INVALID_FIELD;
+                       return reply((struct ib_mad_hdr *)smp);
+               }
+       }
+
+       set_sc2vlt_tables(dd, (void *)data);
+
+       return __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port, resp_len);
+}
+
+/*
+ * Handle SubnGet(SCtoVLnt): fetch the FM_TBL_SC2VLNT table of the port
+ * into the response payload with fm_get_table() and add its return value
+ * to *resp_len when resp_len is supplied.  The port count in the
+ * attribute modifier must be exactly 1, otherwise IB_SMP_INVALID_FIELD is
+ * returned.
+ */
+static int __subn_get_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data,
+                                    struct ib_device *ibdev, u8 port,
+                                    u32 *resp_len)
+{
+       u32 port_count = OPA_AM_NPORT(am);
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct hfi1_pportdata *ppd;
+       int bytes;
+
+       if (port_count != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       ppd = &dd->pport[port - 1];
+       bytes = fm_get_table(ppd, FM_TBL_SC2VLNT, (void *)data);
+
+       if (resp_len)
+               *resp_len += bytes;
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+/*
+ * Handle SubnSet(SCtoVLnt).
+ *
+ * The request is rejected with IB_SMP_INVALID_FIELD unless the port count
+ * in the attribute modifier is exactly 1, and also when the port's
+ * logical state is ARMED or ACTIVE.  Otherwise the payload is stored as
+ * the port's FM_TBL_SC2VLNT table and the response is built by the
+ * matching Get handler.
+ */
+static int __subn_set_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data,
+                                    struct ib_device *ibdev, u8 port,
+                                    u32 *resp_len)
+{
+       u32 n_blocks = OPA_AM_NPORT(am);
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct hfi1_pportdata *ppd;
+       void *vp = (void *)data;
+       int lstate;
+
+       if (n_blocks != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       /* IB numbers ports from 1, hw from 0 */
+       ppd = dd->pport + (port - 1);
+       lstate = driver_lstate(ppd);
+       if (lstate == IB_PORT_ARMED || lstate == IB_PORT_ACTIVE) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       /* ppd already refers to this port; no need to look it up again */
+       fm_set_table(ppd, FM_TBL_SC2VLNT, vp);
+
+       return __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port,
+                                        resp_len);
+}
+
+/*
+ * Handle SubnGet(PortStateInfo) for a single port.
+ *
+ * If the attribute modifier carries the start-of-SM-config flag while the
+ * port's logical state is INIT, ppd->is_sm_config_started is set.  The
+ * response then reports neighbor_normal (bit 4), is_sm_config_started
+ * (bit 5) and the offline/disabled reason in one byte, the physical and
+ * logical port states in another, and the active downgrade link widths.
+ * A port count other than 1 is rejected with IB_SMP_INVALID_FIELD.
+ */
+static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
+                             struct ib_device *ibdev, u8 port,
+                             u32 *resp_len)
+{
+       struct opa_port_state_info *psi = (struct opa_port_state_info *)data;
+       u32 port_count = OPA_AM_NPORT(am);
+       u32 sm_cfg_start = OPA_AM_START_SM_CFG(am);
+       struct hfi1_pportdata *ppd;
+       u32 lstate;
+
+       if (port_count != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       ppd = ppd_from_ibp(to_iport(ibdev, port));
+       lstate = driver_lstate(ppd);
+
+       if (sm_cfg_start && lstate == IB_PORT_INIT)
+               ppd->is_sm_config_started = 1;
+
+#if PI_LED_ENABLE_SUP
+       psi->port_states.ledenable_offlinereason =
+               (ppd->neighbor_normal << 4) |
+               (ppd->is_sm_config_started << 5) |
+               ppd->offline_disabled_reason;
+#else
+       psi->port_states.offline_reason =
+               (ppd->neighbor_normal << 4) |
+               (ppd->is_sm_config_started << 5) |
+               ppd->offline_disabled_reason;
+#endif /* PI_LED_ENABLE_SUP */
+
+       /* physical state in the high nibble, logical state in the low one */
+       psi->port_states.portphysstate_portstate =
+               (hfi1_ibphys_portstate(ppd) << 4) | (lstate & 0xf);
+
+       psi->link_width_downgrade_tx_active =
+               cpu_to_be16(ppd->link_width_downgrade_tx_active);
+       psi->link_width_downgrade_rx_active =
+               cpu_to_be16(ppd->link_width_downgrade_rx_active);
+
+       if (resp_len)
+               *resp_len += sizeof(struct opa_port_state_info);
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+/*
+ * Handle SubnSet(PortStateInfo) for a single port.
+ *
+ * The requested logical and physical states are decoded from the payload.
+ * While the port is in INIT:
+ *   - with the start-of-SM-config flag set, a request that stays in INIT
+ *     or moves to ARMED sets ppd->is_sm_config_started;
+ *   - without the flag, a request to move to ARMED is marked invalid when
+ *     is_sm_config_started is still 0.
+ * The states are then applied through set_port_states(); a non-zero
+ * return from it is returned directly.  Otherwise the response is built
+ * by the matching Get handler, with IB_SMP_INVALID_FIELD set if the
+ * request was marked invalid.  A port count other than 1 is rejected with
+ * IB_SMP_INVALID_FIELD.
+ */
+static int __subn_set_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
+                             struct ib_device *ibdev, u8 port,
+                             u32 *resp_len)
+{
+       struct opa_port_state_info *psi = (struct opa_port_state_info *)data;
+       u32 port_count = OPA_AM_NPORT(am);
+       u32 sm_cfg_start = OPA_AM_START_SM_CFG(am);
+       struct hfi1_pportdata *ppd;
+       u32 cur_lstate;
+       u8 new_lstate, new_pstate;
+       int rc, invalid = 0;
+
+       if (port_count != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       ppd = ppd_from_ibp(to_iport(ibdev, port));
+
+       cur_lstate = driver_lstate(ppd);
+       new_lstate = port_states_to_logical_state(&psi->port_states);
+       new_pstate = port_states_to_phys_state(&psi->port_states);
+
+       if (cur_lstate == IB_PORT_INIT) {
+               int arming = (new_lstate == IB_PORT_ARMED);
+
+               if (sm_cfg_start) {
+                       if (arming || new_lstate == cur_lstate)
+                               ppd->is_sm_config_started = 1;
+               } else if (arming && ppd->is_sm_config_started == 0) {
+                       invalid = 1;
+               }
+       }
+
+       rc = set_port_states(ppd, smp, new_lstate, new_pstate, invalid);
+       if (rc)
+               return rc;
+
+       if (invalid)
+               smp->status |= IB_SMP_INVALID_FIELD;
+
+       return __subn_get_opa_psi(smp, am, data, ibdev, port, resp_len);
+}
+
+/*
+ * Handle SubnGet(CableInfo).
+ *
+ * The attribute modifier supplies a start address and a length (encoded
+ * as length - 1).  The address must be below 4096 and the first and last
+ * byte must lie in the same 128-byte page, otherwise IB_SMP_INVALID_FIELD
+ * is returned.  get_cable_info() failures are mapped as follows:
+ * -ENODEV gives IB_SMP_UNSUP_METH_ATTR, -ERANGE is treated as success,
+ * and any other negative value gives IB_SMP_INVALID_FIELD.  On success
+ * the length is added to *resp_len when resp_len is supplied.
+ */
+static int __subn_get_opa_cable_info(struct opa_smp *smp, u32 am, u8 *data,
+                                    struct ib_device *ibdev, u8 port,
+                                    u32 *resp_len)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       u32 addr = OPA_AM_CI_ADDR(am);
+       u32 len = OPA_AM_CI_LEN(am) + 1;
+       u32 last;
+       int ret;
+
+#define __CI_PAGE_SIZE BIT(7) /* 128 bytes */
+#define __CI_PAGE_MASK ~(__CI_PAGE_SIZE - 1)
+#define __CI_PAGE_NUM(a) ((a) & __CI_PAGE_MASK)
+
+       /*
+        * addr has to be within spec, and the first and last byte of the
+        * request have to sit on the same "page"
+        */
+       last = addr + len - 1;
+       if (addr >= 4096 || __CI_PAGE_NUM(addr) != __CI_PAGE_NUM(last)) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       ret = get_cable_info(dd, port, addr, len, data);
+
+       /* The address range for the CableInfo SMA query is wider than the
+        * memory available on the QSFP cable. We want to return a valid
+        * response, albeit zeroed out, for address ranges beyond available
+        * memory but that are within the CableInfo query spec, which is why
+        * -ERANGE is not treated as a failure here.
+        */
+       if (ret < 0 && ret != -ERANGE) {
+               if (ret == -ENODEV)
+                       smp->status |= IB_SMP_UNSUP_METH_ATTR;
+               else
+                       smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       if (resp_len)
+               *resp_len += len;
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+/*
+ * Handle SubnGet(BufferControlTable): fetch the port's
+ * FM_TBL_BUFFER_CONTROL table into the response payload, trace it, and
+ * add fm_get_table()'s return value to *resp_len when resp_len is
+ * supplied.  A port count other than 1 is rejected with
+ * IB_SMP_INVALID_FIELD.
+ */
+static int __subn_get_opa_bct(struct opa_smp *smp, u32 am, u8 *data,
+                             struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+       u32 port_count = OPA_AM_NPORT(am);
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct buffer_control *bct = (struct buffer_control *)data;
+       struct hfi1_pportdata *ppd;
+       int bytes;
+
+       if (port_count != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       ppd = &dd->pport[port - 1];
+       bytes = fm_get_table(ppd, FM_TBL_BUFFER_CONTROL, bct);
+       trace_bct_get(dd, bct);
+
+       if (resp_len)
+               *resp_len += bytes;
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+/*
+ * Handle SubnSet(BufferControlTable): trace the requested table and store
+ * it as the port's FM_TBL_BUFFER_CONTROL table.  A negative return from
+ * fm_set_table(), or a port count other than 1, is reported as
+ * IB_SMP_INVALID_FIELD; otherwise the response is built by the matching
+ * Get handler.
+ */
+static int __subn_set_opa_bct(struct opa_smp *smp, u32 am, u8 *data,
+                             struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+       u32 port_count = OPA_AM_NPORT(am);
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct buffer_control *bct = (struct buffer_control *)data;
+       struct hfi1_pportdata *ppd;
+
+       if (port_count != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       ppd = &dd->pport[port - 1];
+       trace_bct_set(dd, bct);
+
+       if (fm_set_table(ppd, FM_TBL_BUFFER_CONTROL, bct) < 0) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       return __subn_get_opa_bct(smp, am, data, ibdev, port, resp_len);
+}
+
+/*
+ * Handle SubnGet(VLArbitration).
+ *
+ * Bits 16-23 of the attribute modifier select which section is read: the
+ * low-priority elements, the high-priority elements, the preemption
+ * elements or the preemption matrix.  The selected table is fetched with
+ * fm_get_table() and a positive return value is added to *resp_len when
+ * resp_len is supplied.  An unknown section, or a port count other than
+ * 1, is reported as IB_SMP_INVALID_FIELD.
+ */
+static int __subn_get_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data,
+                                struct ib_device *ibdev, u8 port,
+                                u32 *resp_len)
+{
+       struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
+       u32 port_count = OPA_AM_NPORT(am);
+       u8 section = (am >> 16) & 0xff;
+       int bytes = 0;
+
+       if (port_count != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       if (section == OPA_VLARB_LOW_ELEMENTS) {
+               bytes = fm_get_table(ppd, FM_TBL_VL_LOW_ARB, data);
+       } else if (section == OPA_VLARB_HIGH_ELEMENTS) {
+               bytes = fm_get_table(ppd, FM_TBL_VL_HIGH_ARB, data);
+       } else if (section == OPA_VLARB_PREEMPT_ELEMENTS) {
+               bytes = fm_get_table(ppd, FM_TBL_VL_PREEMPT_ELEMS, data);
+       } else if (section == OPA_VLARB_PREEMPT_MATRIX) {
+               bytes = fm_get_table(ppd, FM_TBL_VL_PREEMPT_MATRIX, data);
+       } else {
+               pr_warn("OPA SubnGet(VL Arb) AM Invalid : 0x%x\n",
+                       be32_to_cpu(smp->attr_mod));
+               smp->status |= IB_SMP_INVALID_FIELD;
+       }
+
+       if (resp_len && bytes > 0)
+               *resp_len += bytes;
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+/*
+ * Handle SubnSet(VLArbitration).
+ *
+ * Bits 16-23 of the attribute modifier select the section.  The low and
+ * high priority element tables are stored with fm_set_table() (its return
+ * value is deliberately ignored).  The preemption elements and the
+ * preemption matrix cannot be changed from their defaults, so a request
+ * for either sets IB_SMP_UNSUP_METH_ATTR.  An unknown section sets
+ * IB_SMP_INVALID_FIELD.  In every one of these cases the response is then
+ * built by the matching Get handler.  A port count other than 1 is
+ * rejected with IB_SMP_INVALID_FIELD straight away.
+ */
+static int __subn_set_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data,
+                                struct ib_device *ibdev, u8 port,
+                                u32 *resp_len)
+{
+       struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
+       u32 port_count = OPA_AM_NPORT(am);
+       u8 section = (am >> 16) & 0xff;
+
+       if (port_count != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       if (section == OPA_VLARB_LOW_ELEMENTS) {
+               (void)fm_set_table(ppd, FM_TBL_VL_LOW_ARB, data);
+       } else if (section == OPA_VLARB_HIGH_ELEMENTS) {
+               (void)fm_set_table(ppd, FM_TBL_VL_HIGH_ARB, data);
+       } else if (section == OPA_VLARB_PREEMPT_ELEMENTS ||
+                  section == OPA_VLARB_PREEMPT_MATRIX) {
+               /*
+                * neither OPA_VLARB_PREEMPT_ELEMENTS nor
+                * OPA_VLARB_PREEMPT_MATRIX can be changed from the default
+                * values
+                */
+               smp->status |= IB_SMP_UNSUP_METH_ATTR;
+       } else {
+               pr_warn("OPA SubnSet(VL Arb) AM Invalid : 0x%x\n",
+                       be32_to_cpu(smp->attr_mod));
+               smp->status |= IB_SMP_INVALID_FIELD;
+       }
+
+       return __subn_get_opa_vl_arb(smp, am, data, ibdev, port, resp_len);
+}
+
+/*
+ * Layout of a performance-management MAD as handled by the pma_* routines
+ * in this file: a common MAD header followed by a 2024-byte data area.
+ * Packed, so the data area starts immediately after the header.
+ */
+struct opa_pma_mad {
+       struct ib_mad_hdr mad_hdr;
+       u8 data[2024];
+} __packed;
+
+/*
+ * ClassPortInfo attribute layout (packed, multi-byte fields big-endian).
+ * pma_get_opa_classportinfo() overlays this on the MAD data area and
+ * fills in base_version, class_version and cap_mask2_resp_time; every
+ * other field is left zero there.
+ */
+struct opa_class_port_info {
+       u8 base_version;
+       u8 class_version;
+       __be16 cap_mask;
+       /* NOTE(review): name suggests CapMask2 packed with RespTimeValue - confirm split */
+       __be32 cap_mask2_resp_time;
+
+       /* redirection information */
+       u8 redirect_gid[16];
+       __be32 redirect_tc_fl;
+       __be32 redirect_lid;
+       __be32 redirect_sl_qp;
+       __be32 redirect_qkey;
+
+       /* trap destination information */
+       u8 trap_gid[16];
+       __be32 trap_tc_fl;
+       __be32 trap_lid;
+       __be32 trap_hl_qp;
+       __be32 trap_qkey;
+
+       __be16 trap_pkey;
+       __be16 redirect_pkey;
+
+       u8 trap_sl_rsvd;
+       u8 reserved[3];
+} __packed;
+
+/*
+ * Request layout for the PortStatus PMA attribute, as read from the MAD
+ * data area by pma_get_opa_portstatus(): the port being queried and a
+ * big-endian mask selecting the VLs for which per-VL counters are wanted.
+ */
+struct opa_port_status_req {
+       __u8 port_num;
+       __u8 reserved[3];
+       __be32 vl_select_mask;
+};
+
+/*
+ * Every VL bit that may appear in a vl_select_mask: bits 0-7 and bit 15.
+ * pma_get_opa_portstatus() rejects a mask with any other bit set.
+ */
+#define VL_MASK_ALL            0x000080ff
+
+/*
+ * Response layout for the PortStatus PMA attribute.  The fixed part holds
+ * the port-wide data and error counters; it is followed by one
+ * struct _vls_pctrs per bit set in vl_select_mask.  All multi-byte
+ * counters are big-endian.
+ */
+struct opa_port_status_rsp {
+       __u8 port_num;
+       __u8 reserved[3];
+       __be32  vl_select_mask;
+
+       /* Data counters */
+       __be64 port_xmit_data;
+       __be64 port_rcv_data;
+       __be64 port_xmit_pkts;
+       __be64 port_rcv_pkts;
+       __be64 port_multicast_xmit_pkts;
+       __be64 port_multicast_rcv_pkts;
+       __be64 port_xmit_wait;
+       __be64 sw_port_congestion;
+       __be64 port_rcv_fecn;
+       __be64 port_rcv_becn;
+       __be64 port_xmit_time_cong;
+       __be64 port_xmit_wasted_bw;
+       __be64 port_xmit_wait_data;
+       __be64 port_rcv_bubble;
+       __be64 port_mark_fecn;
+       /* Error counters */
+       __be64 port_rcv_constraint_errors;
+       __be64 port_rcv_switch_relay_errors;
+       __be64 port_xmit_discards;
+       __be64 port_xmit_constraint_errors;
+       __be64 port_rcv_remote_physical_errors;
+       __be64 local_link_integrity_errors;
+       __be64 port_rcv_errors;
+       __be64 excessive_buffer_overruns;
+       __be64 fm_config_errors;
+       __be32 link_error_recovery;
+       __be32 link_downed;
+       u8 uncorrectable_errors;
+
+       u8 link_quality_indicator; /* 5res, 3bit */
+       u8 res2[6];
+       /* per-VL counter block, repeated once per selected VL */
+       struct _vls_pctrs {
+               /* per-VL Data counters */
+               __be64 port_vl_xmit_data;
+               __be64 port_vl_rcv_data;
+               __be64 port_vl_xmit_pkts;
+               __be64 port_vl_rcv_pkts;
+               __be64 port_vl_xmit_wait;
+               __be64 sw_port_vl_congestion;
+               __be64 port_vl_rcv_fecn;
+               __be64 port_vl_rcv_becn;
+               __be64 port_xmit_time_cong;
+               __be64 port_vl_xmit_wasted_bw;
+               __be64 port_vl_xmit_wait_data;
+               __be64 port_vl_rcv_bubble;
+               __be64 port_vl_mark_fecn;
+               __be64 port_vl_xmit_discards;
+       } vls[0]; /* real array size defined by # bits set in vl_select_mask */
+};
+
+/*
+ * One bit per counter, assigned from bit 31 downwards in the same order
+ * as the counters appear in struct opa_port_status_rsp.
+ * NOTE(review): presumably these are the bits of
+ * opa_clear_port_status.counter_select_mask - confirm against the users.
+ * NOTE(review): (1 << 31) shifts into the sign bit of int, so
+ * CS_PORT_XMIT_DATA is a negative int constant; users should compare it
+ * against a 32-bit mask only.
+ */
+enum counter_selects {
+       CS_PORT_XMIT_DATA                       = (1 << 31),
+       CS_PORT_RCV_DATA                        = (1 << 30),
+       CS_PORT_XMIT_PKTS                       = (1 << 29),
+       CS_PORT_RCV_PKTS                        = (1 << 28),
+       CS_PORT_MCAST_XMIT_PKTS                 = (1 << 27),
+       CS_PORT_MCAST_RCV_PKTS                  = (1 << 26),
+       CS_PORT_XMIT_WAIT                       = (1 << 25),
+       CS_SW_PORT_CONGESTION                   = (1 << 24),
+       CS_PORT_RCV_FECN                        = (1 << 23),
+       CS_PORT_RCV_BECN                        = (1 << 22),
+       CS_PORT_XMIT_TIME_CONG                  = (1 << 21),
+       CS_PORT_XMIT_WASTED_BW                  = (1 << 20),
+       CS_PORT_XMIT_WAIT_DATA                  = (1 << 19),
+       CS_PORT_RCV_BUBBLE                      = (1 << 18),
+       CS_PORT_MARK_FECN                       = (1 << 17),
+       CS_PORT_RCV_CONSTRAINT_ERRORS           = (1 << 16),
+       CS_PORT_RCV_SWITCH_RELAY_ERRORS         = (1 << 15),
+       CS_PORT_XMIT_DISCARDS                   = (1 << 14),
+       CS_PORT_XMIT_CONSTRAINT_ERRORS          = (1 << 13),
+       CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS      = (1 << 12),
+       CS_LOCAL_LINK_INTEGRITY_ERRORS          = (1 << 11),
+       CS_PORT_RCV_ERRORS                      = (1 << 10),
+       CS_EXCESSIVE_BUFFER_OVERRUNS            = (1 << 9),
+       CS_FM_CONFIG_ERRORS                     = (1 << 8),
+       CS_LINK_ERROR_RECOVERY                  = (1 << 7),
+       CS_LINK_DOWNED                          = (1 << 6),
+       CS_UNCORRECTABLE_ERRORS                 = (1 << 5),
+};
+
+/*
+ * Payload selecting ports (a 256-bit big-endian port mask) and counters
+ * (a 32-bit big-endian counter mask).
+ * NOTE(review): presumably the ClearPortStatus request, with the counter
+ * bits taken from enum counter_selects - confirm against the handler.
+ */
+struct opa_clear_port_status {
+       __be64 port_select_mask[4];
+       __be32 counter_select_mask;
+};
+
+/*
+ * Header of one attribute segment: attribute id, a combined
+ * error/request-length field, the attribute modifier, then the
+ * attribute's data.
+ * NOTE(review): presumably one element of an Aggregate MAD - confirm
+ * against the aggregate handlers.
+ */
+struct opa_aggregate {
+       __be16 attr_id;
+       __be16 err_reqlength;   /* 1 bit, 8 res, 7 bit */
+       __be32 attr_mod;
+       u8 data[0];
+};
+
+/*
+ * Two 4-bit fields sharing one word: bits 4-7 (MSK_LLI, shift 4) and
+ * bits 0-3 (MSK_LER, shift 0), each with an associated ADD_* constant.
+ * NOTE(review): presumably LLI = local link integrity and LER = link
+ * error recovery, with the fields taken from a "resolution" value and
+ * ADD_* added to them - confirm against the code that uses these.
+ */
+#define MSK_LLI 0x000000f0
+#define MSK_LLI_SFT 4
+#define MSK_LER 0x0000000f
+#define MSK_LER_SFT 0
+#define ADD_LLI 8
+#define ADD_LER 2
+
+/*
+ * Port data counters message.
+ * Request contains first three fields, response contains those plus the rest.
+ * Each port entry carries the port-wide data counters and an error
+ * summary, followed by one struct _vls_dctrs per bit set in
+ * vl_select_mask.  All multi-byte fields are big-endian.
+ */
+struct opa_port_data_counters_msg {
+       __be64 port_select_mask[4];
+       __be32 vl_select_mask;
+       __be32 resolution;
+
+       /* Response fields follow */
+       struct _port_dctrs {
+               u8 port_number;
+               u8 reserved2[3];
+               __be32 link_quality_indicator; /* 29res, 3bit */
+
+               /* Data counters */
+               __be64 port_xmit_data;
+               __be64 port_rcv_data;
+               __be64 port_xmit_pkts;
+               __be64 port_rcv_pkts;
+               __be64 port_multicast_xmit_pkts;
+               __be64 port_multicast_rcv_pkts;
+               __be64 port_xmit_wait;
+               __be64 sw_port_congestion;
+               __be64 port_rcv_fecn;
+               __be64 port_rcv_becn;
+               __be64 port_xmit_time_cong;
+               __be64 port_xmit_wasted_bw;
+               __be64 port_xmit_wait_data;
+               __be64 port_rcv_bubble;
+               __be64 port_mark_fecn;
+
+               __be64 port_error_counter_summary;
+               /* Sum of error counts/port */
+
+               struct _vls_dctrs {
+                       /* per-VL Data counters */
+                       __be64 port_vl_xmit_data;
+                       __be64 port_vl_rcv_data;
+                       __be64 port_vl_xmit_pkts;
+                       __be64 port_vl_rcv_pkts;
+                       __be64 port_vl_xmit_wait;
+                       __be64 sw_port_vl_congestion;
+                       __be64 port_vl_rcv_fecn;
+                       __be64 port_vl_rcv_becn;
+                       __be64 port_xmit_time_cong;
+                       __be64 port_vl_xmit_wasted_bw;
+                       __be64 port_vl_xmit_wait_data;
+                       __be64 port_vl_rcv_bubble;
+                       __be64 port_vl_mark_fecn;
+               } vls[0];
+               /* array size defined by #bits set in vl_select_mask*/
+       } port[1]; /* array size defined by  #ports in attribute modifier */
+};
+
+/*
+ * Port error counters message.  Each port entry carries the port-wide
+ * error counters, followed by one struct _vls_ectrs per bit set in
+ * vl_select_mask.  All multi-byte fields are big-endian.
+ */
+struct opa_port_error_counters64_msg {
+       /*
+        * Request contains first two fields, response contains the
+        * whole structure
+        */
+       __be64 port_select_mask[4];
+       __be32 vl_select_mask;
+
+       /* Response-only fields follow */
+       __be32 reserved1;
+       struct _port_ectrs {
+               u8 port_number;
+               u8 reserved2[7];
+               __be64 port_rcv_constraint_errors;
+               __be64 port_rcv_switch_relay_errors;
+               __be64 port_xmit_discards;
+               __be64 port_xmit_constraint_errors;
+               __be64 port_rcv_remote_physical_errors;
+               __be64 local_link_integrity_errors;
+               __be64 port_rcv_errors;
+               __be64 excessive_buffer_overruns;
+               __be64 fm_config_errors;
+               __be32 link_error_recovery;
+               __be32 link_downed;
+               u8 uncorrectable_errors;
+               u8 reserved3[7];
+               struct _vls_ectrs {
+                       __be64 port_vl_xmit_discards;
+               } vls[0];
+               /* array size defined by #bits set in vl_select_mask */
+       } port[1]; /* array size defined by #ports in attribute modifier */
+};
+
+/*
+ * Port error info message: a port select mask and an error-info select
+ * mask (see enum error_info_selects), followed by one struct _port_ei per
+ * port.  Each port entry is a sequence of packed per-error-type records.
+ */
+struct opa_port_error_info_msg {
+       __be64 port_select_mask[4];
+       __be32 error_info_select_mask;
+       __be32 reserved1;
+       struct _port_ei {
+               u8 port_number;
+               u8 reserved2[7];
+
+               /* PortRcvErrorInfo */
+               struct {
+                       u8 status_and_code;
+                       /* three views of the same 17 bytes of error detail */
+                       union {
+                               u8 raw[17];
+                               struct {
+                                       /* EI1to12 format */
+                                       u8 packet_flit1[8];
+                                       u8 packet_flit2[8];
+                                       u8 remaining_flit_bits12;
+                               } ei1to12;
+                               struct {
+                                       u8 packet_bytes[8];
+                                       u8 remaining_flit_bits;
+                               } ei13;
+                       } ei;
+                       u8 reserved3[6];
+               } __packed port_rcv_ei;
+
+               /* ExcessiveBufferOverrunInfo */
+               struct {
+                       u8 status_and_sc;
+                       u8 reserved4[7];
+               } __packed excessive_buffer_overrun_ei;
+
+               /* PortXmitConstraintErrorInfo */
+               struct {
+                       u8 status;
+                       u8 reserved5;
+                       __be16 pkey;
+                       __be32 slid;
+               } __packed port_xmit_constraint_ei;
+
+               /* PortRcvConstraintErrorInfo */
+               struct {
+                       u8 status;
+                       u8 reserved6;
+                       __be16 pkey;
+                       __be32 slid;
+               } __packed port_rcv_constraint_ei;
+
+               /* PortRcvSwitchRelayErrorInfo */
+               struct {
+                       u8 status_and_code;
+                       u8 reserved7[3];
+                       __u32 error_info;
+               } __packed port_rcv_switch_relay_ei;
+
+               /* UncorrectableErrorInfo */
+               struct {
+                       u8 status_and_code;
+                       u8 reserved8;
+               } __packed uncorrectable_ei;
+
+               /* FMConfigErrorInfo */
+               struct {
+                       u8 status_and_code;
+                       u8 error_info;
+               } __packed fm_config_ei;
+               __u32 reserved9;
+       } port[1]; /* actual array size defined by #ports in attr modifier */
+};
+
+/*
+ * opa_port_error_info_msg error_info_select_mask bit definitions.
+ * Bits are assigned from bit 31 downwards in the same order as the
+ * records appear inside struct _port_ei.
+ * NOTE(review): (1 << 31) shifts into the sign bit of int, so
+ * ES_PORT_RCV_ERROR_INFO is a negative int constant.
+ */
+enum error_info_selects {
+       ES_PORT_RCV_ERROR_INFO                  = (1 << 31),
+       ES_EXCESSIVE_BUFFER_OVERRUN_INFO        = (1 << 30),
+       ES_PORT_XMIT_CONSTRAINT_ERROR_INFO      = (1 << 29),
+       ES_PORT_RCV_CONSTRAINT_ERROR_INFO       = (1 << 28),
+       ES_PORT_RCV_SWITCH_RELAY_ERROR_INFO     = (1 << 27),
+       ES_UNCORRECTABLE_ERROR_INFO             = (1 << 26),
+       ES_FM_CONFIG_ERROR_INFO                 = (1 << 25)
+};
+
+/*
+ * Build the PMA ClassPortInfo response in place.
+ *
+ * The whole MAD data area is cleared, then base_version, class_version
+ * and the response-time field are filled in.  A non-zero attribute
+ * modifier sets IB_SMP_INVALID_FIELD in the status, but the response is
+ * still populated.  sizeof(struct opa_class_port_info) is added to
+ * *resp_len when resp_len is supplied.
+ */
+static int pma_get_opa_classportinfo(struct opa_pma_mad *pmp,
+                                    struct ib_device *ibdev, u32 *resp_len)
+{
+       struct opa_class_port_info *cpi;
+
+       memset(pmp->data, 0, sizeof(pmp->data));
+       cpi = (struct opa_class_port_info *)pmp->data;
+
+       if (pmp->mad_hdr.attr_mod != 0)
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+
+       cpi->base_version = OPA_MGMT_BASE_VERSION;
+       cpi->class_version = OPA_SMI_CLASS_VERSION;
+
+       /*
+        * Expected response time is 4.096 usec. * 2^18 == 1.073741824 sec.
+        */
+       cpi->cap_mask2_resp_time = cpu_to_be32(18);
+
+       if (resp_len)
+               *resp_len += sizeof(*cpi);
+
+       return reply((struct ib_mad_hdr *)pmp);
+}
+
+/*
+ * Adjust the PortXmitWait value of a PortStatus response on devices for
+ * which is_bx() is false.
+ *
+ * The per-VL C_TX_WAIT_VL counters of every VL in VL_MASK_ALL are summed,
+ * saturating at all-ones if the sum wraps.  If rsp->port_xmit_wait is
+ * larger than that sum it is replaced by the sum.  @vl_select_mask is
+ * accepted for the callers' benefit but is not used.
+ */
+static void a0_portstatus(struct hfi1_pportdata *ppd,
+                         struct opa_port_status_rsp *rsp, u32 vl_select_mask)
+{
+       if (!is_bx(ppd->dd)) {
+               unsigned long vl;
+               u64 sum_vl_xmit_wait = 0;
+               /*
+                * for_each_set_bit() walks an array of unsigned long, so the
+                * mask has to be one: casting the address of a u32 would read
+                * past the object on 64-bit and pick the wrong half on
+                * big-endian.
+                */
+               unsigned long vl_all_mask = VL_MASK_ALL;
+
+               for_each_set_bit(vl, &vl_all_mask,
+                                8 * sizeof(vl_all_mask)) {
+                       u64 tmp = sum_vl_xmit_wait +
+                                 read_port_cntr(ppd, C_TX_WAIT_VL,
+                                                idx_from_vl(vl));
+                       if (tmp < sum_vl_xmit_wait) {
+                               /* we wrapped */
+                               sum_vl_xmit_wait = (u64)~0;
+                               break;
+                       }
+                       sum_vl_xmit_wait = tmp;
+               }
+               if (be64_to_cpu(rsp->port_xmit_wait) > sum_vl_xmit_wait)
+                       rsp->port_xmit_wait = cpu_to_be64(sum_vl_xmit_wait);
+       }
+}
+
+/*
+ * pma_get_opa_portstatus - build the reply to a PortStatus query
+ * @pmp: the MAD; the request is read from pmp->data and the response is
+ *       written back into the same buffer
+ * @ibdev: device the request arrived on
+ * @port: port the request arrived on
+ * @resp_len: if non-NULL, incremented by the size of the response data
+ *
+ * The request is answered with OPA_PM_STATUS_REQUEST_TOO_LARGE when the
+ * response for the selected VLs would not fit in pmp->data, and with
+ * IB_SMP_INVALID_FIELD when the attribute modifier does not ask for
+ * exactly one port, the requested port number is non-zero and differs
+ * from @port, or the VL select mask is out of range.
+ *
+ * Returns the value of reply().
+ */
+static int pma_get_opa_portstatus(struct opa_pma_mad *pmp,
+                                 struct ib_device *ibdev,
+                                 u8 port, u32 *resp_len)
+{
+       struct opa_port_status_req *req =
+               (struct opa_port_status_req *)pmp->data;
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct opa_port_status_rsp *rsp;
+       u32 vl_select_mask = be32_to_cpu(req->vl_select_mask);
+       /*
+        * Bitmap helpers work on unsigned long words; iterate over a
+        * properly sized copy rather than casting the address of a u32.
+        */
+       unsigned long vl_bits = vl_select_mask;
+       unsigned long vl;
+       size_t response_data_size;
+       u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
+       u8 port_num = req->port_num;
+       u8 num_vls = hweight32(vl_select_mask);
+       struct _vls_pctrs *vlinfo;
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       int vfi;
+       u64 tmp, tmp2;
+
+       response_data_size = sizeof(struct opa_port_status_rsp) +
+                               num_vls * sizeof(struct _vls_pctrs);
+       if (response_data_size > sizeof(pmp->data)) {
+               pmp->mad_hdr.status |= OPA_PM_STATUS_REQUEST_TOO_LARGE;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       if (nports != 1 || (port_num && port_num != port) ||
+           num_vls > OPA_MAX_VLS || (vl_select_mask & ~VL_MASK_ALL)) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       /*
+        * The request fields needed below were copied into locals above,
+        * so the shared buffer can now be cleared for the response.
+        */
+       memset(pmp->data, 0, sizeof(pmp->data));
+
+       rsp = (struct opa_port_status_rsp *)pmp->data;
+       if (port_num)
+               rsp->port_num = port_num;
+       else
+               rsp->port_num = port;
+
+       rsp->port_rcv_constraint_errors =
+               cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR,
+                                          CNTR_INVALID_VL));
+
+       hfi1_read_link_quality(dd, &rsp->link_quality_indicator);
+
+       rsp->vl_select_mask = cpu_to_be32(vl_select_mask);
+       rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS,
+                                         CNTR_INVALID_VL));
+       rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS,
+                                        CNTR_INVALID_VL));
+       rsp->port_xmit_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_PKTS,
+                                         CNTR_INVALID_VL));
+       rsp->port_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_PKTS,
+                                        CNTR_INVALID_VL));
+       rsp->port_multicast_xmit_pkts =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS,
+                                         CNTR_INVALID_VL));
+       rsp->port_multicast_rcv_pkts =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS,
+                                         CNTR_INVALID_VL));
+       rsp->port_xmit_wait =
+               cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL));
+       rsp->port_rcv_fecn =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL));
+       rsp->port_rcv_becn =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL));
+       rsp->port_xmit_discards =
+               cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD,
+                                          CNTR_INVALID_VL));
+       rsp->port_xmit_constraint_errors =
+               cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR,
+                                          CNTR_INVALID_VL));
+       rsp->port_rcv_remote_physical_errors =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
+                                         CNTR_INVALID_VL));
+       /* local_link_integrity_errors is the sum of the two replay counts */
+       tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL);
+       tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL);
+       if (tmp2 < tmp) {
+               /* overflow/wrapped */
+               rsp->local_link_integrity_errors = cpu_to_be64(~0);
+       } else {
+               rsp->local_link_integrity_errors = cpu_to_be64(tmp2);
+       }
+       /* link_error_recovery is a 32-bit field; peg it if the sum is wider */
+       tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
+       tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
+                                  CNTR_INVALID_VL);
+       if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) {
+               /* overflow/wrapped */
+               rsp->link_error_recovery = cpu_to_be32(~0);
+       } else {
+               rsp->link_error_recovery = cpu_to_be32(tmp2);
+       }
+       rsp->port_rcv_errors =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL));
+       rsp->excessive_buffer_overruns =
+               cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL));
+       rsp->fm_config_errors =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR,
+                                         CNTR_INVALID_VL));
+       rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN,
+                                                     CNTR_INVALID_VL));
+
+       /* rsp->uncorrectable_errors is 8 bits wide, and it pegs at 0xff */
+       tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
+       rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff;
+
+       vlinfo = &rsp->vls[0];
+       vfi = 0;
+       /* The vl_select_mask has been checked above, and we know
+        * that it contains only entries which represent valid VLs.
+        * So in the for_each_set_bit() loop below, we don't need
+        * any additional checks for vl.
+        */
+       for_each_set_bit(vl, &vl_bits, 8 * sizeof(vl_select_mask)) {
+               memset(vlinfo, 0, sizeof(*vlinfo));
+
+               tmp = read_dev_cntr(dd, C_DC_RX_FLIT_VL, idx_from_vl(vl));
+               rsp->vls[vfi].port_vl_rcv_data = cpu_to_be64(tmp);
+
+               rsp->vls[vfi].port_vl_rcv_pkts =
+                       cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL,
+                                                 idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_xmit_data =
+                       cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL,
+                                                  idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_xmit_pkts =
+                       cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL,
+                                                  idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_xmit_wait =
+                       cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT_VL,
+                                                  idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_rcv_fecn =
+                       cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL,
+                                                 idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_rcv_becn =
+                       cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL,
+                                                 idx_from_vl(vl)));
+
+               vlinfo++;
+               vfi++;
+       }
+
+       a0_portstatus(ppd, rsp, vl_select_mask);
+
+       if (resp_len)
+               *resp_len += response_data_size;
+
+       return reply((struct ib_mad_hdr *)pmp);
+}
+
+/*
+ * get_error_counter_summary - add up the error counters of a port
+ * @ibdev: device to read device-level counters from
+ * @port: port to read port-level counters from
+ * @res_lli: right shift applied to the local link integrity sum
+ * @res_ler: right shift applied to the link error recovery sum
+ *
+ * Returns the plain u64 sum of the counters read below; no saturation
+ * is applied to the total.
+ */
+static u64 get_error_counter_summary(struct ib_device *ibdev, u8 port,
+                                    u8 res_lli, u8 res_ler)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
+       u64 total;
+       u64 part;
+
+       total = read_port_cntr(ppd, C_SW_RCV_CSTR_ERR, CNTR_INVALID_VL);
+       /* port_rcv_switch_relay_errors is 0 for HFIs */
+       total += read_port_cntr(ppd, C_SW_XMIT_DSCD, CNTR_INVALID_VL);
+       total += read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, CNTR_INVALID_VL);
+       total += read_dev_cntr(dd, C_DC_RMT_PHY_ERR, CNTR_INVALID_VL);
+
+       /* local link integrity: RX + TX replays, scaled down by res_lli */
+       part = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL);
+       part += read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL);
+       total += part >> res_lli;
+
+       /* link error recovery: both counts, scaled down by res_ler */
+       part = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
+       part += read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL);
+       total += part >> res_ler;
+
+       total += read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL);
+       total += read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL);
+       total += read_dev_cntr(dd, C_DC_FM_CFG_ERR, CNTR_INVALID_VL);
+       /* ppd->link_downed is a 32-bit value */
+       total += read_port_cntr(ppd, C_SW_LINK_DOWN, CNTR_INVALID_VL);
+
+       /* the uncorrectable error count contributes at most 0xff */
+       part = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
+       if (part > 0xff)
+               part = 0xff;
+       total += part;
+
+       return total;
+}
+
+/*
+ * a0_datacounters - cap the reported port_xmit_wait on non-Bx devices
+ * @ppd: port whose per-VL transmit-wait counters are read
+ * @rsp: data counters entry being built; only port_xmit_wait may change
+ * @vl_select_mask: not used here; every VL in VL_MASK_ALL is summed
+ *
+ * Does nothing when is_bx() is true for the device.  Otherwise the
+ * C_TX_WAIT_VL counters of all VLs in VL_MASK_ALL are added up (the sum
+ * pegs at all-ones if it wraps) and rsp->port_xmit_wait is lowered to
+ * that sum when it is currently larger.
+ */
+static void a0_datacounters(struct hfi1_pportdata *ppd, struct _port_dctrs *rsp,
+                           u32 vl_select_mask)
+{
+       if (!is_bx(ppd->dd)) {
+               unsigned long vl;
+               u64 sum_vl_xmit_wait = 0;
+               /*
+                * The bitmap helpers operate on unsigned long words, so
+                * keep the mask in one instead of casting the address of
+                * a u32 (which reads past the object on 64-bit builds).
+                */
+               unsigned long vl_all_mask = VL_MASK_ALL;
+
+               for_each_set_bit(vl, &vl_all_mask,
+                                8 * sizeof(vl_all_mask)) {
+                       u64 tmp = sum_vl_xmit_wait +
+                                 read_port_cntr(ppd, C_TX_WAIT_VL,
+                                                idx_from_vl(vl));
+                       if (tmp < sum_vl_xmit_wait) {
+                               /* we wrapped */
+                               sum_vl_xmit_wait = (u64)~0;
+                               break;
+                       }
+                       sum_vl_xmit_wait = tmp;
+               }
+               if (be64_to_cpu(rsp->port_xmit_wait) > sum_vl_xmit_wait)
+                       rsp->port_xmit_wait = cpu_to_be64(sum_vl_xmit_wait);
+       }
+}
+
+/*
+ * Fill the six port-wide data counters of @rsp (xmit/rcv data, xmit/rcv
+ * packets, multicast xmit/rcv packets) from the device counters of
+ * @ibdev.  All other fields of @rsp are left untouched.
+ */
+static void pma_get_opa_port_dctrs(struct ib_device *ibdev,
+                                  struct _port_dctrs *rsp)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       u64 cnt;
+
+       cnt = read_dev_cntr(dd, C_DC_XMIT_FLITS, CNTR_INVALID_VL);
+       rsp->port_xmit_data = cpu_to_be64(cnt);
+
+       cnt = read_dev_cntr(dd, C_DC_RCV_FLITS, CNTR_INVALID_VL);
+       rsp->port_rcv_data = cpu_to_be64(cnt);
+
+       cnt = read_dev_cntr(dd, C_DC_XMIT_PKTS, CNTR_INVALID_VL);
+       rsp->port_xmit_pkts = cpu_to_be64(cnt);
+
+       cnt = read_dev_cntr(dd, C_DC_RCV_PKTS, CNTR_INVALID_VL);
+       rsp->port_rcv_pkts = cpu_to_be64(cnt);
+
+       cnt = read_dev_cntr(dd, C_DC_MC_XMIT_PKTS, CNTR_INVALID_VL);
+       rsp->port_multicast_xmit_pkts = cpu_to_be64(cnt);
+
+       cnt = read_dev_cntr(dd, C_DC_MC_RCV_PKTS, CNTR_INVALID_VL);
+       rsp->port_multicast_rcv_pkts = cpu_to_be64(cnt);
+}
+
+/*
+ * pma_get_opa_datacounters - build the reply to a DataPortCounters query
+ * @pmp: the MAD; the request in pmp->data is updated in place
+ * @ibdev: device the request arrived on
+ * @port: port the request arrived on
+ * @resp_len: if non-NULL, incremented by the size of the response data
+ *
+ * The request is answered with IB_SMP_INVALID_FIELD when the attribute
+ * modifier does not ask for exactly one port, the VL select mask has
+ * bits outside VL_MASK_ALL, the response would not fit in pmp->data, or
+ * the lowest bit set in port_select_mask[3] does not match @port.
+ *
+ * Returns the value of reply().
+ */
+static int pma_get_opa_datacounters(struct opa_pma_mad *pmp,
+                                   struct ib_device *ibdev,
+                                   u8 port, u32 *resp_len)
+{
+       struct opa_port_data_counters_msg *req =
+               (struct opa_port_data_counters_msg *)pmp->data;
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       struct _port_dctrs *rsp;
+       struct _vls_dctrs *vlinfo;
+       size_t response_data_size;
+       u32 num_ports;
+       u8 lq, num_vls;
+       u8 res_lli, res_ler;
+       u64 port_mask;
+       unsigned long port_num;
+       unsigned long vl;
+       unsigned long vl_bits;
+       u32 vl_select_mask;
+       int vfi;
+
+       num_ports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
+       num_vls = hweight32(be32_to_cpu(req->vl_select_mask));
+       vl_select_mask = be32_to_cpu(req->vl_select_mask);
+       /*
+        * Bitmap helpers work on unsigned long words; iterate over a
+        * properly sized copy rather than casting the address of a u32.
+        */
+       vl_bits = vl_select_mask;
+       /*
+        * A non-zero resolution field is biased by ADD_LLI/ADD_LER; the
+        * results are used as right-shift counts for the error summary.
+        */
+       res_lli = (u8)(be32_to_cpu(req->resolution) & MSK_LLI) >> MSK_LLI_SFT;
+       res_lli = res_lli ? res_lli + ADD_LLI : 0;
+       res_ler = (u8)(be32_to_cpu(req->resolution) & MSK_LER) >> MSK_LER_SFT;
+       res_ler = res_ler ? res_ler + ADD_LER : 0;
+
+       if (num_ports != 1 || (vl_select_mask & ~VL_MASK_ALL)) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       /* Sanity check */
+       response_data_size = sizeof(struct opa_port_data_counters_msg) +
+                               num_vls * sizeof(struct _vls_dctrs);
+
+       if (response_data_size > sizeof(pmp->data)) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       /*
+        * The bit set in the mask needs to be consistent with the
+        * port the request came in on.
+        */
+       port_mask = be64_to_cpu(req->port_select_mask[3]);
+       /* find_first_bit() takes the bitmap size in bits, not bytes */
+       port_num = find_first_bit((unsigned long *)&port_mask,
+                                 8 * sizeof(port_mask));
+
+       if ((u8)port_num != port) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       rsp = &req->port[0];
+       memset(rsp, 0, sizeof(*rsp));
+
+       rsp->port_number = port;
+       /*
+        * Note that link_quality_indicator is a 32 bit quantity in
+        * 'datacounters' queries (as opposed to 'portinfo' queries,
+        * where it's a byte).
+        */
+       hfi1_read_link_quality(dd, &lq);
+       rsp->link_quality_indicator = cpu_to_be32((u32)lq);
+       pma_get_opa_port_dctrs(ibdev, rsp);
+
+       rsp->port_xmit_wait =
+               cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL));
+       rsp->port_rcv_fecn =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL));
+       rsp->port_rcv_becn =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL));
+       rsp->port_error_counter_summary =
+               cpu_to_be64(get_error_counter_summary(ibdev, port,
+                                                     res_lli, res_ler));
+
+       vlinfo = &rsp->vls[0];
+       vfi = 0;
+       /* The vl_select_mask has been checked above, and we know
+        * that it contains only entries which represent valid VLs.
+        * So in the for_each_set_bit() loop below, we don't need
+        * any additional checks for vl.
+        */
+       for_each_set_bit(vl, &vl_bits, 8 * sizeof(req->vl_select_mask)) {
+               memset(vlinfo, 0, sizeof(*vlinfo));
+
+               rsp->vls[vfi].port_vl_xmit_data =
+                       cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL,
+                                                  idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_rcv_data =
+                       cpu_to_be64(read_dev_cntr(dd, C_DC_RX_FLIT_VL,
+                                                 idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_xmit_pkts =
+                       cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL,
+                                                  idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_rcv_pkts =
+                       cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL,
+                                                 idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_xmit_wait =
+                       cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT_VL,
+                                                  idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_rcv_fecn =
+                       cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL,
+                                                 idx_from_vl(vl)));
+               rsp->vls[vfi].port_vl_rcv_becn =
+                       cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL,
+                                                 idx_from_vl(vl)));
+
+               /* rsp->port_vl_xmit_time_cong is 0 for HFIs */
+               /* rsp->port_vl_xmit_wasted_bw ??? */
+               /* port_vl_xmit_wait_data - TXE (table 13-9 HFI spec) ???
+                * does this differ from rsp->vls[vfi].port_vl_xmit_wait
+                */
+               /*rsp->vls[vfi].port_vl_mark_fecn =
+                *      cpu_to_be64(read_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT
+                *              + offset));
+                */
+               vlinfo++;
+               vfi++;
+       }
+
+       a0_datacounters(ppd, rsp, vl_select_mask);
+
+       if (resp_len)
+               *resp_len += response_data_size;
+
+       return reply((struct ib_mad_hdr *)pmp);
+}
+
+/*
+ * pma_get_ib_portcounters_ext - answer an IB PortCountersExtended query
+ * @pmp: the MAD; pmp->data is updated in place
+ * @ibdev: device the request arrived on
+ * @port: port the request arrived on
+ *
+ * A non-zero attribute modifier or a port_select that differs from
+ * @port sets IB_SMP_INVALID_FIELD and leaves the counters untouched.
+ * Otherwise the data counters are copied from pma_get_opa_port_dctrs();
+ * the unicast counters are reported as zero.
+ *
+ * Returns the value of reply().
+ */
+static int pma_get_ib_portcounters_ext(struct ib_pma_mad *pmp,
+                                      struct ib_device *ibdev, u8 port)
+{
+       struct ib_pma_portcounters_ext *p =
+               (struct ib_pma_portcounters_ext *)pmp->data;
+       struct _port_dctrs rsp;
+
+       if (pmp->mad_hdr.attr_mod == 0 && p->port_select == port) {
+               memset(&rsp, 0, sizeof(rsp));
+               pma_get_opa_port_dctrs(ibdev, &rsp);
+
+               /* values are already big-endian, copy them straight over */
+               p->port_xmit_data = rsp.port_xmit_data;
+               p->port_rcv_data = rsp.port_rcv_data;
+               p->port_xmit_packets = rsp.port_xmit_pkts;
+               p->port_rcv_packets = rsp.port_rcv_pkts;
+               p->port_unicast_xmit_packets = 0;
+               p->port_unicast_rcv_packets = 0;
+               p->port_multicast_xmit_packets = rsp.port_multicast_xmit_pkts;
+               p->port_multicast_rcv_packets = rsp.port_multicast_rcv_pkts;
+       } else {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+       }
+
+       return reply((struct ib_mad_hdr *)pmp);
+}
+
+/*
+ * pma_get_opa_port_ectrs - fill the port-wide error counters of @rsp
+ * @ibdev: device to read device-level counters from
+ * @rsp: error counters entry to fill in
+ * @port: port to read port-level counters from
+ *
+ * link_error_recovery and local_link_integrity_errors are each the sum
+ * of two counters and are pegged at all-ones when the sum overflows the
+ * field.  port_rcv_switch_relay_errors is always reported as zero.
+ */
+static void pma_get_opa_port_ectrs(struct ib_device *ibdev,
+                                  struct _port_ectrs *rsp, u8 port)
+{
+       u64 first, sum;
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
+
+       /* 32-bit field: peg when the sum wraps or exceeds UINT_MAX */
+       first = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
+       sum = first + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
+                                   CNTR_INVALID_VL);
+       if (sum >= first && sum <= (u32)UINT_MAX)
+               rsp->link_error_recovery = cpu_to_be32(sum);
+       else
+               rsp->link_error_recovery = cpu_to_be32(~0);
+
+       sum = read_port_cntr(ppd, C_SW_LINK_DOWN, CNTR_INVALID_VL);
+       rsp->link_downed = cpu_to_be32(sum);
+
+       sum = read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL);
+       rsp->port_rcv_errors = cpu_to_be64(sum);
+
+       sum = read_dev_cntr(dd, C_DC_RMT_PHY_ERR, CNTR_INVALID_VL);
+       rsp->port_rcv_remote_physical_errors = cpu_to_be64(sum);
+
+       rsp->port_rcv_switch_relay_errors = 0;
+
+       sum = read_port_cntr(ppd, C_SW_XMIT_DSCD, CNTR_INVALID_VL);
+       rsp->port_xmit_discards = cpu_to_be64(sum);
+
+       sum = read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, CNTR_INVALID_VL);
+       rsp->port_xmit_constraint_errors = cpu_to_be64(sum);
+
+       sum = read_port_cntr(ppd, C_SW_RCV_CSTR_ERR, CNTR_INVALID_VL);
+       rsp->port_rcv_constraint_errors = cpu_to_be64(sum);
+
+       /* 64-bit field: peg only when the addition itself wraps */
+       first = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL);
+       sum = first + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL);
+       if (sum >= first)
+               rsp->local_link_integrity_errors = cpu_to_be64(sum);
+       else
+               rsp->local_link_integrity_errors = cpu_to_be64(~0);
+
+       sum = read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL);
+       rsp->excessive_buffer_overruns = cpu_to_be64(sum);
+}
+
+/*
+ * pma_get_opa_porterrors - build the reply to an ErrorPortCounters query
+ * @pmp: the MAD; the request in pmp->data is updated in place
+ * @ibdev: device the request arrived on
+ * @port: port the request arrived on
+ * @resp_len: if non-NULL, incremented by the size of the response data
+ *
+ * The request is answered with IB_SMP_INVALID_FIELD when the attribute
+ * modifier and port_select_mask[3] do not both select exactly one port,
+ * the response would not fit in pmp->data, or the selected port is not
+ * @port.  The per-VL entries of the selected VLs are only zeroed.
+ *
+ * Returns the value of reply().
+ */
+static int pma_get_opa_porterrors(struct opa_pma_mad *pmp,
+                                 struct ib_device *ibdev,
+                                 u8 port, u32 *resp_len)
+{
+       size_t response_data_size;
+       struct _port_ectrs *rsp;
+       u8 port_num;
+       struct opa_port_error_counters64_msg *req;
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       u32 num_ports;
+       u8 num_pslm;
+       u8 num_vls;
+       struct hfi1_ibport *ibp;
+       struct hfi1_pportdata *ppd;
+       struct _vls_ectrs *vlinfo;
+       unsigned long vl;
+       unsigned long vl_bits;
+       u64 port_mask, tmp;
+
+       req = (struct opa_port_error_counters64_msg *)pmp->data;
+
+       num_ports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
+
+       num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
+       num_vls = hweight32(be32_to_cpu(req->vl_select_mask));
+
+       if (num_ports != 1 || num_ports != num_pslm) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       response_data_size = sizeof(struct opa_port_error_counters64_msg) +
+                               num_vls * sizeof(struct _vls_ectrs);
+
+       if (response_data_size > sizeof(pmp->data)) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+       /*
+        * The bit set in the mask needs to be consistent with the
+        * port the request came in on.
+        */
+       port_mask = be64_to_cpu(req->port_select_mask[3]);
+       /* find_first_bit() takes the bitmap size in bits, not bytes */
+       port_num = find_first_bit((unsigned long *)&port_mask,
+                                 8 * sizeof(port_mask));
+
+       if (port_num != port) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       rsp = &req->port[0];
+
+       ibp = to_iport(ibdev, port_num);
+       ppd = ppd_from_ibp(ibp);
+
+       memset(rsp, 0, sizeof(*rsp));
+       rsp->port_number = port_num;
+
+       pma_get_opa_port_ectrs(ibdev, rsp, port_num);
+
+       rsp->port_rcv_remote_physical_errors =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
+                                         CNTR_INVALID_VL));
+       rsp->fm_config_errors =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR,
+                                         CNTR_INVALID_VL));
+       tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
+
+       /* uncorrectable_errors pegs at 0xff */
+       rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff;
+
+       vlinfo = &rsp->vls[0];
+       /*
+        * Bitmap helpers work on unsigned long words; iterate over a
+        * properly sized copy rather than casting the address of a u32.
+        */
+       vl_bits = be32_to_cpu(req->vl_select_mask);
+       for_each_set_bit(vl, &vl_bits, 8 * sizeof(req->vl_select_mask)) {
+               memset(vlinfo, 0, sizeof(*vlinfo));
+               /* vlinfo->vls[vfi].port_vl_xmit_discards ??? */
+               vlinfo += 1;
+       }
+
+       if (resp_len)
+               *resp_len += response_data_size;
+
+       return reply((struct ib_mad_hdr *)pmp);
+}
+
+/*
+ * pma_get_ib_portcounters - answer an IB PortCounters query
+ * @pmp: the MAD; pmp->data is updated in place
+ * @ibdev: device the request arrived on
+ * @port: port the request arrived on
+ *
+ * The error counters are gathered with pma_get_opa_port_ectrs() and
+ * each one is clamped to the width of the corresponding IB field.  A
+ * non-zero attribute modifier or a port_select that differs from @port
+ * sets IB_SMP_INVALID_FIELD and leaves the counters untouched.
+ *
+ * Returns the value of reply().
+ */
+static int pma_get_ib_portcounters(struct ib_pma_mad *pmp,
+                                  struct ib_device *ibdev, u8 port)
+{
+       struct ib_pma_portcounters *p =
+               (struct ib_pma_portcounters *)pmp->data;
+       struct _port_ectrs rsp;
+       u64 nibbles;
+       u64 val64;
+       u32 val32;
+
+       memset(&rsp, 0, sizeof(rsp));
+       pma_get_opa_port_ectrs(ibdev, &rsp, port);
+
+       if (pmp->mad_hdr.attr_mod != 0 || p->port_select != port) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               goto bail;
+       }
+
+       p->symbol_error_counter = 0; /* N/A for OPA */
+
+       /* 8-bit fields */
+       val32 = be32_to_cpu(rsp.link_error_recovery);
+       p->link_error_recovery_counter = val32 > 0xFFUL ? 0xFF : (u8)val32;
+
+       val32 = be32_to_cpu(rsp.link_downed);
+       p->link_downed_counter = val32 > 0xFFUL ? 0xFF : (u8)val32;
+
+       /* 16-bit fields */
+       val64 = be64_to_cpu(rsp.port_rcv_errors);
+       if (val64 > 0xFFFFUL)
+               val64 = 0xFFFF;
+       p->port_rcv_errors = cpu_to_be16((u16)val64);
+
+       val64 = be64_to_cpu(rsp.port_rcv_remote_physical_errors);
+       if (val64 > 0xFFFFUL)
+               val64 = 0xFFFF;
+       p->port_rcv_remphys_errors = cpu_to_be16((u16)val64);
+
+       /* not clamped, simply truncated to 16 bits */
+       val64 = be64_to_cpu(rsp.port_rcv_switch_relay_errors);
+       p->port_rcv_switch_relay_errors = cpu_to_be16((u16)val64);
+
+       val64 = be64_to_cpu(rsp.port_xmit_discards);
+       if (val64 > 0xFFFFUL)
+               val64 = 0xFFFF;
+       p->port_xmit_discards = cpu_to_be16((u16)val64);
+
+       /* 8-bit fields */
+       val64 = be64_to_cpu(rsp.port_xmit_constraint_errors);
+       p->port_xmit_constraint_errors = val64 > 0xFFUL ? 0xFF : (u8)val64;
+
+       val64 = be64_to_cpu(rsp.port_rcv_constraint_errors);
+       p->port_rcv_constraint_errors = val64 > 0xFFUL ? 0xFF : (u8)val64;
+
+       /* LocalLink: 7:4, BufferOverrun: 3:0 */
+       val64 = be64_to_cpu(rsp.local_link_integrity_errors);
+       nibbles = (val64 > 0xFUL ? 0xFUL : val64) << 4;
+
+       val64 = be64_to_cpu(rsp.excessive_buffer_overruns);
+       nibbles |= val64 > 0xFUL ? 0xFUL : val64;
+
+       p->link_overrun_errors = (u8)nibbles;
+
+       p->vl15_dropped = 0; /* N/A for OPA */
+
+bail:
+       return reply((struct ib_mad_hdr *)pmp);
+}
+
+/*
+ * pma_get_opa_errorinfo - build the reply to an ErrorInfo query
+ * @pmp: the MAD; the request in pmp->data is updated in place
+ * @ibdev: device the request arrived on
+ * @port: port the request arrived on
+ * @resp_len: if non-NULL, incremented by the size of the response data
+ *
+ * The request is answered with IB_SMP_INVALID_FIELD when the attribute
+ * modifier and port_select_mask[3] do not both select exactly one port,
+ * the response would not fit in pmp->data, or the selected port is not
+ * @port.  The first port entry is zeroed before any of these checks, so
+ * an error reply carries a cleared entry.
+ *
+ * Returns the value of reply().
+ */
+static int pma_get_opa_errorinfo(struct opa_pma_mad *pmp,
+                                struct ib_device *ibdev,
+                                u8 port, u32 *resp_len)
+{
+       size_t response_data_size;
+       struct _port_ei *rsp;
+       struct opa_port_error_info_msg *req;
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       u64 port_mask;
+       u32 num_ports;
+       u8 port_num;
+       u8 num_pslm;
+       u64 reg;
+
+       req = (struct opa_port_error_info_msg *)pmp->data;
+       rsp = &req->port[0];
+
+       num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod));
+       num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
+
+       memset(rsp, 0, sizeof(*rsp));
+
+       if (num_ports != 1 || num_ports != num_pslm) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       /* Sanity check */
+       response_data_size = sizeof(struct opa_port_error_info_msg);
+
+       if (response_data_size > sizeof(pmp->data)) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       /*
+        * The bit set in the mask needs to be consistent with the port
+        * the request came in on.
+        */
+       port_mask = be64_to_cpu(req->port_select_mask[3]);
+       /* find_first_bit() takes the bitmap size in bits, not bytes */
+       port_num = find_first_bit((unsigned long *)&port_mask,
+                                 8 * sizeof(port_mask));
+
+       if (port_num != port) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       /* PortRcvErrorInfo */
+       rsp->port_rcv_ei.status_and_code =
+               dd->err_info_rcvport.status_and_code;
+       memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit1,
+              &dd->err_info_rcvport.packet_flit1, sizeof(u64));
+       memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit2,
+              &dd->err_info_rcvport.packet_flit2, sizeof(u64));
+
+       /* ExcessiveBufferOverrunInfo */
+       reg = read_csr(dd, RCV_ERR_INFO);
+       if (reg & RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK) {
+               /*
+                * if the RcvExcessBufferOverrun bit is set, save SC of
+                * first pkt that encountered an excess buffer overrun
+                */
+               u8 tmp = (u8)reg;
+
+               tmp &=  RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SC_SMASK;
+               tmp <<= 2;
+               rsp->excessive_buffer_overrun_ei.status_and_sc = tmp;
+               /* set the status bit */
+               rsp->excessive_buffer_overrun_ei.status_and_sc |= 0x80;
+       }
+
+       /* PortXmitConstraintErrorInfo, from the saved device state */
+       rsp->port_xmit_constraint_ei.status =
+               dd->err_info_xmit_constraint.status;
+       rsp->port_xmit_constraint_ei.pkey =
+               cpu_to_be16(dd->err_info_xmit_constraint.pkey);
+       rsp->port_xmit_constraint_ei.slid =
+               cpu_to_be32(dd->err_info_xmit_constraint.slid);
+
+       /* PortRcvConstraintErrorInfo, from the saved device state */
+       rsp->port_rcv_constraint_ei.status =
+               dd->err_info_rcv_constraint.status;
+       rsp->port_rcv_constraint_ei.pkey =
+               cpu_to_be16(dd->err_info_rcv_constraint.pkey);
+       rsp->port_rcv_constraint_ei.slid =
+               cpu_to_be32(dd->err_info_rcv_constraint.slid);
+
+       /* UncorrectableErrorInfo */
+       rsp->uncorrectable_ei.status_and_code = dd->err_info_uncorrectable;
+
+       /* FMConfigErrorInfo */
+       rsp->fm_config_ei.status_and_code = dd->err_info_fmconfig;
+
+       if (resp_len)
+               *resp_len += response_data_size;
+
+       return reply((struct ib_mad_hdr *)pmp);
+}
+
+/*
+ * pma_set_opa_portstatus - clear the port counters selected by a PMA Set
+ * @pmp: the PMA MAD; its data area holds a struct opa_clear_port_status
+ * @ibdev: the device the MAD arrived on
+ * @port: the port number the MAD arrived on
+ * @resp_len: if non-NULL, incremented by the size of the request payload
+ *
+ * Writes 0 to every counter selected by the request's counter_select_mask,
+ * including the per-VL instances of the selected counters.  The request is
+ * rejected with IB_SMP_INVALID_FIELD unless the attribute modifier names
+ * exactly one port and the port select mask is exactly the bit for @port.
+ *
+ * Return: the result of reply() on the (possibly status-updated) MAD.
+ */
+static int pma_set_opa_portstatus(struct opa_pma_mad *pmp,
+                                 struct ib_device *ibdev,
+                                 u8 port, u32 *resp_len)
+{
+       struct opa_clear_port_status *req =
+               (struct opa_clear_port_status *)pmp->data;
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
+       u64 portn = be64_to_cpu(req->port_select_mask[3]);
+       u32 counter_select = be32_to_cpu(req->counter_select_mask);
+       /*
+        * Clear all per-vl cnts.  The mask is an unsigned long rather than
+        * a u32 because for_each_set_bit() reads the bitmap one unsigned
+        * long at a time; casting the address of a u32 would make it read
+        * past the end of the object on 64-bit kernels.
+        */
+       unsigned long vl_select_mask = VL_MASK_ALL;
+       unsigned long vl;
+
+       if ((nports != 1) || (portn != 1 << port)) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+       /*
+        * only counters returned by pma_get_opa_portstatus() are
+        * handled, so when pma_get_opa_portstatus() gets a fix,
+        * the corresponding change should be made here as well.
+        */
+
+       if (counter_select & CS_PORT_XMIT_DATA)
+               write_dev_cntr(dd, C_DC_XMIT_FLITS, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_PORT_RCV_DATA)
+               write_dev_cntr(dd, C_DC_RCV_FLITS, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_PORT_XMIT_PKTS)
+               write_dev_cntr(dd, C_DC_XMIT_PKTS, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_PORT_RCV_PKTS)
+               write_dev_cntr(dd, C_DC_RCV_PKTS, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_PORT_MCAST_XMIT_PKTS)
+               write_dev_cntr(dd, C_DC_MC_XMIT_PKTS, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_PORT_MCAST_RCV_PKTS)
+               write_dev_cntr(dd, C_DC_MC_RCV_PKTS, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_PORT_XMIT_WAIT)
+               write_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL, 0);
+
+       /* ignore cs_sw_portCongestion for HFIs */
+
+       if (counter_select & CS_PORT_RCV_FECN)
+               write_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_PORT_RCV_BECN)
+               write_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL, 0);
+
+       /* ignore cs_port_xmit_time_cong for HFIs */
+       /* ignore cs_port_xmit_wasted_bw for now */
+       /* ignore cs_port_xmit_wait_data for now */
+       if (counter_select & CS_PORT_RCV_BUBBLE)
+               write_dev_cntr(dd, C_DC_RCV_BBL, CNTR_INVALID_VL, 0);
+
+       /* Only applicable for switch */
+       /* if (counter_select & CS_PORT_MARK_FECN)
+        *      write_csr(dd, DCC_PRF_PORT_MARK_FECN_CNT, 0);
+        */
+
+       if (counter_select & CS_PORT_RCV_CONSTRAINT_ERRORS)
+               write_port_cntr(ppd, C_SW_RCV_CSTR_ERR, CNTR_INVALID_VL, 0);
+
+       /* ignore cs_port_rcv_switch_relay_errors for HFIs */
+       if (counter_select & CS_PORT_XMIT_DISCARDS)
+               write_port_cntr(ppd, C_SW_XMIT_DSCD, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_PORT_XMIT_CONSTRAINT_ERRORS)
+               write_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS)
+               write_dev_cntr(dd, C_DC_RMT_PHY_ERR, CNTR_INVALID_VL, 0);
+
+       /* this selector covers both the TX and the RX replay counters */
+       if (counter_select & CS_LOCAL_LINK_INTEGRITY_ERRORS) {
+               write_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL, 0);
+               write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0);
+       }
+
+       if (counter_select & CS_LINK_ERROR_RECOVERY) {
+               write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0);
+               write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
+                              CNTR_INVALID_VL, 0);
+       }
+
+       if (counter_select & CS_PORT_RCV_ERRORS)
+               write_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL, 0);
+
+       /* the overflow count is also kept in software, so reset both */
+       if (counter_select & CS_EXCESSIVE_BUFFER_OVERRUNS) {
+               write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0);
+               dd->rcv_ovfl_cnt = 0;
+       }
+
+       if (counter_select & CS_FM_CONFIG_ERRORS)
+               write_dev_cntr(dd, C_DC_FM_CFG_ERR, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_LINK_DOWNED)
+               write_port_cntr(ppd, C_SW_LINK_DOWN, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_UNCORRECTABLE_ERRORS)
+               write_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL, 0);
+
+       /* the VL mask is a 32-bit quantity, so scan only its low 32 bits */
+       for_each_set_bit(vl, &vl_select_mask, 8 * sizeof(u32)) {
+               if (counter_select & CS_PORT_XMIT_DATA)
+                       write_port_cntr(ppd, C_TX_FLIT_VL, idx_from_vl(vl), 0);
+
+               if (counter_select & CS_PORT_RCV_DATA)
+                       write_dev_cntr(dd, C_DC_RX_FLIT_VL, idx_from_vl(vl), 0);
+
+               if (counter_select & CS_PORT_XMIT_PKTS)
+                       write_port_cntr(ppd, C_TX_PKT_VL, idx_from_vl(vl), 0);
+
+               if (counter_select & CS_PORT_RCV_PKTS)
+                       write_dev_cntr(dd, C_DC_RX_PKT_VL, idx_from_vl(vl), 0);
+
+               if (counter_select & CS_PORT_XMIT_WAIT)
+                       write_port_cntr(ppd, C_TX_WAIT_VL, idx_from_vl(vl), 0);
+
+               /* sw_port_vl_congestion is 0 for HFIs */
+               if (counter_select & CS_PORT_RCV_FECN)
+                       write_dev_cntr(dd, C_DC_RCV_FCN_VL, idx_from_vl(vl), 0);
+
+               if (counter_select & CS_PORT_RCV_BECN)
+                       write_dev_cntr(dd, C_DC_RCV_BCN_VL, idx_from_vl(vl), 0);
+
+               /* port_vl_xmit_time_cong is 0 for HFIs */
+               /* port_vl_xmit_wasted_bw ??? */
+               /* port_vl_xmit_wait_data - TXE (table 13-9 HFI spec) ??? */
+               if (counter_select & CS_PORT_RCV_BUBBLE)
+                       write_dev_cntr(dd, C_DC_RCV_BBL_VL, idx_from_vl(vl), 0);
+
+               /* if (counter_select & CS_PORT_MARK_FECN)
+                *     write_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT + offset, 0);
+                */
+               /* port_vl_xmit_discards ??? */
+       }
+
+       if (resp_len)
+               *resp_len += sizeof(*req);
+
+       return reply((struct ib_mad_hdr *)pmp);
+}
+
+/*
+ * pma_set_opa_errorinfo - clear the error-info status bits selected by a
+ * PMA Set
+ * @pmp: the PMA MAD; its data area holds a struct opa_port_error_info_msg
+ * @ibdev: the device the MAD arrived on
+ * @port: the port number the MAD arrived on
+ * @resp_len: if non-NULL, incremented by the size of the request payload
+ *
+ * The first per-port record of the message is zeroed, then the request is
+ * validated: the attribute modifier must name exactly one port, exactly
+ * one bit may be set in the port select mask, and that bit must be the one
+ * for @port.  Failing any of these sets IB_SMP_INVALID_FIELD.  For each
+ * selector set in error_info_select_mask the matching status is cleared.
+ *
+ * Return: the result of reply() on the (possibly status-updated) MAD.
+ */
+static int pma_set_opa_errorinfo(struct opa_pma_mad *pmp,
+                                struct ib_device *ibdev,
+                                u8 port, u32 *resp_len)
+{
+       struct _port_ei *rsp;
+       struct opa_port_error_info_msg *req;
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       u64 port_mask;
+       u32 num_ports;
+       u8 port_num;
+       u8 num_pslm;
+       u32 error_info_select;
+
+       req = (struct opa_port_error_info_msg *)pmp->data;
+       rsp = &req->port[0];
+
+       num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod));
+       num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
+
+       memset(rsp, 0, sizeof(*rsp));
+
+       if (num_ports != 1 || num_ports != num_pslm) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       /*
+        * The bit set in the mask needs to be consistent with the port
+        * the request came in on.
+        *
+        * find_first_bit() takes the bitmap size in bits, not bytes, so
+        * the whole 64-bit mask has to be described as 8 * sizeof().
+        */
+       port_mask = be64_to_cpu(req->port_select_mask[3]);
+       port_num = find_first_bit((unsigned long *)&port_mask,
+                                 8 * sizeof(port_mask));
+
+       if (port_num != port) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       error_info_select = be32_to_cpu(req->error_info_select_mask);
+
+       /* PortRcvErrorInfo */
+       if (error_info_select & ES_PORT_RCV_ERROR_INFO)
+               /* turn off status bit */
+               dd->err_info_rcvport.status_and_code &= ~OPA_EI_STATUS_SMASK;
+
+       /* ExcessiveBufferOverrunInfo */
+       if (error_info_select & ES_EXCESSIVE_BUFFER_OVERRUN_INFO)
+               /*
+                * status bit is essentially kept in the h/w - bit 5 of
+                * RCV_ERR_INFO
+                */
+               write_csr(dd, RCV_ERR_INFO,
+                         RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK);
+
+       /* PortXmitConstraintErrorInfo */
+       if (error_info_select & ES_PORT_XMIT_CONSTRAINT_ERROR_INFO)
+               dd->err_info_xmit_constraint.status &= ~OPA_EI_STATUS_SMASK;
+
+       /* PortRcvConstraintErrorInfo */
+       if (error_info_select & ES_PORT_RCV_CONSTRAINT_ERROR_INFO)
+               dd->err_info_rcv_constraint.status &= ~OPA_EI_STATUS_SMASK;
+
+       /* UncorrectableErrorInfo */
+       if (error_info_select & ES_UNCORRECTABLE_ERROR_INFO)
+               /* turn off status bit */
+               dd->err_info_uncorrectable &= ~OPA_EI_STATUS_SMASK;
+
+       /* FMConfigErrorInfo */
+       if (error_info_select & ES_FM_CONFIG_ERROR_INFO)
+               /* turn off status bit */
+               dd->err_info_fmconfig &= ~OPA_EI_STATUS_SMASK;
+
+       if (resp_len)
+               *resp_len += sizeof(*req);
+
+       return reply((struct ib_mad_hdr *)pmp);
+}
+
+struct opa_congestion_info_attr {      /* filled in by __subn_get_opa_cong_info() */
+       __be16 congestion_info;         /* big-endian; reported as 0 by this file */
+       u8 control_table_cap;   /* Multiple of 64 entry unit CCTs */
+       u8 congestion_log_length;       /* reported as OPA_CONG_LOG_ELEMS */
+} __packed;
+
+/*
+ * Report the CongestionInfo attribute for @port into @data: a zero
+ * congestion_info word, the port's CCT capacity and the fixed congestion
+ * log length.  @resp_len, when non-NULL, grows by the attribute size.
+ * @am is not used.  Returns the result of reply() on @smp.
+ */
+static int __subn_get_opa_cong_info(struct opa_smp *smp, u32 am, u8 *data,
+                                   struct ib_device *ibdev, u8 port,
+                                   u32 *resp_len)
+{
+       struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
+       struct opa_congestion_info_attr *attr;
+
+       attr = (struct opa_congestion_info_attr *)data;
+       attr->congestion_log_length = OPA_CONG_LOG_ELEMS;
+       attr->control_table_cap = ppd->cc_max_table_entries;
+       attr->congestion_info = 0;
+
+       if (resp_len)
+               *resp_len += sizeof(struct opa_congestion_info_attr);
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+/*
+ * Report the currently published congestion setting for @port into @data,
+ * converting multi-byte fields to big-endian.  The cc_state is only read
+ * inside an RCU read-side critical section.  If no cc_state is published
+ * the reply is sent with @data and @resp_len untouched.  @am is not used.
+ * Returns the result of reply() on @smp.
+ */
+static int __subn_get_opa_cong_setting(struct opa_smp *smp, u32 am,
+                                      u8 *data, struct ib_device *ibdev,
+                                      u8 port, u32 *resp_len)
+{
+       struct opa_congestion_setting_attr *attr =
+               (struct opa_congestion_setting_attr *)data;
+       struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
+       struct cc_state *cc_state;
+       int sl;
+
+       rcu_read_lock();
+
+       cc_state = get_cc_state(ppd);
+       if (!cc_state)
+               goto unlock;
+
+       attr->port_control = cpu_to_be16(cc_state->cong_setting.port_control);
+       attr->control_map = cpu_to_be32(cc_state->cong_setting.control_map);
+
+       for (sl = 0; sl < OPA_MAX_SLS; sl++) {
+               struct opa_congestion_setting_entry_shadow *src =
+                       &cc_state->cong_setting.entries[sl];
+
+               attr->entries[sl].ccti_increase = src->ccti_increase;
+               attr->entries[sl].ccti_timer = cpu_to_be16(src->ccti_timer);
+               attr->entries[sl].trigger_threshold = src->trigger_threshold;
+               attr->entries[sl].ccti_min = src->ccti_min;
+       }
+
+       /* only a successful read accounts for the payload */
+       if (resp_len)
+               *resp_len += sizeof(*attr);
+
+unlock:
+       rcu_read_unlock();
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+/*
+ * Apply congestion control information stored in the ppd to the
+ * active structure.
+ *
+ * A new cc_state is allocated (before cc_state_lock is taken),
+ * initialised as a copy of the currently published one, overlaid with
+ * the CCT and congestion settings saved in @ppd, and then published
+ * with rcu_assign_pointer().  The previous cc_state is passed to
+ * call_rcu() with cc_state_reclaim() as the callback, which frees it.
+ *
+ * Nothing is changed if the allocation fails or if no cc_state is
+ * currently published; the function returns void, so neither case is
+ * reported to the caller.
+ *
+ * NOTE(review): the two memcpy() sizes are computed from
+ * struct ib_cc_table_entry and struct opa_congestion_setting_entry,
+ * while other code in this file accesses the same arrays through the
+ * *_shadow entry types - presumably both variants have the same size;
+ * confirm against the structure definitions.
+ */
+static void apply_cc_state(struct hfi1_pportdata *ppd)
+{
+       struct cc_state *old_cc_state, *new_cc_state;
+
+       new_cc_state = kzalloc(sizeof(*new_cc_state), GFP_KERNEL);
+       if (!new_cc_state)
+               return;
+
+       /*
+        * Hold the lock for updating *and* to prevent ppd information
+        * from changing during the update.
+        */
+       spin_lock(&ppd->cc_state_lock);
+
+       old_cc_state = get_cc_state(ppd);
+       if (!old_cc_state) {
+               /* never active, or shutting down */
+               spin_unlock(&ppd->cc_state_lock);
+               kfree(new_cc_state);
+               return;
+       }
+
+       *new_cc_state = *old_cc_state;  /* start from the published state */
+
+       new_cc_state->cct.ccti_limit = ppd->total_cct_entry - 1;
+       memcpy(new_cc_state->cct.entries, ppd->ccti_entries,
+              ppd->total_cct_entry * sizeof(struct ib_cc_table_entry));
+
+       new_cc_state->cong_setting.port_control = IB_CC_CCS_PC_SL_BASED;
+       new_cc_state->cong_setting.control_map = ppd->cc_sl_control_map;
+       memcpy(new_cc_state->cong_setting.entries, ppd->congestion_entries,
+              OPA_MAX_SLS * sizeof(struct opa_congestion_setting_entry));
+
+       rcu_assign_pointer(ppd->cc_state, new_cc_state);
+
+       spin_unlock(&ppd->cc_state_lock);
+
+       call_rcu(&old_cc_state->rcu, cc_state_reclaim); /* freed by the callback */
+}
+
+/*
+ * Store the congestion setting carried in @data into the port data for
+ * @port (converting multi-byte fields to CPU order), apply it through
+ * apply_cc_state(), and answer with the resulting setting as produced by
+ * __subn_get_opa_cong_setting().
+ */
+static int __subn_set_opa_cong_setting(struct opa_smp *smp, u32 am, u8 *data,
+                                      struct ib_device *ibdev, u8 port,
+                                      u32 *resp_len)
+{
+       struct opa_congestion_setting_attr *req =
+               (struct opa_congestion_setting_attr *)data;
+       struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
+       int sl;
+
+       /*
+        * The saved copy in the ppd is written under cc_state_lock so that
+        * it stays consistent for whoever applies the state.
+        */
+       spin_lock(&ppd->cc_state_lock);
+
+       ppd->cc_sl_control_map = be32_to_cpu(req->control_map);
+
+       for (sl = 0; sl < OPA_MAX_SLS; sl++) {
+               struct opa_congestion_setting_entry_shadow *dst =
+                       &ppd->congestion_entries[sl];
+
+               dst->ccti_increase = req->entries[sl].ccti_increase;
+               dst->ccti_timer = be16_to_cpu(req->entries[sl].ccti_timer);
+               dst->trigger_threshold = req->entries[sl].trigger_threshold;
+               dst->ccti_min = req->entries[sl].ccti_min;
+       }
+
+       spin_unlock(&ppd->cc_state_lock);
+
+       /* publish the saved values, then report what is now in effect */
+       apply_cc_state(ppd);
+
+       return __subn_get_opa_cong_setting(smp, am, data, ibdev, port,
+                                          resp_len);
+}
+
+/*
+ * __subn_get_opa_hfi1_cong_log - report the HFI congestion log
+ * @smp: the SMP being answered
+ * @am: attribute modifier; must be 0
+ * @data: payload area, written as a struct opa_hfi1_cong_log
+ * @ibdev: the device the MAD arrived on
+ * @port: the port number the MAD arrived on
+ * @resp_len: if non-NULL, incremented by sizeof(struct opa_hfi1_cong_log)
+ *
+ * Copies the threshold event counter/map and up to OPA_CONG_LOG_ELEMS
+ * logged events into @data under cc_log_lock, skipping events that are
+ * too old, and then resets the threshold event counter and map.  A
+ * non-zero @am sets IB_SMP_INVALID_FIELD and nothing is copied.
+ *
+ * Return: the result of reply() on @smp.
+ */
+static int __subn_get_opa_hfi1_cong_log(struct opa_smp *smp, u32 am,
+                                       u8 *data, struct ib_device *ibdev,
+                                       u8 port, u32 *resp_len)
+{
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       struct opa_hfi1_cong_log *cong_log = (struct opa_hfi1_cong_log *)data;
+       s64 ts;
+       int i;
+
+       if (am != 0) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       spin_lock_irq(&ppd->cc_log_lock);
+
+       cong_log->log_type = OPA_CC_LOG_TYPE_HFI;
+       cong_log->congestion_flags = 0;
+       cong_log->threshold_event_counter =
+               cpu_to_be16(ppd->threshold_event_counter);
+       memcpy(cong_log->threshold_cong_event_map,
+              ppd->threshold_cong_event_map,
+              sizeof(cong_log->threshold_cong_event_map));
+       /* keep timestamp in units of 1.024 usec */
+       ts = ktime_to_ns(ktime_get()) / 1024;
+       cong_log->current_time_stamp = cpu_to_be32(ts);
+       for (i = 0; i < OPA_CONG_LOG_ELEMS; i++) {
+               /* walk the event ring from cc_mad_idx, wrapping at the end */
+               struct opa_hfi1_cong_log_event_internal *cce =
+                       &ppd->cc_events[ppd->cc_mad_idx++];
+               if (ppd->cc_mad_idx == OPA_CONG_LOG_ELEMS)
+                       ppd->cc_mad_idx = 0;
+               /*
+                * Entries which are older than twice the time
+                * required to wrap the counter are supposed to
+                * be zeroed (CA10-49 IBTA, release 1.2.1, V1).
+                *
+                * The limit is computed in 64 bits: a plain
+                * 2 * UINT_MAX is evaluated as unsigned int and
+                * wraps around to UINT_MAX - 1.
+                */
+               if ((u64)(ts - cce->timestamp) > 2ULL * UINT_MAX)
+                       continue;
+               memcpy(cong_log->events[i].local_qp_cn_entry, &cce->lqpn, 3);
+               memcpy(cong_log->events[i].remote_qp_number_cn_entry,
+                      &cce->rqpn, 3);
+               cong_log->events[i].sl_svc_type_cn_entry =
+                       ((cce->sl & 0x1f) << 3) | (cce->svc_type & 0x7);
+               cong_log->events[i].remote_lid_cn_entry =
+                       cpu_to_be32(cce->rlid);
+               cong_log->events[i].timestamp_cn_entry =
+                       cpu_to_be32(cce->timestamp);
+       }
+
+       /*
+        * Reset threshold_cong_event_map, and threshold_event_counter
+        * to 0 when log is read.
+        */
+       memset(ppd->threshold_cong_event_map, 0x0,
+              sizeof(ppd->threshold_cong_event_map));
+       ppd->threshold_event_counter = 0;
+
+       spin_unlock_irq(&ppd->cc_log_lock);
+
+       if (resp_len)
+               *resp_len += sizeof(struct opa_hfi1_cong_log);
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+/*
+ * Report a range of congestion control table blocks for @port.
+ *
+ * @am encodes the first block and the number of blocks.  A block count of
+ * zero, or a range that runs past ppd->cc_max_table_entries, sets
+ * IB_SMP_INVALID_FIELD.  Otherwise the published ccti_limit and
+ * IB_CCT_ENTRIES entries per requested block are written to @data in
+ * big-endian form, and @resp_len (when non-NULL) grows accordingly.  If no
+ * cc_state is published the reply is sent without touching @data.
+ * Returns the result of reply() on @smp.
+ */
+static int __subn_get_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct ib_cc_table_attr *attr = (struct ib_cc_table_attr *)data;
+       struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
+       const u32 first_blk = OPA_AM_START_BLK(am);
+       const u32 blk_cnt = OPA_AM_NBLK(am);
+       struct ib_cc_table_entry_shadow *src;
+       struct cc_state *cc_state;
+       u32 first, count, k;
+
+       /* the requested block range has to be non-empty and in bounds */
+       if (blk_cnt == 0 ||
+           first_blk + blk_cnt > ppd->cc_max_table_entries) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       rcu_read_lock();
+
+       cc_state = get_cc_state(ppd);
+       if (!cc_state)
+               goto unlock;
+
+       first = first_blk * IB_CCT_ENTRIES;
+       count = IB_CCT_ENTRIES * blk_cnt;
+
+       attr->ccti_limit = cpu_to_be16(cc_state->cct.ccti_limit);
+
+       /* whole blocks are returned, even if the last one is not full */
+       src = cc_state->cct.entries;
+       for (k = 0; k < count; k++)
+               attr->ccti_entries[k].entry =
+                       cpu_to_be16(src[first + k].entry);
+
+       /* one u16 for ccti_limit plus one per returned entry */
+       if (resp_len)
+               *resp_len += sizeof(u16) * (IB_CCT_ENTRIES * blk_cnt + 1);
+
+unlock:
+       rcu_read_unlock();
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+/*
+ * RCU callback: free the cc_state that embeds @rcu.
+ */
+void cc_state_reclaim(struct rcu_head *rcu)
+{
+       kfree(container_of(rcu, struct cc_state, rcu));
+}
+
+/*
+ * Store a range of congestion control table blocks for @port.
+ *
+ * @am encodes the first block and the number of blocks; @data carries the
+ * new ccti_limit followed by the entries.  An empty or out-of-range block
+ * range, or a ccti_limit that does not fall inside the last block sent,
+ * sets IB_SMP_INVALID_FIELD.  Otherwise the entries are saved in the port
+ * data, applied through apply_cc_state(), and the reply is produced by
+ * __subn_get_opa_cc_table().
+ */
+static int __subn_set_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct ib_cc_table_attr *req = (struct ib_cc_table_attr *)data;
+       struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
+       const u32 first_blk = OPA_AM_START_BLK(am);
+       const u32 blk_cnt = OPA_AM_NBLK(am);
+       struct ib_cc_table_entry_shadow *dst;
+       u32 first, end, idx;
+       u16 ccti_limit;
+
+       /* the block range has to be non-empty and in bounds */
+       if (blk_cnt == 0 ||
+           first_blk + blk_cnt > ppd->cc_max_table_entries) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       ccti_limit = be16_to_cpu(req->ccti_limit);
+
+       /*
+        * All blocks but the last are full; the last one ends at the
+        * position of ccti_limit within its block.
+        */
+       first = first_blk * IB_CCT_ENTRIES;
+       end = first + ((blk_cnt - 1) * IB_CCT_ENTRIES) +
+             ccti_limit % IB_CCT_ENTRIES + 1;
+
+       /* ccti_limit must not lie beyond the entries being written */
+       if (ccti_limit + 1 > end) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       /*
+        * The saved copy in the ppd is written under cc_state_lock so that
+        * it stays consistent for whoever applies the state.
+        */
+       spin_lock(&ppd->cc_state_lock);
+       ppd->total_cct_entry = ccti_limit + 1;
+       dst = ppd->ccti_entries;
+       for (idx = first; idx < end; idx++)
+               dst[idx].entry =
+                       be16_to_cpu(req->ccti_entries[idx - first].entry);
+       spin_unlock(&ppd->cc_state_lock);
+
+       /* publish the saved values, then report what is now in effect */
+       apply_cc_state(ppd);
+
+       return __subn_get_opa_cc_table(smp, am, data, ibdev, port, resp_len);
+}
+
+struct opa_led_info {
+       __be32 rsvd_led_mask;   /* big-endian; bit OPA_LED_SHIFT carries the LED flag */
+       __be32 rsvd;            /* not accessed by the LedInfo handlers below */
+};
+
+#define OPA_LED_SHIFT  31      /* bit position of the LED flag in rsvd_led_mask */
+#define OPA_LED_MASK   BIT(OPA_LED_SHIFT)
+
+/*
+ * Report whether LED beaconing is active into @data.
+ *
+ * The port count encoded in @am must be 1, otherwise IB_SMP_INVALID_FIELD
+ * is set.  The state is taken from led_override_timer_active of dd->pport;
+ * @port is only passed through, not used to select the port data.
+ * @resp_len, when non-NULL, grows by sizeof(struct opa_led_info).
+ * Returns the result of reply() on @smp.
+ */
+static int __subn_get_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct hfi1_pportdata *ppd = dd_from_ibdev(ibdev)->pport;
+       struct opa_led_info *info = (struct opa_led_info *)data;
+       u32 led_on;
+
+       if (OPA_AM_NPORT(am) != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       /*
+        * Read barrier matching the memory barrier in
+        * hfi1_start_led_override, so that the beaconing state read from
+        * led_override_timer_active below is the correct one.
+        */
+       smp_rmb();
+       led_on = atomic_read(&ppd->led_override_timer_active) ? 1 : 0;
+       info->rsvd_led_mask = cpu_to_be32(led_on << OPA_LED_SHIFT);
+
+       if (resp_len)
+               *resp_len += sizeof(*info);
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+/*
+ * Start or stop the LED override on dd->pport according to the LED flag
+ * in @data, then answer with the state reported by
+ * __subn_get_opa_led_info().  The port count encoded in @am must be 1,
+ * otherwise IB_SMP_INVALID_FIELD is set and the LED is left alone.
+ */
+static int __subn_set_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       const struct opa_led_info *req = (const struct opa_led_info *)data;
+
+       if (OPA_AM_NPORT(am) != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       if (be32_to_cpu(req->rsvd_led_mask) & OPA_LED_MASK)
+               hfi1_start_led_override(dd->pport, 2000, 1500);
+       else
+               shutdown_led_override(dd->pport);
+
+       return __subn_get_opa_led_info(smp, am, data, ibdev, port, resp_len);
+}
+
+/*
+ * Dispatch a subnet management Get to the handler for @attr_id.
+ *
+ * Every handler receives the same (@smp, @am, @data, @ibdev, @port,
+ * @resp_len) arguments and its return value is passed straight back.
+ * SMInfo is not answered here: the MAD is consumed when the SM is
+ * disabled and passed on with plain success when the port has the SM
+ * capability.  Anything else, including SMInfo on a port without either
+ * flag, is answered with IB_SMP_UNSUP_METH_ATTR.
+ */
+static int subn_get_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am,
+                           u8 *data, struct ib_device *ibdev, u8 port,
+                           u32 *resp_len)
+{
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+
+       switch (attr_id) {
+       case IB_SMP_ATTR_NODE_DESC:
+               return __subn_get_opa_nodedesc(smp, am, data, ibdev, port,
+                                              resp_len);
+       case IB_SMP_ATTR_NODE_INFO:
+               return __subn_get_opa_nodeinfo(smp, am, data, ibdev, port,
+                                              resp_len);
+       case IB_SMP_ATTR_PORT_INFO:
+               return __subn_get_opa_portinfo(smp, am, data, ibdev, port,
+                                              resp_len);
+       case IB_SMP_ATTR_PKEY_TABLE:
+               return __subn_get_opa_pkeytable(smp, am, data, ibdev, port,
+                                               resp_len);
+       case OPA_ATTRIB_ID_SL_TO_SC_MAP:
+               return __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port,
+                                              resp_len);
+       case OPA_ATTRIB_ID_SC_TO_SL_MAP:
+               return __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port,
+                                              resp_len);
+       case OPA_ATTRIB_ID_SC_TO_VLT_MAP:
+               return __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port,
+                                               resp_len);
+       case OPA_ATTRIB_ID_SC_TO_VLNT_MAP:
+               return __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port,
+                                                resp_len);
+       case OPA_ATTRIB_ID_PORT_STATE_INFO:
+               return __subn_get_opa_psi(smp, am, data, ibdev, port,
+                                         resp_len);
+       case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE:
+               return __subn_get_opa_bct(smp, am, data, ibdev, port,
+                                         resp_len);
+       case OPA_ATTRIB_ID_CABLE_INFO:
+               return __subn_get_opa_cable_info(smp, am, data, ibdev, port,
+                                                resp_len);
+       case IB_SMP_ATTR_VL_ARB_TABLE:
+               return __subn_get_opa_vl_arb(smp, am, data, ibdev, port,
+                                            resp_len);
+       case OPA_ATTRIB_ID_CONGESTION_INFO:
+               return __subn_get_opa_cong_info(smp, am, data, ibdev, port,
+                                               resp_len);
+       case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING:
+               return __subn_get_opa_cong_setting(smp, am, data, ibdev,
+                                                  port, resp_len);
+       case OPA_ATTRIB_ID_HFI_CONGESTION_LOG:
+               return __subn_get_opa_hfi1_cong_log(smp, am, data, ibdev,
+                                                   port, resp_len);
+       case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE:
+               return __subn_get_opa_cc_table(smp, am, data, ibdev, port,
+                                              resp_len);
+       case IB_SMP_ATTR_LED_INFO:
+               return __subn_get_opa_led_info(smp, am, data, ibdev, port,
+                                              resp_len);
+       case IB_SMP_ATTR_SM_INFO:
+               if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED)
+                       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+               if (ibp->rvp.port_cap_flags & IB_PORT_SM)
+                       return IB_MAD_RESULT_SUCCESS;
+               /* FALLTHROUGH */
+       default:
+               break;
+       }
+
+       /* no handler took the attribute */
+       smp->status |= IB_SMP_UNSUP_METH_ATTR;
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+/*
+ * Dispatch a subnet management Set to the handler for @attr_id.
+ *
+ * Every handler receives the same (@smp, @am, @data, @ibdev, @port,
+ * @resp_len) arguments and its return value is passed straight back.
+ * SMInfo is not answered here: the MAD is consumed when the SM is
+ * disabled and passed on with plain success when the port has the SM
+ * capability.  Anything else, including SMInfo on a port without either
+ * flag, is answered with IB_SMP_UNSUP_METH_ATTR.
+ */
+static int subn_set_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am,
+                           u8 *data, struct ib_device *ibdev, u8 port,
+                           u32 *resp_len)
+{
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+
+       switch (attr_id) {
+       case IB_SMP_ATTR_PORT_INFO:
+               return __subn_set_opa_portinfo(smp, am, data, ibdev, port,
+                                              resp_len);
+       case IB_SMP_ATTR_PKEY_TABLE:
+               return __subn_set_opa_pkeytable(smp, am, data, ibdev, port,
+                                               resp_len);
+       case OPA_ATTRIB_ID_SL_TO_SC_MAP:
+               return __subn_set_opa_sl_to_sc(smp, am, data, ibdev, port,
+                                              resp_len);
+       case OPA_ATTRIB_ID_SC_TO_SL_MAP:
+               return __subn_set_opa_sc_to_sl(smp, am, data, ibdev, port,
+                                              resp_len);
+       case OPA_ATTRIB_ID_SC_TO_VLT_MAP:
+               return __subn_set_opa_sc_to_vlt(smp, am, data, ibdev, port,
+                                               resp_len);
+       case OPA_ATTRIB_ID_SC_TO_VLNT_MAP:
+               return __subn_set_opa_sc_to_vlnt(smp, am, data, ibdev, port,
+                                                resp_len);
+       case OPA_ATTRIB_ID_PORT_STATE_INFO:
+               return __subn_set_opa_psi(smp, am, data, ibdev, port,
+                                         resp_len);
+       case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE:
+               return __subn_set_opa_bct(smp, am, data, ibdev, port,
+                                         resp_len);
+       case IB_SMP_ATTR_VL_ARB_TABLE:
+               return __subn_set_opa_vl_arb(smp, am, data, ibdev, port,
+                                            resp_len);
+       case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING:
+               return __subn_set_opa_cong_setting(smp, am, data, ibdev,
+                                                  port, resp_len);
+       case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE:
+               return __subn_set_opa_cc_table(smp, am, data, ibdev, port,
+                                              resp_len);
+       case IB_SMP_ATTR_LED_INFO:
+               return __subn_set_opa_led_info(smp, am, data, ibdev, port,
+                                              resp_len);
+       case IB_SMP_ATTR_SM_INFO:
+               if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED)
+                       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+               if (ibp->rvp.port_cap_flags & IB_PORT_SM)
+                       return IB_MAD_RESULT_SUCCESS;
+               /* FALLTHROUGH */
+       default:
+               break;
+       }
+
+       /* no handler took the attribute */
+       smp->status |= IB_SMP_UNSUP_METH_ATTR;
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+/* Flag an aggregate segment as failed: set bit 15 of its err_reqlength. */
+static inline void set_aggr_error(struct opa_aggregate *ag)
+{
+       const __be16 err_flag = cpu_to_be16(0x8000);
+
+       ag->err_reqlength |= err_flag;
+}
+
+/*
+ * subn_get_opa_aggregate - process an Aggregate Get
+ * @smp: the SMP; its data area holds the aggregate segments
+ * @ibdev: the device the MAD arrived on
+ * @port: the port number the MAD arrived on
+ * @resp_len: if non-NULL, incremented by the size of every segment parsed
+ *
+ * The low byte of the attribute modifier gives the number of segments
+ * (1..117).  Each segment is a struct opa_aggregate header followed by a
+ * payload whose length, in units of 8 bytes, is the low 7 bits of
+ * err_reqlength.  For each segment the payload is zeroed and the
+ * attribute is fetched through subn_get_opa_sma().  Processing stops at
+ * the first segment that leaves a status other than IB_SMP_DIRECTION in
+ * @smp; that segment is marked with set_aggr_error().  A segment that
+ * does not fit inside the SMP sets IB_SMP_INVALID_FIELD.
+ *
+ * Return: the result of reply() on @smp.
+ */
+static int subn_get_opa_aggregate(struct opa_smp *smp,
+                                 struct ib_device *ibdev, u8 port,
+                                 u32 *resp_len)
+{
+       int i;
+       u32 num_attr = be32_to_cpu(smp->attr_mod) & 0x000000ff;
+       u8 *next_smp = opa_get_smp_data(smp);
+       u8 *smp_end = ((u8 *)smp) + sizeof(*smp);
+
+       if (num_attr < 1 || num_attr > 117) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       for (i = 0; i < num_attr; i++) {
+               struct opa_aggregate *agg;
+               size_t agg_data_len;
+               size_t agg_size;
+               u32 am;
+
+               /*
+                * The segment header has to lie inside the SMP before any
+                * of its fields may be read.
+                */
+               if ((size_t)(smp_end - next_smp) < sizeof(*agg)) {
+                       smp->status |= IB_SMP_INVALID_FIELD;
+                       return reply((struct ib_mad_hdr *)smp);
+               }
+
+               agg = (struct opa_aggregate *)next_smp;
+               agg_data_len = (be16_to_cpu(agg->err_reqlength) & 0x007f) * 8;
+               agg_size = sizeof(*agg) + agg_data_len;
+               am = be32_to_cpu(agg->attr_mod);
+
+               if (resp_len)
+                       *resp_len += agg_size;
+
+               /* header plus payload must fit as well */
+               if (agg_size > (size_t)(smp_end - next_smp)) {
+                       smp->status |= IB_SMP_INVALID_FIELD;
+                       return reply((struct ib_mad_hdr *)smp);
+               }
+
+               /* zero the payload for this segment */
+               memset(next_smp + sizeof(*agg), 0, agg_data_len);
+
+               /* the outcome is reported through smp->status */
+               (void)subn_get_opa_sma(agg->attr_id, smp, am, agg->data,
+                                       ibdev, port, NULL);
+               if (smp->status & ~IB_SMP_DIRECTION) {
+                       set_aggr_error(agg);
+                       return reply((struct ib_mad_hdr *)smp);
+               }
+               next_smp += agg_size;
+       }
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+/*
+ * subn_set_opa_aggregate - process an Aggregate Set
+ * @smp: the SMP; its data area holds the aggregate segments
+ * @ibdev: the device the MAD arrived on
+ * @port: the port number the MAD arrived on
+ * @resp_len: if non-NULL, incremented by the size of every segment parsed
+ *
+ * The low byte of the attribute modifier gives the number of segments
+ * (1..117).  Each segment is a struct opa_aggregate header followed by a
+ * payload whose length, in units of 8 bytes, is the low 7 bits of
+ * err_reqlength.  Each segment is applied through subn_set_opa_sma().
+ * Processing stops at the first segment that leaves a status other than
+ * IB_SMP_DIRECTION in @smp; that segment is marked with
+ * set_aggr_error().  A segment that does not fit inside the SMP sets
+ * IB_SMP_INVALID_FIELD.
+ *
+ * Return: the result of reply() on @smp.
+ */
+static int subn_set_opa_aggregate(struct opa_smp *smp,
+                                 struct ib_device *ibdev, u8 port,
+                                 u32 *resp_len)
+{
+       int i;
+       u32 num_attr = be32_to_cpu(smp->attr_mod) & 0x000000ff;
+       u8 *next_smp = opa_get_smp_data(smp);
+       u8 *smp_end = ((u8 *)smp) + sizeof(*smp);
+
+       if (num_attr < 1 || num_attr > 117) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       for (i = 0; i < num_attr; i++) {
+               struct opa_aggregate *agg;
+               size_t agg_data_len;
+               size_t agg_size;
+               u32 am;
+
+               /*
+                * The segment header has to lie inside the SMP before any
+                * of its fields may be read.
+                */
+               if ((size_t)(smp_end - next_smp) < sizeof(*agg)) {
+                       smp->status |= IB_SMP_INVALID_FIELD;
+                       return reply((struct ib_mad_hdr *)smp);
+               }
+
+               agg = (struct opa_aggregate *)next_smp;
+               agg_data_len = (be16_to_cpu(agg->err_reqlength) & 0x007f) * 8;
+               agg_size = sizeof(*agg) + agg_data_len;
+               am = be32_to_cpu(agg->attr_mod);
+
+               if (resp_len)
+                       *resp_len += agg_size;
+
+               /* header plus payload must fit as well */
+               if (agg_size > (size_t)(smp_end - next_smp)) {
+                       smp->status |= IB_SMP_INVALID_FIELD;
+                       return reply((struct ib_mad_hdr *)smp);
+               }
+
+               /* the outcome is reported through smp->status */
+               (void)subn_set_opa_sma(agg->attr_id, smp, am, agg->data,
+                                       ibdev, port, NULL);
+               if (smp->status & ~IB_SMP_DIRECTION) {
+                       set_aggr_error(agg);
+                       return reply((struct ib_mad_hdr *)smp);
+               }
+               next_smp += agg_size;
+       }
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+/*
+ * OPAv1 specifies that, on the transition to link up, these counters
+ * are cleared:
+ *   PortRcvErrors [*]
+ *   LinkErrorRecovery
+ *   LocalLinkIntegrityErrors
+ *   ExcessiveBufferOverruns [*]
+ *
+ * [*] Error info associated with these counters is retained, but the
+ * error info status is reset to 0.
+ *
+ * Each counter is cleared by writing 0 through write_dev_cntr(); the
+ * software receive-overflow count (dd->rcv_ovfl_cnt) is zeroed as well.
+ *
+ * NOTE(review): the final statement clears the status bit of
+ * err_info_xmit_constraint, which is not one of the error infos listed
+ * above; no excessive-buffer-overrun error info status is reset here.
+ * Confirm whether that is intended.
+ */
+void clear_linkup_counters(struct hfi1_devdata *dd)
+{
+       /* PortRcvErrors, plus the status bit of its error info */
+       write_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL, 0);
+       dd->err_info_rcvport.status_and_code &= ~OPA_EI_STATUS_SMASK;
+       /* LinkErrorRecovery */
+       write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0);
+       write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL, 0);
+       /* LocalLinkIntegrityErrors: both the TX and RX replay counters */
+       write_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL, 0);
+       write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0);
+       /* ExcessiveBufferOverruns */
+       write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0);
+       dd->rcv_ovfl_cnt = 0;
+       dd->err_info_xmit_constraint.status &= ~OPA_EI_STATUS_SMASK;
+}
+
+/*
+ * is_local_mad() - tell whether 'mad' was sent from, and is destined to,
+ * the local node.
+ *
+ * A directed route SMP counts as local when its hop count is zero and
+ * both dr_slid and dr_dlid are the permissive LID.  Any other MAD counts
+ * as local when the source LID of the work completion equals the LID of
+ * this port.
+ *
+ * Returns 1 for a local MAD, 0 otherwise.
+ */
+static int is_local_mad(struct hfi1_ibport *ibp, const struct opa_mad *mad,
+                       const struct ib_wc *in_wc)
+{
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       const struct opa_smp *smp = (const struct opa_smp *)mad;
+
+       /* Not directed route: compare the source LID with our own. */
+       if (smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+               return in_wc->slid == ppd->lid;
+
+       if (smp->hop_cnt != 0)
+               return 0;
+       if (smp->route.dr.dr_slid != OPA_LID_PERMISSIVE)
+               return 0;
+
+       return smp->route.dr.dr_dlid == OPA_LID_PERMISSIVE;
+}
+
+/*
+ * opa_local_smp_check() should only be called on MADs for which
+ * is_local_mad() returns true. It applies the SMP checks that are
+ * specific to SMPs which are sent from, and destined to this node.
+ * SMPs which arrive from other nodes are instead checked by
+ * opa_smp_check().
+ *
+ * The "node-local" checks specified in OPAv1, rev 0.90, section 9.10.26
+ * are:
+ *   - pkey is 0x7fff, or 0xffff
+ *   - Source QPN == 0 || Destination QPN == 0
+ *   - the MAD header's management class is either
+ *     IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE or
+ *     IB_MGMT_CLASS_SUBN_LID_ROUTED
+ *   - SLID != 0
+ *
+ * Only the pkey needs to be looked at here, because for local SMPs the
+ * MAD stack is known to pass MADs with:
+ *   - Source QPN of 0
+ *   - MAD mgmt_class is IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+ *   - SLID is either: OPA_LID_PERMISSIVE (0xFFFFFFFF), or
+ *     our own port's lid
+ *
+ * Returns 0 if the SMP passes its checks, 1 otherwise.  A pkey index
+ * outside the port's pkey table fails without further reporting; a pkey
+ * that is not a management pkey is reported through
+ * ingress_pkey_table_fail() before failing.
+ */
+static int opa_local_smp_check(struct hfi1_ibport *ibp,
+                              const struct ib_wc *in_wc)
+{
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       u16 slid = in_wc->slid;
+       u16 pkey;
+
+       if (in_wc->pkey_index >= ARRAY_SIZE(ppd->pkeys))
+               return 1;
+
+       pkey = ppd->pkeys[in_wc->pkey_index];
+       if (pkey != LIM_MGMT_P_KEY && pkey != FULL_MGMT_P_KEY) {
+               ingress_pkey_table_fail(ppd, pkey, slid);
+               return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * Process an OPA subnet management packet.
+ *
+ * The incoming MAD is copied into 'out_mad' and all further work is done
+ * on that copy.  The class version and the M_Key are validated first;
+ * an M_Key failure returns IB_MAD_RESULT_FAILURE.  After that *resp_len
+ * is initialised to the SMP header size and the request is dispatched on
+ * its method: Get and Set go to the aggregate handlers for
+ * OPA_ATTRIB_ID_AGGREGATE and to the per-attribute handlers otherwise.
+ */
+static int process_subn_opa(struct ib_device *ibdev, int mad_flags,
+                           u8 port, const struct opa_mad *in_mad,
+                           struct opa_mad *out_mad,
+                           u32 *resp_len)
+{
+       struct opa_smp *smp = (struct opa_smp *)out_mad;
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct ib_mad_hdr *hdr = (struct ib_mad_hdr *)smp;
+       u8 *data;
+       u32 am;
+       __be16 attr_id;
+
+       *out_mad = *in_mad;
+       data = opa_get_smp_data(smp);
+       am = be32_to_cpu(smp->attr_mod);
+       attr_id = smp->attr_id;
+
+       if (smp->class_version != OPA_SMI_CLASS_VERSION) {
+               smp->status |= IB_SMP_UNSUP_VERSION;
+               return reply(hdr);
+       }
+
+       if (check_mkey(ibp, hdr, mad_flags, smp->mkey,
+                      smp->route.dr.dr_slid, smp->route.dr.return_path,
+                      smp->hop_cnt)) {
+               u32 port_num = be32_to_cpu(smp->attr_mod);
+               int is_get_or_set = smp->method == IB_MGMT_METHOD_GET ||
+                                   smp->method == IB_MGMT_METHOD_SET;
+
+               /*
+                * For a get/set portinfo aimed at another port, the M_Key
+                * of that port is only checked elsewhere when the M_Key
+                * is OK on the receiving port.  Check it here too, so
+                * that the error counters are incremented when the M_Key
+                * fails to match on *both* ports.
+                */
+               if (attr_id == IB_SMP_ATTR_PORT_INFO && is_get_or_set &&
+                   port_num && port_num <= ibdev->phys_port_cnt &&
+                   port != port_num)
+                       (void)check_mkey(to_iport(ibdev, port_num), hdr, 0,
+                                         smp->mkey, smp->route.dr.dr_slid,
+                                         smp->route.dr.return_path,
+                                         smp->hop_cnt);
+               return IB_MAD_RESULT_FAILURE;
+       }
+
+       *resp_len = opa_get_smp_header_size(smp);
+
+       switch (smp->method) {
+       case IB_MGMT_METHOD_GET:
+               if (attr_id == OPA_ATTRIB_ID_AGGREGATE)
+                       return subn_get_opa_aggregate(smp, ibdev, port,
+                                                     resp_len);
+               /* Single attribute Get: start from an empty data area. */
+               clear_opa_smp_data(smp);
+               return subn_get_opa_sma(attr_id, smp, am, data,
+                                       ibdev, port, resp_len);
+       case IB_MGMT_METHOD_SET:
+               if (attr_id == OPA_ATTRIB_ID_AGGREGATE)
+                       return subn_set_opa_aggregate(smp, ibdev, port,
+                                                     resp_len);
+               return subn_set_opa_sma(attr_id, smp, am, data,
+                                       ibdev, port, resp_len);
+       case IB_MGMT_METHOD_TRAP:
+       case IB_MGMT_METHOD_REPORT:
+       case IB_MGMT_METHOD_REPORT_RESP:
+       case IB_MGMT_METHOD_GET_RESP:
+               /*
+                * The ib_mad module will call us to process responses
+                * before checking for other consumers.
+                * Just tell the caller to process it normally.
+                */
+               return IB_MAD_RESULT_SUCCESS;
+       default:
+               smp->status |= IB_SMP_UNSUP_METHOD;
+               return reply(hdr);
+       }
+}
+
+/*
+ * Process an IB (non-OPA) subnet management packet.
+ *
+ * The incoming MAD is copied into 'out_mad' and handled in place.  After
+ * the class version and M_Key checks, only Get(NodeInfo) is serviced;
+ * any other attribute on a Get is answered with IB_SMP_UNSUP_METH_ATTR.
+ * Any method other than Get returns the zero left in 'ret' by the
+ * successful M_Key check.
+ */
+static int process_subn(struct ib_device *ibdev, int mad_flags,
+                       u8 port, const struct ib_mad *in_mad,
+                       struct ib_mad *out_mad)
+{
+       struct ib_smp *smp = (struct ib_smp *)out_mad;
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct ib_mad_hdr *hdr = (struct ib_mad_hdr *)smp;
+       int ret;
+
+       *out_mad = *in_mad;
+       if (smp->class_version != 1) {
+               smp->status |= IB_SMP_UNSUP_VERSION;
+               return reply(hdr);
+       }
+
+       ret = check_mkey(ibp, hdr, mad_flags,
+                        smp->mkey, (__force __be32)smp->dr_slid,
+                        smp->return_path, smp->hop_cnt);
+       if (ret) {
+               u32 port_num = be32_to_cpu(smp->attr_mod);
+               int is_get_or_set = smp->method == IB_MGMT_METHOD_GET ||
+                                   smp->method == IB_MGMT_METHOD_SET;
+
+               /*
+                * For a get/set portinfo aimed at another port, the M_Key
+                * of that port is only checked elsewhere when the M_Key
+                * is OK on the receiving port.  Check it here too, so
+                * that the error counters are incremented when the M_Key
+                * fails to match on *both* ports.
+                */
+               if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO &&
+                   is_get_or_set &&
+                   port_num && port_num <= ibdev->phys_port_cnt &&
+                   port != port_num)
+                       (void)check_mkey(to_iport(ibdev, port_num), hdr, 0,
+                                        smp->mkey,
+                                        (__force __be32)smp->dr_slid,
+                                        smp->return_path, smp->hop_cnt);
+               return IB_MAD_RESULT_FAILURE;
+       }
+
+       /* Only Get is handled; 'ret' is still zero at this point. */
+       if (smp->method != IB_MGMT_METHOD_GET)
+               return ret;
+
+       if (smp->attr_id == IB_SMP_ATTR_NODE_INFO)
+               return subn_get_nodeinfo(smp, ibdev, port);
+
+       smp->status |= IB_SMP_UNSUP_METH_ATTR;
+       return reply(hdr);
+}
+
+/*
+ * Process an IB (non-OPA) performance management packet.
+ *
+ * The incoming MAD is copied into 'out_mad' and handled in place.  Get
+ * is supported for PortCounters, PortCountersExtended and ClassPortInfo.
+ * A Set with a non-zero attribute ID is answered with
+ * IB_SMP_UNSUP_METH_ATTR, while a Set with attribute ID zero returns
+ * IB_MAD_RESULT_FAILURE without a reply.
+ */
+static int process_perf(struct ib_device *ibdev, u8 port,
+                       const struct ib_mad *in_mad,
+                       struct ib_mad *out_mad)
+{
+       struct ib_pma_mad *pmp = (struct ib_pma_mad *)out_mad;
+       struct ib_mad_hdr *hdr = (struct ib_mad_hdr *)pmp;
+       struct ib_class_port_info *cpi;
+
+       *out_mad = *in_mad;
+       if (pmp->mad_hdr.class_version != 1) {
+               pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION;
+               return reply(hdr);
+       }
+
+       switch (pmp->mad_hdr.method) {
+       case IB_MGMT_METHOD_GET:
+               switch (pmp->mad_hdr.attr_id) {
+               case IB_PMA_PORT_COUNTERS:
+                       return pma_get_ib_portcounters(pmp, ibdev, port);
+               case IB_PMA_PORT_COUNTERS_EXT:
+                       return pma_get_ib_portcounters_ext(pmp, ibdev, port);
+               case IB_PMA_CLASS_PORT_INFO:
+                       cpi = (struct ib_class_port_info *)&pmp->data;
+                       cpi->capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH;
+                       break;
+               default:
+                       pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+                       break;
+               }
+               return reply(hdr);
+
+       case IB_MGMT_METHOD_SET:
+               if (!pmp->mad_hdr.attr_id)
+                       return IB_MAD_RESULT_FAILURE;
+               pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+               return reply(hdr);
+
+       case IB_MGMT_METHOD_TRAP:
+       case IB_MGMT_METHOD_GET_RESP:
+               /*
+                * The ib_mad module will call us to process responses
+                * before checking for other consumers.
+                * Just tell the caller to process it normally.
+                */
+               return IB_MAD_RESULT_SUCCESS;
+
+       default:
+               pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD;
+               return reply(hdr);
+       }
+}
+
+/*
+ * Process an OPA performance management packet.
+ *
+ * The incoming MAD is copied into 'out_mad' and handled in place.  Once
+ * the class version has been accepted, *resp_len starts out as the size
+ * of the MAD header and the request is dispatched on method and
+ * attribute ID.  A Get or Set of an attribute that has no handler is
+ * answered with IB_SMP_UNSUP_METH_ATTR.
+ */
+static int process_perf_opa(struct ib_device *ibdev, u8 port,
+                           const struct opa_mad *in_mad,
+                           struct opa_mad *out_mad, u32 *resp_len)
+{
+       struct opa_pma_mad *pmp = (struct opa_pma_mad *)out_mad;
+       struct ib_mad_hdr *hdr = (struct ib_mad_hdr *)pmp;
+
+       *out_mad = *in_mad;
+
+       if (pmp->mad_hdr.class_version != OPA_SMI_CLASS_VERSION) {
+               pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION;
+               return reply(hdr);
+       }
+
+       *resp_len = sizeof(pmp->mad_hdr);
+
+       switch (pmp->mad_hdr.method) {
+       case IB_MGMT_METHOD_GET:
+               switch (pmp->mad_hdr.attr_id) {
+               case IB_PMA_CLASS_PORT_INFO:
+                       return pma_get_opa_classportinfo(pmp, ibdev, resp_len);
+               case OPA_PM_ATTRIB_ID_PORT_STATUS:
+                       return pma_get_opa_portstatus(pmp, ibdev, port,
+                                                     resp_len);
+               case OPA_PM_ATTRIB_ID_DATA_PORT_COUNTERS:
+                       return pma_get_opa_datacounters(pmp, ibdev, port,
+                                                       resp_len);
+               case OPA_PM_ATTRIB_ID_ERROR_PORT_COUNTERS:
+                       return pma_get_opa_porterrors(pmp, ibdev, port,
+                                                     resp_len);
+               case OPA_PM_ATTRIB_ID_ERROR_INFO:
+                       return pma_get_opa_errorinfo(pmp, ibdev, port,
+                                                    resp_len);
+               default:
+                       break;
+               }
+               break;
+
+       case IB_MGMT_METHOD_SET:
+               switch (pmp->mad_hdr.attr_id) {
+               case OPA_PM_ATTRIB_ID_CLEAR_PORT_STATUS:
+                       return pma_set_opa_portstatus(pmp, ibdev, port,
+                                                     resp_len);
+               case OPA_PM_ATTRIB_ID_ERROR_INFO:
+                       return pma_set_opa_errorinfo(pmp, ibdev, port,
+                                                    resp_len);
+               default:
+                       break;
+               }
+               break;
+
+       case IB_MGMT_METHOD_TRAP:
+       case IB_MGMT_METHOD_GET_RESP:
+               /*
+                * The ib_mad module will call us to process responses
+                * before checking for other consumers.
+                * Just tell the caller to process it normally.
+                */
+               return IB_MAD_RESULT_SUCCESS;
+
+       default:
+               pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD;
+               return reply(hdr);
+       }
+
+       /* Get or Set of an attribute without a handler above. */
+       pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+       return reply(hdr);
+}
+
+/*
+ * Process a MAD with the OPA base version.
+ *
+ * *out_mad_pkey_index is always set: to the index of the limited
+ * management pkey when it is found, otherwise to 1 (with a warning).
+ * Subnet management MADs that are local to this node must pass
+ * opa_local_smp_check() before being processed.  Management classes
+ * other than subnet management and performance management are not
+ * handled here and yield IB_MAD_RESULT_SUCCESS.
+ *
+ * *out_mad_size is set to the response length rounded up to a multiple
+ * of 8 when a reply is generated, and to the received byte count minus
+ * the size of a GRH when the result carries only the success flag.
+ */
+static int hfi1_process_opa_mad(struct ib_device *ibdev, int mad_flags,
+                               u8 port, const struct ib_wc *in_wc,
+                               const struct ib_grh *in_grh,
+                               const struct opa_mad *in_mad,
+                               struct opa_mad *out_mad, size_t *out_mad_size,
+                               u16 *out_mad_pkey_index)
+{
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       u32 resp_len = 0;
+       int pkey_idx;
+       int ret;
+
+       pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY);
+       if (pkey_idx < 0) {
+               pr_warn("failed to find limited mgmt pkey, defaulting 0x%x\n",
+                       hfi1_get_pkey(ibp, 1));
+               pkey_idx = 1;
+       }
+       *out_mad_pkey_index = (u16)pkey_idx;
+
+       switch (in_mad->mad_hdr.mgmt_class) {
+       case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+       case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+               /* A failed local check leaves *out_mad_size untouched. */
+               if (is_local_mad(ibp, in_mad, in_wc) &&
+                   opa_local_smp_check(ibp, in_wc))
+                       return IB_MAD_RESULT_FAILURE;
+               ret = process_subn_opa(ibdev, mad_flags, port, in_mad,
+                                      out_mad, &resp_len);
+               break;
+       case IB_MGMT_CLASS_PERF_MGMT:
+               ret = process_perf_opa(ibdev, port, in_mad, out_mad,
+                                      &resp_len);
+               break;
+       default:
+               ret = IB_MAD_RESULT_SUCCESS;
+               break;
+       }
+
+       if (ret & IB_MAD_RESULT_REPLY)
+               *out_mad_size = round_up(resp_len, 8);
+       else if (ret & IB_MAD_RESULT_SUCCESS)
+               *out_mad_size = in_wc->byte_len - sizeof(struct ib_grh);
+
+       return ret;
+}
+
+/*
+ * Process a MAD with the IB base version: subnet management classes go
+ * to process_subn(), performance management goes to process_perf(), and
+ * every other management class yields IB_MAD_RESULT_SUCCESS.
+ */
+static int hfi1_process_ib_mad(struct ib_device *ibdev, int mad_flags, u8 port,
+                              const struct ib_wc *in_wc,
+                              const struct ib_grh *in_grh,
+                              const struct ib_mad *in_mad,
+                              struct ib_mad *out_mad)
+{
+       switch (in_mad->mad_hdr.mgmt_class) {
+       case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+       case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+               return process_subn(ibdev, mad_flags, port, in_mad, out_mad);
+       case IB_MGMT_CLASS_PERF_MGMT:
+               return process_perf(ibdev, port, in_mad, out_mad);
+       default:
+               return IB_MAD_RESULT_SUCCESS;
+       }
+}
+
+/**
+ * hfi1_process_mad - process an incoming MAD packet
+ * @ibdev: the infiniband device this packet came in on
+ * @mad_flags: MAD flags
+ * @port: the port number this packet came in on
+ * @in_wc: the work completion entry for this packet
+ * @in_grh: the global route header for this packet
+ * @in_mad: the incoming MAD
+ * @in_mad_size: size of @in_mad in bytes; a MAD with the OPA base version
+ *               is rejected unless this equals sizeof(struct opa_mad)
+ * @out_mad: any outgoing MAD reply
+ * @out_mad_size: size of the outgoing MAD; only updated for MADs with the
+ *                OPA base version
+ * @out_mad_pkey_index: pkey index to use for the outgoing MAD; only set
+ *                      for MADs with the OPA base version
+ *
+ * Dispatches on the base version of the MAD header to the OPA or the IB
+ * handler.
+ *
+ * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not
+ * interested in processing.  Returns IB_MAD_RESULT_FAILURE for an
+ * unknown base version or for an OPA MAD of the wrong size.
+ *
+ * Note that the verbs framework has already done the MAD sanity checks,
+ * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+ * MADs.
+ *
+ * This is called by the ib_mad module.
+ */
+int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
+                    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+                    const struct ib_mad_hdr *in_mad, size_t in_mad_size,
+                    struct ib_mad_hdr *out_mad, size_t *out_mad_size,
+                    u16 *out_mad_pkey_index)
+{
+       switch (in_mad->base_version) {
+       case OPA_MGMT_BASE_VERSION:
+               if (unlikely(in_mad_size != sizeof(struct opa_mad))) {
+                       dev_err(ibdev->dma_device, "invalid in_mad_size\n");
+                       return IB_MAD_RESULT_FAILURE;
+               }
+               /* Keep the const qualifier, as the IB branch below does. */
+               return hfi1_process_opa_mad(ibdev, mad_flags, port,
+                                           in_wc, in_grh,
+                                           (const struct opa_mad *)in_mad,
+                                           (struct opa_mad *)out_mad,
+                                           out_mad_size,
+                                           out_mad_pkey_index);
+       case IB_MGMT_BASE_VERSION:
+               return hfi1_process_ib_mad(ibdev, mad_flags, port,
+                                         in_wc, in_grh,
+                                         (const struct ib_mad *)in_mad,
+                                         (struct ib_mad *)out_mad);
+       default:
+               break;
+       }
+
+       return IB_MAD_RESULT_FAILURE;
+}
diff --git a/drivers/infiniband/hw/hfi1/mad.h b/drivers/infiniband/hw/hfi1/mad.h
new file mode 100644 (file)
index 0000000..55ee086
--- /dev/null
@@ -0,0 +1,437 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef _HFI1_MAD_H
+#define _HFI1_MAD_H
+
+#include <rdma/ib_pma.h>
+#define USE_PI_LED_ENABLE      1 /*
+                                  * use led enabled bit in struct
+                                  * opa_port_states, if available
+                                  */
+#include <rdma/opa_smi.h>
+#include <rdma/opa_port_info.h>
+#ifndef PI_LED_ENABLE_SUP
+#define PI_LED_ENABLE_SUP 0
+#endif
+#include "opa_compat.h"
+
+/*
+ * OPA Traps
+ *
+ * Trap numbers, stored big-endian so they can be compared with or
+ * assigned to a __be16 trap number field directly.
+ */
+#define OPA_TRAP_GID_NOW_IN_SERVICE             cpu_to_be16(64)
+#define OPA_TRAP_GID_OUT_OF_SERVICE             cpu_to_be16(65)
+#define OPA_TRAP_ADD_MULTICAST_GROUP            cpu_to_be16(66)
+#define OPA_TRAP_DEL_MULTICAST_GROUP            cpu_to_be16(67)
+/* Misspelled original name, kept as an alias for existing users. */
+#define OPA_TRAL_DEL_MULTICAST_GROUP            OPA_TRAP_DEL_MULTICAST_GROUP
+#define OPA_TRAP_UNPATH                         cpu_to_be16(68)
+#define OPA_TRAP_REPATH                         cpu_to_be16(69)
+#define OPA_TRAP_PORT_CHANGE_STATE              cpu_to_be16(128)
+#define OPA_TRAP_LINK_INTEGRITY                 cpu_to_be16(129)
+#define OPA_TRAP_EXCESSIVE_BUFFER_OVERRUN       cpu_to_be16(130)
+#define OPA_TRAP_FLOW_WATCHDOG                  cpu_to_be16(131)
+#define OPA_TRAP_CHANGE_CAPABILITY              cpu_to_be16(144)
+#define OPA_TRAP_CHANGE_SYSGUID                 cpu_to_be16(145)
+#define OPA_TRAP_BAD_M_KEY                      cpu_to_be16(256)
+#define OPA_TRAP_BAD_P_KEY                      cpu_to_be16(257)
+#define OPA_TRAP_BAD_Q_KEY                      cpu_to_be16(258)
+#define OPA_TRAP_SWITCH_BAD_PKEY                cpu_to_be16(259)
+#define OPA_SMA_TRAP_DATA_LINK_WIDTH            cpu_to_be16(2048)
+
+/*
+ * Generic trap/notice other local changes flags (trap 144).
+ *
+ * Each flag is a single bit; together they occupy the low 4 bits, which
+ * matches the "low 4 bits only" note on ntc_144.change_flags in
+ * struct opa_mad_notice_attr.
+ */
+#define        OPA_NOTICE_TRAP_LWDE_CHG        0x08 /* Link Width Downgrade Enable
+                                             * changed
+                                             */
+#define OPA_NOTICE_TRAP_LSE_CHG         0x04 /* Link Speed Enable changed */
+#define OPA_NOTICE_TRAP_LWE_CHG         0x02 /* Link Width Enable changed */
+/* presumably "Node Description changed" - confirm against the OPA spec */
+#define OPA_NOTICE_TRAP_NODE_DESC_CHG   0x01
+
+/*
+ * Layout of an OPA notice (trap) attribute.
+ *
+ * A fixed set of leading fields (type, producer, trap number, toggle
+ * count, issuer LID and GID) is followed by an anonymous union holding
+ * the trap specific details.  The union members are named after the trap
+ * numbers they describe (see the OPA_TRAP_* definitions); raw_data gives
+ * byte access to the same 64 byte area.  Multi-byte fields are
+ * big-endian.  Any further class specific data follows the union and is
+ * reached through the class_data flexible array member, which adds
+ * nothing to sizeof(struct opa_mad_notice_attr).
+ */
+struct opa_mad_notice_attr {
+       u8 generic_type;
+       u8 prod_type_msb;
+       __be16 prod_type_lsb;
+       __be16 trap_num;
+       __be16 toggle_count;
+       __be32 issuer_lid;
+       __be32 reserved1;
+       union ib_gid issuer_gid;
+
+       union {
+               struct {
+                       u8      details[64];
+               } raw_data;
+
+               struct {
+                       union ib_gid    gid;
+               } __packed ntc_64_65_66_67;
+
+               struct {
+                       __be32  lid;
+               } __packed ntc_128;
+
+               struct {
+                       __be32  lid;            /* where violation happened */
+                       u8      port_num;       /* where violation happened */
+               } __packed ntc_129_130_131;
+
+               struct {
+                       __be32  lid;            /* LID where change occurred */
+                       __be32  new_cap_mask;   /* new capability mask */
+                       __be16  reserved2;
+                       __be16  cap_mask;
+                       __be16  change_flags;   /* low 4 bits only */
+               } __packed ntc_144;
+
+               struct {
+                       __be64  new_sys_guid;
+                       __be32  lid;            /* lid where sys guid changed */
+               } __packed ntc_145;
+
+               struct {
+                       __be32  lid;
+                       __be32  dr_slid;
+                       u8      method;
+                       u8      dr_trunc_hop;
+                       __be16  attr_id;
+                       __be32  attr_mod;
+                       __be64  mkey;
+                       u8      dr_rtn_path[30];
+               } __packed ntc_256;
+
+               struct {
+                       __be32          lid1;
+                       __be32          lid2;
+                       __be32          key;
+                       u8              sl;     /* SL: high 5 bits */
+                       u8              reserved3[3];
+                       union ib_gid    gid1;
+                       union ib_gid    gid2;
+                       __be32          qp1;    /* high 8 bits reserved */
+                       __be32          qp2;    /* high 8 bits reserved */
+               } __packed ntc_257_258;
+
+               struct {
+                       __be16          flags;  /* low 8 bits reserved */
+                       __be16          pkey;
+                       __be32          lid1;
+                       __be32          lid2;
+                       u8              sl;     /* SL: high 5 bits */
+                       u8              reserved4[3];
+                       union ib_gid    gid1;
+                       union ib_gid    gid2;
+                       __be32          qp1;    /* high 8 bits reserved */
+                       __be32          qp2;    /* high 8 bits reserved */
+               } __packed ntc_259;
+
+               struct {
+                       __be32  lid;
+               } __packed ntc_2048;
+
+       };
+       u8      class_data[];   /* C99 flexible array member */
+};
+
+/*
+ * IB VL arbitration table block selectors.
+ * NOTE(review): presumably the block numbers of the low/high priority
+ * tables, split into entries 0-31 and 32-63 - confirm.
+ */
+#define IB_VLARB_LOWPRI_0_31    1
+#define IB_VLARB_LOWPRI_32_63   2
+#define IB_VLARB_HIGHPRI_0_31   3
+#define IB_VLARB_HIGHPRI_32_63  4
+
+/*
+ * OPA VL arbitration section selectors and preemption capability.
+ * NOTE(review): the same five definitions are repeated verbatim further
+ * down in this header.
+ */
+#define OPA_MAX_PREEMPT_CAP         32
+#define OPA_VLARB_LOW_ELEMENTS       0
+#define OPA_VLARB_HIGH_ELEMENTS      1
+#define OPA_VLARB_PREEMPT_ELEMENTS   2
+#define OPA_VLARB_PREEMPT_MATRIX     3
+
+/* Big-endian 16 bit ID; presumably a PMA attribute ID - confirm */
+#define IB_PMA_PORT_COUNTERS_CONG       cpu_to_be16(0xFF00)
+
+/*
+ * Packed port counter block with big-endian multi-byte fields: 8 and
+ * 16 bit error counters followed by 64 bit data, packet, wait and
+ * "adr events" counters.
+ * NOTE(review): presumably the payload of the IB_PMA_PORT_COUNTERS_CONG
+ * attribute - confirm.
+ */
+struct ib_pma_portcounters_cong {
+       u8 reserved;
+       u8 reserved1;
+       __be16 port_check_rate;
+       __be16 symbol_error_counter;
+       u8 link_error_recovery_counter;
+       u8 link_downed_counter;
+       __be16 port_rcv_errors;
+       __be16 port_rcv_remphys_errors;
+       __be16 port_rcv_switch_relay_errors;
+       __be16 port_xmit_discards;
+       u8 port_xmit_constraint_errors;
+       u8 port_rcv_constraint_errors;
+       u8 reserved2;
+       u8 link_overrun_errors; /* LocalLink: 7:4, BufferOverrun: 3:0 */
+       __be16 reserved3;
+       __be16 vl15_dropped;
+       __be64 port_xmit_data;
+       __be64 port_rcv_data;
+       __be64 port_xmit_packets;
+       __be64 port_rcv_packets;
+       __be64 port_xmit_wait;
+       __be64 port_adr_events;
+} __packed;
+
+/*
+ * MAD status values, stored big-endian so they can be OR-ed straight
+ * into the status field of a MAD header.
+ */
+#define IB_SMP_UNSUP_VERSION    cpu_to_be16(0x0004)
+#define IB_SMP_UNSUP_METHOD     cpu_to_be16(0x0008)
+#define IB_SMP_UNSUP_METH_ATTR  cpu_to_be16(0x000C)
+#define IB_SMP_INVALID_FIELD    cpu_to_be16(0x001C)
+
+/*
+ * OPA_MAX_PREEMPT_CAP and the OPA_VLARB_* section selectors are defined
+ * once, earlier in this header; the verbatim copy that used to sit here
+ * has been dropped.
+ */
+
+#define HFI1_XMIT_RATE_UNSUPPORTED               0x0
+#define HFI1_XMIT_RATE_PICO                      0x7
+/* number of 4nsec cycles equaling 2secs */
+#define HFI1_CONG_TIMER_PSINTERVAL               0x1DCD64EC
+
+/* Congestion control service type codes (RC, UC, RD, UD) */
+#define IB_CC_SVCTYPE_RC 0x0
+#define IB_CC_SVCTYPE_UC 0x1
+#define IB_CC_SVCTYPE_RD 0x2
+#define IB_CC_SVCTYPE_UD 0x3
+
+/*
+ * There should be an equivalent IB #define for the following, but
+ * I cannot find it.
+ */
+#define OPA_CC_LOG_TYPE_HFI    2
+
+/*
+ * Congestion log event kept in host byte order (plain u32/u8/s64
+ * fields, not packed).
+ * NOTE(review): presumably the driver-internal form of
+ * struct opa_hfi1_cong_log_event - confirm.
+ */
+struct opa_hfi1_cong_log_event_internal {
+       u32 lqpn;
+       u32 rqpn;
+       u8 sl;
+       u8 svc_type;
+       u32 rlid;
+       s64 timestamp; /* wider than 32 bits to detect 32 bit rollover */
+};
+
+/*
+ * Packed congestion log event: QP numbers are stored as 3 byte arrays,
+ * SL and service type share one byte, and the remote LID and timestamp
+ * are big-endian 32 bit values.
+ */
+struct opa_hfi1_cong_log_event {
+       u8 local_qp_cn_entry[3];
+       u8 remote_qp_number_cn_entry[3];
+       u8 sl_svc_type_cn_entry; /* 5 bits SL, 3 bits svc type */
+       u8 reserved;
+       __be32 remote_lid_cn_entry;
+       __be32 timestamp_cn_entry;
+} __packed;
+
+/* Number of event slots in struct opa_hfi1_cong_log */
+#define OPA_CONG_LOG_ELEMS     96
+
+/*
+ * Packed congestion log: a small header (log type, flags, big-endian
+ * threshold event counter and time stamp), a bitmap with one bit per SL
+ * (OPA_MAX_SLS / 8 bytes), and OPA_CONG_LOG_ELEMS event entries.
+ */
+struct opa_hfi1_cong_log {
+       u8 log_type;
+       u8 congestion_flags;
+       __be16 threshold_event_counter;
+       __be32 current_time_stamp;
+       u8 threshold_cong_event_map[OPA_MAX_SLS / 8];
+       struct opa_hfi1_cong_log_event events[OPA_CONG_LOG_ELEMS];
+} __packed;
+
+/*
+ * Multiplied by IB_CCT_ENTRIES to size the shadow CC table
+ * (see CC_TABLE_SHADOW_MAX).
+ */
+#define IB_CC_TABLE_CAP_DEFAULT 31
+
+/* Port control flags */
+#define IB_CC_CCS_PC_SL_BASED 0x01
+
+/* Per-SL congestion setting entry; ccti_timer is big-endian. */
+struct opa_congestion_setting_entry {
+       u8 ccti_increase;
+       u8 reserved;
+       __be16 ccti_timer;
+       u8 trigger_threshold;
+       u8 ccti_min; /* min CCTI for cc table */
+} __packed;
+
+/*
+ * Same fields as struct opa_congestion_setting_entry, but with
+ * ccti_timer held as a plain u16 rather than a __be16.
+ */
+struct opa_congestion_setting_entry_shadow {
+       u8 ccti_increase;
+       u8 reserved;
+       u16 ccti_timer;
+       u8 trigger_threshold;
+       u8 ccti_min; /* min CCTI for cc table */
+} __packed;
+
+/*
+ * Congestion setting attribute: big-endian control map and port control
+ * word followed by one entry per SL (OPA_MAX_SLS entries).
+ */
+struct opa_congestion_setting_attr {
+       __be32 control_map;
+       __be16 port_control;
+       struct opa_congestion_setting_entry entries[OPA_MAX_SLS];
+} __packed;
+
+/*
+ * Same fields as struct opa_congestion_setting_attr, but with plain
+ * u32/u16 members and the shadow entry type.
+ */
+struct opa_congestion_setting_attr_shadow {
+       u32 control_map;
+       u16 port_control;
+       struct opa_congestion_setting_entry_shadow entries[OPA_MAX_SLS];
+} __packed;
+
+/* Default values; NOTE(review): where they are applied is not visible here */
+#define IB_CC_TABLE_ENTRY_INCREASE_DEFAULT 1
+#define IB_CC_TABLE_ENTRY_TIMER_DEFAULT 1
+
+/* 64 Congestion Control table entries in a single MAD */
+#define IB_CCT_ENTRIES 64
+/* Twice IB_CCT_ENTRIES, i.e. 128 */
+#define IB_CCT_MIN_ENTRIES (IB_CCT_ENTRIES * 2)
+
+/* One congestion control table entry, big-endian */
+struct ib_cc_table_entry {
+       __be16 entry; /* shift:2, multiplier:14 */
+};
+
+/* Same entry held as a plain u16 rather than a __be16 */
+struct ib_cc_table_entry_shadow {
+       u16 entry; /* shift:2, multiplier:14 */
+};
+
+/* CC table attribute: big-endian limit plus IB_CCT_ENTRIES entries */
+struct ib_cc_table_attr {
+       __be16 ccti_limit; /* max CCTI for cc table */
+       struct ib_cc_table_entry ccti_entries[IB_CCT_ENTRIES];
+} __packed;
+
+/* Same fields as struct ib_cc_table_attr, using the plain u16 types */
+struct ib_cc_table_attr_shadow {
+       u16 ccti_limit; /* max CCTI for cc table */
+       struct ib_cc_table_entry_shadow ccti_entries[IB_CCT_ENTRIES];
+} __packed;
+
+/* Entry capacity of struct cc_table_shadow: 31 * 64 entries. */
+#define CC_TABLE_SHADOW_MAX \
+       (IB_CC_TABLE_CAP_DEFAULT * IB_CCT_ENTRIES)
+
+/*
+ * Congestion control table with room for CC_TABLE_SHADOW_MAX entries,
+ * i.e. IB_CC_TABLE_CAP_DEFAULT times as many as fit in one
+ * struct ib_cc_table_attr.
+ */
+struct cc_table_shadow {
+       u16 ccti_limit; /* max CCTI for cc table */
+       struct ib_cc_table_entry_shadow entries[CC_TABLE_SHADOW_MAX];
+} __packed;
+
+/*
+ * struct cc_state combines the (active) per-port congestion control
+ * table, and the (active) per-SL congestion settings. cc_state data
+ * may need to be read in code paths that we want to be fast, so it
+ * is an RCU protected structure.
+ */
+struct cc_state {
+       struct rcu_head rcu;    /* RCU bookkeeping for this structure */
+       struct cc_table_shadow cct;     /* per-port congestion control table */
+       /* per-SL congestion settings */
+       struct opa_congestion_setting_attr_shadow cong_setting;
+};
+
+/*
+ * OPA BufferControl MAD
+ */
+
+/*
+ * attribute modifier macros
+ *
+ * Each field FOO of the attribute modifier gets four macros:
+ *   OPA_AM_FOO_SHIFT - bit position of the field's least significant bit
+ *   OPA_AM_FOO_MASK  - field mask, right-justified
+ *   OPA_AM_FOO_SMASK - field mask shifted into position
+ *   OPA_AM_FOO(am)   - extract the field's value from modifier 'am'
+ *
+ * NPORT and NBLK occupy the same bits (shift 24, 8 bits wide), as do
+ * START_BLK and PORTNUM (shift 0, 8 bits wide).
+ */
+#define OPA_AM_NPORT_SHIFT     24
+#define OPA_AM_NPORT_MASK      0xff
+#define OPA_AM_NPORT_SMASK     (OPA_AM_NPORT_MASK << OPA_AM_NPORT_SHIFT)
+#define OPA_AM_NPORT(am)       (((am) >> OPA_AM_NPORT_SHIFT) & \
+                                       OPA_AM_NPORT_MASK)
+
+#define OPA_AM_NBLK_SHIFT      24
+#define OPA_AM_NBLK_MASK       0xff
+#define OPA_AM_NBLK_SMASK      (OPA_AM_NBLK_MASK << OPA_AM_NBLK_SHIFT)
+#define OPA_AM_NBLK(am)                (((am) >> OPA_AM_NBLK_SHIFT) & \
+                                       OPA_AM_NBLK_MASK)
+
+#define OPA_AM_START_BLK_SHIFT 0
+#define OPA_AM_START_BLK_MASK  0xff
+#define OPA_AM_START_BLK_SMASK (OPA_AM_START_BLK_MASK << \
+                                       OPA_AM_START_BLK_SHIFT)
+#define OPA_AM_START_BLK(am)   (((am) >> OPA_AM_START_BLK_SHIFT) & \
+                                       OPA_AM_START_BLK_MASK)
+
+#define OPA_AM_PORTNUM_SHIFT   0
+#define OPA_AM_PORTNUM_MASK    0xff
+#define OPA_AM_PORTNUM_SMASK   (OPA_AM_PORTNUM_MASK << OPA_AM_PORTNUM_SHIFT)
+#define OPA_AM_PORTNUM(am)     (((am) >> OPA_AM_PORTNUM_SHIFT) & \
+                                       OPA_AM_PORTNUM_MASK)
+
+/* single-bit flag at bit 12 */
+#define OPA_AM_ASYNC_SHIFT     12
+#define OPA_AM_ASYNC_MASK      0x1
+#define OPA_AM_ASYNC_SMASK     (OPA_AM_ASYNC_MASK << OPA_AM_ASYNC_SHIFT)
+#define OPA_AM_ASYNC(am)       (((am) >> OPA_AM_ASYNC_SHIFT) & \
+                                       OPA_AM_ASYNC_MASK)
+
+/* single-bit flag at bit 9 */
+#define OPA_AM_START_SM_CFG_SHIFT      9
+#define OPA_AM_START_SM_CFG_MASK       0x1
+#define OPA_AM_START_SM_CFG_SMASK      (OPA_AM_START_SM_CFG_MASK << \
+                                               OPA_AM_START_SM_CFG_SHIFT)
+#define OPA_AM_START_SM_CFG(am)                (((am) >> OPA_AM_START_SM_CFG_SHIFT) \
+                                               & OPA_AM_START_SM_CFG_MASK)
+
+/*
+ * CI address field of the attribute modifier: 12 bits starting at bit 19.
+ * The _SMASK form must be built from OPA_AM_CI_ADDR_SHIFT; there is no
+ * OPA_CI_ADDR_SHIFT.
+ */
+#define OPA_AM_CI_ADDR_SHIFT   19
+#define OPA_AM_CI_ADDR_MASK    0xfff
+#define OPA_AM_CI_ADDR_SMASK   (OPA_AM_CI_ADDR_MASK << OPA_AM_CI_ADDR_SHIFT)
+#define OPA_AM_CI_ADDR(am)     (((am) >> OPA_AM_CI_ADDR_SHIFT) & \
+                                       OPA_AM_CI_ADDR_MASK)
+
+/*
+ * CI length field of the attribute modifier: 6 bits starting at bit 13.
+ * The _SMASK form must be built from OPA_AM_CI_LEN_SHIFT; there is no
+ * OPA_CI_LEN_SHIFT.
+ */
+#define OPA_AM_CI_LEN_SHIFT    13
+#define OPA_AM_CI_LEN_MASK     0x3f
+#define OPA_AM_CI_LEN_SMASK    (OPA_AM_CI_LEN_MASK << OPA_AM_CI_LEN_SHIFT)
+#define OPA_AM_CI_LEN(am)      (((am) >> OPA_AM_CI_LEN_SHIFT) & \
+                                       OPA_AM_CI_LEN_MASK)
+
+/* error info macros: status is bit 7, code is the low four bits */
+#define OPA_EI_STATUS_SMASK    0x80
+#define OPA_EI_CODE_SMASK      0x0f
+
+/* Pair of big-endian 16-bit limits for one VL. */
+struct vl_limit {
+       __be16 dedicated;
+       __be16 shared;
+};
+
+/*
+ * Buffer control table: an overall shared limit plus one struct vl_limit
+ * for each of the OPA_MAX_VLS VLs.  All fields are big-endian.
+ */
+struct buffer_control {
+       __be16 reserved;
+       __be16 overall_shared_limit;
+       struct vl_limit vl[OPA_MAX_VLS];
+};
+
+/* 32-entry table of one-byte VL values. */
+struct sc2vlnt {
+       u8 vlnt[32]; /* 5 bit VL, 3 bits reserved */
+};
+
+/*
+ * The PortSamplesControl.CounterMasks field is an array of 3 bit fields
+ * which specify the N'th counter's capabilities. See ch. 16.1.3.2.
+ * We support 5 counters which only count the mandatory quantities.
+ *
+ * COUNTER_MASK(q, n) places the 3-bit value q in slot n, where slot 0 is
+ * the most significant of the ten slots.  Both arguments are parenthesized
+ * so that compound expressions (e.g. "a | b" or "i + 1") expand correctly.
+ */
+#define COUNTER_MASK(q, n) ((q) << ((9 - (n)) * 3))
+#define COUNTER_MASK0_9 \
+       cpu_to_be32(COUNTER_MASK(1, 0) | \
+                   COUNTER_MASK(1, 1) | \
+                   COUNTER_MASK(1, 2) | \
+                   COUNTER_MASK(1, 3) | \
+                   COUNTER_MASK(1, 4))
+
+#endif                         /* _HFI1_MAD_H */
diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
new file mode 100644 (file)
index 0000000..b7a80aa
--- /dev/null
@@ -0,0 +1,325 @@
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/mmu_notifier.h>
+#include <linux/interval_tree_generic.h>
+
+#include "mmu_rb.h"
+#include "trace.h"
+
+/*
+ * Per-tree bookkeeping: ties one caller-owned RB root to its MMU notifier
+ * and to the callbacks supplied at registration time.
+ */
+struct mmu_rb_handler {
+       struct list_head list;  /* entry on mmu_rb_handlers */
+       struct mmu_notifier mn; /* registered on current->mm at register time */
+       struct rb_root *root;   /* lookup key in find_mmu_handler() */
+       spinlock_t lock;        /* protect the RB tree */
+       struct mmu_rb_ops *ops; /* owner callbacks; ->invalidate is required */
+};
+
+/* All registered handlers; walked under RCU by find_mmu_handler(). */
+static LIST_HEAD(mmu_rb_handlers);
+static DEFINE_SPINLOCK(mmu_rb_lock); /* protect mmu_rb_handlers list */
+
+/* Forward declarations for helpers defined further down in this file. */
+static unsigned long mmu_node_start(struct mmu_rb_node *);
+static unsigned long mmu_node_last(struct mmu_rb_node *);
+static struct mmu_rb_handler *find_mmu_handler(struct rb_root *);
+static inline void mmu_notifier_page(struct mmu_notifier *, struct mm_struct *,
+                                    unsigned long);
+static inline void mmu_notifier_range_start(struct mmu_notifier *,
+                                           struct mm_struct *,
+                                           unsigned long, unsigned long);
+static void mmu_notifier_mem_invalidate(struct mmu_notifier *,
+                                       struct mm_struct *,
+                                       unsigned long, unsigned long);
+static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *,
+                                          unsigned long, unsigned long);
+
+/* Both notifier hooks funnel into mmu_notifier_mem_invalidate(). */
+static struct mmu_notifier_ops mn_opts = {
+       .invalidate_page = mmu_notifier_page,
+       .invalidate_range_start = mmu_notifier_range_start,
+};
+
+/*
+ * Generate the static __mmu_int_rb_* interval tree helpers used below
+ * (insert, remove, iter_first, iter_next).  Intervals are keyed by
+ * mmu_node_start()/mmu_node_last(); __last is the field the generated code
+ * uses for its per-subtree maximum.
+ */
+INTERVAL_TREE_DEFINE(struct mmu_rb_node, node, unsigned long, __last,
+                    mmu_node_start, mmu_node_last, static, __mmu_int_rb);
+
+/* Interval start key for @node: its address rounded down to a page boundary. */
+static unsigned long mmu_node_start(struct mmu_rb_node *node)
+{
+       unsigned long page_base = node->addr & PAGE_MASK;
+
+       return page_base;
+}
+
+/*
+ * Interval end key for @node: the last byte of the page that contains the
+ * end of the buffer (addr + len rounded up to a page, minus one).
+ */
+static unsigned long mmu_node_last(struct mmu_rb_node *node)
+{
+       unsigned long aligned_end = PAGE_ALIGN(node->addr + node->len);
+
+       return aligned_end - 1;
+}
+
+/*
+ * hfi1_mmu_rb_register - start tracking @root and hook it to MMU notifications
+ * @root: RB tree root the caller will insert nodes into
+ * @ops: owner callbacks; ->invalidate is mandatory
+ *
+ * Allocates a handler, publishes it on mmu_rb_handlers and registers an MMU
+ * notifier on current->mm.
+ *
+ * Return: 0 on success, -EINVAL if @ops has no ->invalidate, -ENOMEM if the
+ * handler cannot be allocated, or the error from mmu_notifier_register().
+ * On failure the handler is neither left on the list nor left allocated.
+ */
+int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops)
+{
+       struct mmu_rb_handler *handlr;
+       int ret;
+
+       if (!ops->invalidate)
+               return -EINVAL;
+
+       handlr = kmalloc(sizeof(*handlr), GFP_KERNEL);
+       if (!handlr)
+               return -ENOMEM;
+
+       handlr->root = root;
+       handlr->ops = ops;
+       INIT_HLIST_NODE(&handlr->mn.hlist);
+       spin_lock_init(&handlr->lock);
+       handlr->mn.ops = &mn_opts;
+       spin_lock(&mmu_rb_lock);
+       list_add_tail_rcu(&handlr->list, &mmu_rb_handlers);
+       spin_unlock(&mmu_rb_lock);
+
+       ret = mmu_notifier_register(&handlr->mn, current->mm);
+       if (ret) {
+               /*
+                * Registration failed: unpublish the handler and wait for
+                * any RCU readers that may have found it before freeing,
+                * otherwise it would leak and stay reachable by @root.
+                */
+               spin_lock(&mmu_rb_lock);
+               list_del_rcu(&handlr->list);
+               spin_unlock(&mmu_rb_lock);
+               synchronize_rcu();
+               kfree(handlr);
+       }
+
+       return ret;
+}
+
+/*
+ * hfi1_mmu_rb_unregister - stop tracking @root and release its handler
+ * @root: RB tree root previously passed to hfi1_mmu_rb_register()
+ *
+ * Does nothing if no handler is registered for @root.  Otherwise the MMU
+ * notifier is unregistered (only when current->mm is set), the handler is
+ * removed from mmu_rb_handlers, every node still in the tree is erased and
+ * handed to ops->remove() with a NULL mm, and the handler is freed.
+ */
+void hfi1_mmu_rb_unregister(struct rb_root *root)
+{
+       struct mmu_rb_handler *handler = find_mmu_handler(root);
+       unsigned long flags;
+
+       if (!handler)
+               return;
+
+       /* Unregister first so we don't get any more notifications. */
+       if (current->mm)
+               mmu_notifier_unregister(&handler->mn, current->mm);
+
+       spin_lock(&mmu_rb_lock);
+       list_del_rcu(&handler->list);
+       spin_unlock(&mmu_rb_lock);
+       /* Wait out RCU readers that may still be walking the handler list. */
+       synchronize_rcu();
+
+       spin_lock_irqsave(&handler->lock, flags);
+       if (!RB_EMPTY_ROOT(root)) {
+               struct rb_node *node;
+               struct mmu_rb_node *rbnode;
+
+               /* Drain the tree; ->remove() runs with the handler lock held. */
+               while ((node = rb_first(root))) {
+                       rbnode = rb_entry(node, struct mmu_rb_node, node);
+                       rb_erase(node, root);
+                       if (handler->ops->remove)
+                               handler->ops->remove(root, rbnode, NULL);
+               }
+       }
+       spin_unlock_irqrestore(&handler->lock, flags);
+
+       kfree(handler);
+}
+
+/*
+ * hfi1_mmu_rb_insert - add @mnode to the tree rooted at @root
+ * @root: RB tree root registered with hfi1_mmu_rb_register()
+ * @mnode: node to insert; addr and len must already be filled in
+ *
+ * Return: 0 on success; -EINVAL if @root has no handler or if
+ * __mmu_rb_search() already finds a matching node; otherwise the non-zero
+ * value returned by ops->insert(), in which case the node is taken back out
+ * of the tree.
+ */
+int hfi1_mmu_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode)
+{
+       struct mmu_rb_handler *handler = find_mmu_handler(root);
+       struct mmu_rb_node *node;
+       unsigned long flags;
+       int ret = 0;
+
+       if (!handler)
+               return -EINVAL;
+
+       spin_lock_irqsave(&handler->lock, flags);
+       /* addr and len are unsigned long, so print them with %lx / %lu */
+       hfi1_cdbg(MMU, "Inserting node addr 0x%lx, len %lu", mnode->addr,
+                 mnode->len);
+       node = __mmu_rb_search(handler, mnode->addr, mnode->len);
+       if (node) {
+               ret = -EINVAL;
+               goto unlock;
+       }
+       __mmu_int_rb_insert(mnode, root);
+
+       if (handler->ops->insert) {
+               ret = handler->ops->insert(root, mnode);
+               /* the owner rejected the node: undo the insertion */
+               if (ret)
+                       __mmu_int_rb_remove(mnode, root);
+       }
+unlock:
+       spin_unlock_irqrestore(&handler->lock, flags);
+       return ret;
+}
+
+/*
+ * Find a node overlapping [@addr, @addr + @len - 1].
+ *
+ * Without an ops->filter callback the first overlapping node is returned.
+ * With one, overlapping nodes are visited in turn and the first for which
+ * filter() returns true is returned.  Returns NULL when nothing matches.
+ *
+ * Caller must hold handler lock
+ */
+static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler,
+                                          unsigned long addr,
+                                          unsigned long len)
+{
+       unsigned long last = (addr + len) - 1;
+       struct mmu_rb_node *node;
+
+       /* addr and len are unsigned long, so print them with %lx / %lu */
+       hfi1_cdbg(MMU, "Searching for addr 0x%lx, len %lu", addr, len);
+       node = __mmu_int_rb_iter_first(handler->root, addr, last);
+       if (!handler->ops->filter)
+               return node;
+
+       for (; node; node = __mmu_int_rb_iter_next(node, addr, last)) {
+               if (handler->ops->filter(node, addr, len))
+                       return node;
+       }
+       return NULL;
+}
+
+/*
+ * Unlink @node from the handler's tree, then notify the owner through
+ * ops->remove() (if set).  The unlink happens under the handler lock; the
+ * callback runs after the lock has been dropped.
+ *
+ * Caller must *not* hold handler lock.
+ */
+static void __mmu_rb_remove(struct mmu_rb_handler *handler,
+                           struct mmu_rb_node *node, struct mm_struct *mm)
+{
+       unsigned long flags;
+
+       /* Validity of handler and node pointers has been checked by caller. */
+       /* addr and len are unsigned long, so print them with %lx / %lu */
+       hfi1_cdbg(MMU, "Removing node addr 0x%lx, len %lu", node->addr,
+                 node->len);
+       spin_lock_irqsave(&handler->lock, flags);
+       __mmu_int_rb_remove(node, handler->root);
+       spin_unlock_irqrestore(&handler->lock, flags);
+
+       if (handler->ops->remove)
+               handler->ops->remove(handler->root, node, mm);
+}
+
+/*
+ * Look up a node covering @addr/@len in the tree rooted at @root.
+ *
+ * Returns ERR_PTR(-EINVAL) when @root has no registered handler, otherwise
+ * whatever __mmu_rb_search() finds (NULL if nothing matches).  The node is
+ * left in the tree.
+ */
+struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *root, unsigned long addr,
+                                      unsigned long len)
+{
+       struct mmu_rb_handler *handler;
+       struct mmu_rb_node *found;
+       unsigned long irqflags;
+
+       handler = find_mmu_handler(root);
+       if (!handler)
+               return ERR_PTR(-EINVAL);
+
+       spin_lock_irqsave(&handler->lock, irqflags);
+       found = __mmu_rb_search(handler, addr, len);
+       spin_unlock_irqrestore(&handler->lock, irqflags);
+
+       return found;
+}
+
+/*
+ * Search for a node covering @addr/@len and, if one is found, unlink it
+ * from the tree in the same locked section.  ops->remove() is not called.
+ *
+ * Returns ERR_PTR(-EINVAL) when @root has no registered handler, the
+ * unlinked node on a hit, or NULL on a miss.
+ */
+struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *root,
+                                       unsigned long addr, unsigned long len)
+{
+       struct mmu_rb_handler *handler;
+       struct mmu_rb_node *found;
+       unsigned long irqflags;
+
+       handler = find_mmu_handler(root);
+       if (!handler)
+               return ERR_PTR(-EINVAL);
+
+       spin_lock_irqsave(&handler->lock, irqflags);
+       found = __mmu_rb_search(handler, addr, len);
+       if (found)
+               __mmu_int_rb_remove(found, handler->root);
+       spin_unlock_irqrestore(&handler->lock, irqflags);
+
+       return found;
+}
+
+/*
+ * Remove @node from the tree rooted at @root and notify the owner via
+ * ops->remove() with a NULL mm.  Silently ignored when @root has no
+ * handler or @node is NULL.
+ */
+void hfi1_mmu_rb_remove(struct rb_root *root, struct mmu_rb_node *node)
+{
+       struct mmu_rb_handler *handler = find_mmu_handler(root);
+
+       if (handler && node)
+               __mmu_rb_remove(handler, node, NULL);
+}
+
+/*
+ * Walk mmu_rb_handlers under rcu_read_lock() and return the handler whose
+ * root pointer equals @root, or NULL if there is none.
+ */
+static struct mmu_rb_handler *find_mmu_handler(struct rb_root *root)
+{
+       struct mmu_rb_handler *cur;
+       struct mmu_rb_handler *match = NULL;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(cur, &mmu_rb_handlers, list) {
+               if (cur->root == root) {
+                       match = cur;
+                       break;
+               }
+       }
+       rcu_read_unlock();
+
+       return match;
+}
+
+/* .invalidate_page hook: invalidate the single page starting at @addr. */
+static inline void mmu_notifier_page(struct mmu_notifier *mn,
+                                    struct mm_struct *mm, unsigned long addr)
+{
+       unsigned long page_end = addr + PAGE_SIZE;
+
+       mmu_notifier_mem_invalidate(mn, mm, addr, page_end);
+}
+
+/*
+ * .invalidate_range_start hook: forward [@start, @end) unchanged to the
+ * common invalidation routine.
+ */
+static inline void
+mmu_notifier_range_start(struct mmu_notifier *mn, struct mm_struct *mm,
+                        unsigned long start, unsigned long end)
+{
+       mmu_notifier_mem_invalidate(mn, mm, start, end);
+}
+
+/*
+ * Common body of both MMU notifier hooks.
+ *
+ * Visits every tracked node overlapping [@start, @end - 1] and calls
+ * ops->invalidate() on it.  When that returns non-zero the node is unlinked
+ * from the tree and passed to ops->remove() (if set) together with @mm.
+ * The whole walk, including the callbacks, runs under the handler lock.
+ */
+static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn,
+                                       struct mm_struct *mm,
+                                       unsigned long start, unsigned long end)
+{
+       struct mmu_rb_handler *handler =
+               container_of(mn, struct mmu_rb_handler, mn);
+       struct rb_root *root = handler->root;
+       struct mmu_rb_node *node, *ptr = NULL;
+       unsigned long flags;
+
+       spin_lock_irqsave(&handler->lock, flags);
+       for (node = __mmu_int_rb_iter_first(root, start, end - 1);
+            node; node = ptr) {
+               /* Guard against node removal. */
+               ptr = __mmu_int_rb_iter_next(node, start, end - 1);
+               /* addr and len are unsigned long: use %lx / %lu */
+               hfi1_cdbg(MMU, "Invalidating node addr 0x%lx, len %lu",
+                         node->addr, node->len);
+               if (handler->ops->invalidate(root, node)) {
+                       __mmu_int_rb_remove(node, root);
+                       if (handler->ops->remove)
+                               handler->ops->remove(root, node, mm);
+               }
+       }
+       spin_unlock_irqrestore(&handler->lock, flags);
+}
diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.h b/drivers/infiniband/hw/hfi1/mmu_rb.h
new file mode 100644 (file)
index 0000000..7a57b9c
--- /dev/null
@@ -0,0 +1,76 @@
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef _HFI1_MMU_RB_H
+#define _HFI1_MMU_RB_H
+
+#include "hfi.h"
+
+/*
+ * One tracked buffer.  The caller fills in addr and len before inserting;
+ * the tree keys the node on the page-aligned range derived from them.
+ */
+struct mmu_rb_node {
+       unsigned long addr;     /* start address of the buffer */
+       unsigned long len;      /* length of the buffer */
+       unsigned long __last;   /* interval tree bookkeeping; not for callers */
+       struct rb_node node;    /* linkage into the owner's rb_root */
+};
+
+/*
+ * Owner callbacks supplied to hfi1_mmu_rb_register().  Only ->invalidate
+ * is mandatory; the others may be NULL.
+ */
+struct mmu_rb_ops {
+       /* search: return true to accept an overlapping node for (addr, len) */
+       bool (*filter)(struct mmu_rb_node *, unsigned long, unsigned long);
+       /* called after a node is linked in; non-zero return undoes the insert */
+       int (*insert)(struct rb_root *, struct mmu_rb_node *);
+       /* called after a node is unlinked; the mm argument may be NULL */
+       void (*remove)(struct rb_root *, struct mmu_rb_node *,
+                      struct mm_struct *);
+       /* MMU notification hit this node; non-zero return means remove it */
+       int (*invalidate)(struct rb_root *, struct mmu_rb_node *);
+};
+
+/* Register/unregister an RB root for MMU-notifier driven invalidation. */
+int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops);
+void hfi1_mmu_rb_unregister(struct rb_root *);
+/* Add/remove a node; both return or do nothing if the root is unregistered. */
+int hfi1_mmu_rb_insert(struct rb_root *, struct mmu_rb_node *);
+void hfi1_mmu_rb_remove(struct rb_root *, struct mmu_rb_node *);
+/* Look up by (addr, len); ERR_PTR(-EINVAL) if the root is unregistered. */
+struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *, unsigned long,
+                                      unsigned long);
+/* Like search, but also unlinks the node that was found. */
+struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *, unsigned long,
+                                       unsigned long);
+
+#endif /* _HFI1_MMU_RB_H */
diff --git a/drivers/infiniband/hw/hfi1/opa_compat.h b/drivers/infiniband/hw/hfi1/opa_compat.h
new file mode 100644 (file)
index 0000000..6ef3c1c
--- /dev/null
@@ -0,0 +1,111 @@
+#ifndef _LINUX_H
+#define _LINUX_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * This header file is for OPA-specific definitions which are
+ * required by the HFI driver, and which aren't yet in the Linux
+ * IB core. We'll collect these all here, then merge them into
+ * the kernel when that's convenient.
+ */
+
+/* OPA SMA attribute IDs (already converted to big-endian) */
+#define OPA_ATTRIB_ID_CONGESTION_INFO          cpu_to_be16(0x008b)
+#define OPA_ATTRIB_ID_HFI_CONGESTION_LOG       cpu_to_be16(0x008f)
+#define OPA_ATTRIB_ID_HFI_CONGESTION_SETTING   cpu_to_be16(0x0090)
+#define OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE cpu_to_be16(0x0091)
+
+/* OPA PMA attribute IDs (already converted to big-endian) */
+#define OPA_PM_ATTRIB_ID_PORT_STATUS           cpu_to_be16(0x0040)
+#define OPA_PM_ATTRIB_ID_CLEAR_PORT_STATUS     cpu_to_be16(0x0041)
+#define OPA_PM_ATTRIB_ID_DATA_PORT_COUNTERS    cpu_to_be16(0x0042)
+#define OPA_PM_ATTRIB_ID_ERROR_PORT_COUNTERS   cpu_to_be16(0x0043)
+#define OPA_PM_ATTRIB_ID_ERROR_INFO            cpu_to_be16(0x0044)
+
+/* OPA status codes (already converted to big-endian) */
+#define OPA_PM_STATUS_REQUEST_TOO_LARGE                cpu_to_be16(0x100)
+
+/* Extract the logical port state bits from the combined state field. */
+static inline u8 port_states_to_logical_state(struct opa_port_states *ps)
+{
+       unsigned int combined = ps->portphysstate_portstate;
+
+       return combined & OPA_PI_MASK_PORT_STATE;
+}
+
+/*
+ * Extract the physical port state from the combined state field: mask the
+ * physical-state bits, shift them down by four and keep the low nibble.
+ */
+static inline u8 port_states_to_phys_state(struct opa_port_states *ps)
+{
+       unsigned int combined = ps->portphysstate_portstate;
+       unsigned int phys = combined & OPA_PI_MASK_PORT_PHYSICAL_STATE;
+
+       return (phys >> 4) & 0xf;
+}
+
+/*
+ * OPA port physical states
+ * IB Volume 1, Table 146 PortInfo/IB Volume 2 Section 5.4.2(1) PortPhysState
+ * values.
+ *
+ * When writing, only values 0-3 are valid, other values are ignored.
+ * When reading, 0 is reserved.
+ *
+ * Returned by the ibphys_portstate() routine.
+ *
+ * Values 0-7 carry the IB_ prefix, values 9-11 the OPA_ prefix.
+ */
+enum opa_port_phys_state {
+       IB_PORTPHYSSTATE_NOP = 0,
+       /* 1 is reserved */
+       IB_PORTPHYSSTATE_POLLING = 2,
+       IB_PORTPHYSSTATE_DISABLED = 3,
+       IB_PORTPHYSSTATE_TRAINING = 4,
+       IB_PORTPHYSSTATE_LINKUP = 5,
+       IB_PORTPHYSSTATE_LINK_ERROR_RECOVERY = 6,
+       IB_PORTPHYSSTATE_PHY_TEST = 7,
+       /* 8 is reserved */
+       OPA_PORTPHYSSTATE_OFFLINE = 9,
+       OPA_PORTPHYSSTATE_GANGED = 10,
+       OPA_PORTPHYSSTATE_TEST = 11,
+       /* highest defined value; same as OPA_PORTPHYSSTATE_TEST */
+       OPA_PORTPHYSSTATE_MAX = 11,
+       /* values 12-15 are reserved/ignored */
+};
+
+#endif /* _LINUX_H */
diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c
new file mode 100644 (file)
index 0000000..0bac21e
--- /dev/null
@@ -0,0 +1,1338 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/vmalloc.h>
+#include <linux/aer.h>
+#include <linux/module.h>
+
+#include "hfi.h"
+#include "chip_registers.h"
+#include "aspm.h"
+
+/* link speed vectors for Gen1, Gen2 and Gen3 speeds - not in Linux headers */
+#define GEN1_SPEED_VECTOR 0x1
+#define GEN2_SPEED_VECTOR 0x2
+#define GEN3_SPEED_VECTOR 0x3
+
+/*
+ * This file contains PCIe utility routines.
+ */
+
+/*
+ * Code to adjust PCIe capabilities.
+ * (Forward declaration of a static helper in this file.)
+ */
+static void tune_pcie_caps(struct hfi1_devdata *);
+
+/*
+ * Common PCIe bring-up: enable the device, claim its regions, pick a DMA
+ * mask (64 bit, falling back to 32 bit), enable bus mastering and PCIe
+ * error reporting.
+ *
+ * devdata does not exist yet when this runs - it is only allocated after
+ * this routine succeeds - so errors are reported with hfi1_early_err()
+ * rather than dd_dev_err().
+ *
+ * Returns 0 on success or the error from the failing PCI call.  Every
+ * failure after the device was enabled is undone via hfi1_pcie_cleanup().
+ */
+int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+       int rc;
+
+       rc = pci_enable_device(pdev);
+       if (rc) {
+               /*
+                * In theory this happens only if a chip reset (one we
+                * issued and then failed to reprogram the BAR after, or
+                * one caused by an internal chip error) was followed by a
+                * driver unload and reload.
+                *
+                * Either reset puts the BAR back to its initial state.  In
+                * the internal-error case the AER sticky error bit at
+                * offset 0x718 should be set, though the kernel does not
+                * appear to know about it yet.  If the kernel data
+                * structures kept the original BAR, this may be harmless.
+                */
+               hfi1_early_err(&pdev->dev, "pci enable failed: error %d\n",
+                              -rc);
+               return rc;
+       }
+
+       rc = pci_request_regions(pdev, DRIVER_NAME);
+       if (rc) {
+               hfi1_early_err(&pdev->dev,
+                              "pci_request_regions fails: err %d\n", -rc);
+               goto err_cleanup;
+       }
+
+       if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
+               rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+       } else {
+               /*
+                * 64 bit setup failed, so try 32 bit.  Some systems with
+                * 2GB or less of memory do not set up 64 bit maps.
+                */
+               rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+               if (rc) {
+                       hfi1_early_err(&pdev->dev,
+                                      "Unable to set DMA mask: %d\n", rc);
+                       goto err_cleanup;
+               }
+               rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+       }
+       if (rc) {
+               hfi1_early_err(&pdev->dev,
+                              "Unable to set DMA consistent mask: %d\n", rc);
+               goto err_cleanup;
+       }
+
+       pci_set_master(pdev);
+       (void)pci_enable_pcie_error_reporting(pdev);
+       return rc;
+
+err_cleanup:
+       hfi1_pcie_cleanup(pdev);
+       return rc;
+}
+
+/*
+ * Clean what was done in hfi1_pcie_init()
+ *
+ * Also used as the error-unwind path of hfi1_pcie_init() itself, including
+ * the case where pci_request_regions() failed.
+ */
+void hfi1_pcie_cleanup(struct pci_dev *pdev)
+{
+       pci_disable_device(pdev);
+       /*
+        * Release regions should be called after the disable. OK to
+        * call if request regions has not been called or failed.
+        */
+       pci_release_regions(pdev);
+}
+
+/*
+ * Do remaining PCIe setup, once dd is allocated, and save away
+ * fields required to re-initialize after a chip reset, or for
+ * various other purposes
+ *
+ * @dd: device data to fill in
+ * @pdev: PCI device backing @dd
+ * @ent: matching PCI device id (not referenced in this function)
+ *
+ * Returns 0 on success, -EINVAL if the length of BAR 0 is not
+ * TXE_PIO_SEND + TXE_PIO_SIZE, or -ENOMEM if either the register or the
+ * PIO mapping cannot be established.
+ */
+int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev,
+                    const struct pci_device_id *ent)
+{
+       unsigned long len;
+       resource_size_t addr;
+
+       dd->pcidev = pdev;
+       pci_set_drvdata(pdev, dd);
+
+       addr = pci_resource_start(pdev, 0);
+       len = pci_resource_len(pdev, 0);
+
+       /*
+        * The TXE PIO buffers are at the tail end of the chip space.
+        * Cut them off and map them separately.
+        */
+
+       /* sanity check vs expectations */
+       if (len != TXE_PIO_SEND + TXE_PIO_SIZE) {
+               dd_dev_err(dd, "chip PIO range does not match\n");
+               return -EINVAL;
+       }
+
+       /* registers: uncached mapping of everything below the PIO buffers */
+       dd->kregbase = ioremap_nocache(addr, TXE_PIO_SEND);
+       if (!dd->kregbase)
+               return -ENOMEM;
+
+       /* PIO send buffers: write-combining mapping of the tail */
+       dd->piobase = ioremap_wc(addr + TXE_PIO_SEND, TXE_PIO_SIZE);
+       if (!dd->piobase) {
+               iounmap(dd->kregbase);
+               return -ENOMEM;
+       }
+
+       dd->flags |= HFI1_PRESENT;      /* now register routines work */
+
+       dd->kregend = dd->kregbase + TXE_PIO_SEND;
+       dd->physaddr = addr;        /* used for io_remap, etc. */
+
+       /*
+        * Re-map the chip's RcvArray as write-combining to allow us
+        * to write an entire cacheline worth of entries in one shot.
+        * If this re-map fails, just continue - the RcvArray programming
+        * function will handle both cases.
+        */
+       dd->chip_rcv_array_count = read_csr(dd, RCV_ARRAY_CNT);
+       /* mapping size is 8 bytes per RcvArray entry */
+       dd->rcvarray_wc = ioremap_wc(addr + RCV_ARRAY,
+                                    dd->chip_rcv_array_count * 8);
+       dd_dev_info(dd, "WC Remapped RcvArray: %p\n", dd->rcvarray_wc);
+       /*
+        * Save BARs and command to rewrite after device reset.
+        */
+       dd->pcibar0 = addr;
+       dd->pcibar1 = addr >> 32;       /* upper half of the 64-bit BAR address */
+       pci_read_config_dword(dd->pcidev, PCI_ROM_ADDRESS, &dd->pci_rom);
+       pci_read_config_word(dd->pcidev, PCI_COMMAND, &dd->pci_command);
+       pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &dd->pcie_devctl);
+       pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL, &dd->pcie_lnkctl);
+       pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL2,
+                                 &dd->pcie_devctl2);
+       pci_read_config_dword(dd->pcidev, PCI_CFG_MSIX0, &dd->pci_msix0);
+       pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE1, &dd->pci_lnkctl3);
+       pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2, &dd->pci_tph2);
+
+       return 0;
+}
+
+/*
+ * Do PCIe cleanup related to dd, after chip-specific cleanup, etc.  Just prior
+ * to releasing the dd memory.
+ * Void because all of the core pcie cleanup functions are void.
+ *
+ * Every mapping pointer is cleared once it has been unmapped, so a mapping
+ * that was never created is skipped and a repeated call is harmless.
+ */
+void hfi1_pcie_ddcleanup(struct hfi1_devdata *dd)
+{
+       u64 __iomem *base = (void __iomem *)dd->kregbase;
+
+       /* stop register access before the register mapping goes away */
+       dd->flags &= ~HFI1_PRESENT;
+       dd->kregbase = NULL;
+       if (base)
+               iounmap(base);
+       if (dd->rcvarray_wc) {
+               iounmap(dd->rcvarray_wc);
+               dd->rcvarray_wc = NULL;
+       }
+       if (dd->piobase) {
+               iounmap(dd->piobase);
+               dd->piobase = NULL;
+       }
+}
+
+/*
+ * Do a Function Level Reset (FLR) on the device.
+ * Based on static function drivers/pci/pci.c:pcie_flr().
+ *
+ * The Transactions Pending bit is polled up to four times, sleeping
+ * 100ms, 200ms and then 400ms between polls.  The reset is issued even
+ * if the bit never clears.
+ */
+void hfi1_pcie_flr(struct hfi1_devdata *dd)
+{
+       unsigned int delay_ms = 0;
+       bool pending = true;
+       int tries;
+       u16 devsta;
+
+       /* no need to check for the capability - we know the device has it */
+
+       for (tries = 0; tries < 4; tries++) {
+               /* first poll is immediate, later polls back off */
+               if (delay_ms)
+                       msleep(delay_ms);
+
+               pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVSTA, &devsta);
+               if (!(devsta & PCI_EXP_DEVSTA_TRPND)) {
+                       pending = false;
+                       break;
+               }
+
+               delay_ms = delay_ms ? delay_ms * 2 : 100;
+       }
+
+       if (pending)
+               dd_dev_err(dd, "Transaction Pending bit is not clearing, proceeding with reset anyway\n");
+
+       pcie_capability_set_word(dd->pcidev, PCI_EXP_DEVCTL,
+                                PCI_EXP_DEVCTL_BCR_FLR);
+       /* PCIe spec requires the function to be back within 100ms */
+       msleep(100);
+}
+
+/*
+ * Try to enable between 1 and *msixcnt MSI-X vectors.
+ *
+ * On success the allocated entries are copied back into hfi1_msix_entry
+ * and *msixcnt is set to the number of vectors obtained.  On any failure
+ * *msixcnt is set to 0 and the device is switched to INTx.
+ */
+static void msix_setup(struct hfi1_devdata *dd, int pos, u32 *msixcnt,
+                      struct hfi1_msix_entry *hfi1_msix_entry)
+{
+       struct msix_entry *scratch;
+       int requested = *msixcnt;
+       int granted;
+       int n;
+
+       /*
+        * pci_enable_msix_range() cannot be handed the hfi1_msix_entry
+        * array, so stage the entries in a scratch msix_entry array and
+        * copy the results back afterwards.
+        */
+       scratch = kmalloc_array(requested, sizeof(*scratch), GFP_KERNEL);
+       if (!scratch) {
+               granted = -ENOMEM;
+               goto use_intx;
+       }
+
+       for (n = 0; n < requested; n++)
+               scratch[n] = hfi1_msix_entry[n].msix;
+
+       granted = pci_enable_msix_range(dd->pcidev, scratch, 1, requested);
+       if (granted >= 0) {
+               for (n = 0; n < granted; n++)
+                       hfi1_msix_entry[n].msix = scratch[n];
+       }
+       kfree(scratch);
+
+       if (granted >= 0) {
+               *msixcnt = granted;
+               return;
+       }
+
+use_intx:
+       dd_dev_err(dd, "pci_enable_msix_range %d vectors failed: %d, falling back to INTx\n",
+                  requested, granted);
+       *msixcnt = 0;
+       hfi1_enable_intx(dd->pcidev);
+}
+
+/*
+ * Translate the Current Link Speed field of a link status value into the
+ * speed figure stored in dd->lbus_speed (2500, 5000 or 8000).
+ * Unknown encodings are treated as Gen1.
+ */
+static u32 extract_speed(u16 linkstat)
+{
+       switch (linkstat & PCI_EXP_LNKSTA_CLS) {
+       case GEN3_SPEED_VECTOR:
+               return 8000; /* Gen 3, 8GHz */
+       case PCI_EXP_LNKSTA_CLS_5_0GB:
+               return 5000; /* Gen 2, 5GHz */
+       case PCI_EXP_LNKSTA_CLS_2_5GB:
+       default: /* not defined, assume Gen1 */
+               return 2500; /* Gen 1, 2.5GHz */
+       }
+}
+
+/* return the negotiated PCIe link width from the given link status */
+static u32 extract_width(u16 linkstat)
+{
+       u32 nlw = linkstat & PCI_EXP_LNKSTA_NLW;
+
+       return nlw >> PCI_EXP_LNKSTA_NLW_SHIFT;
+}
+
+/*
+ * Re-read the PCIe link status register and refresh the cached link
+ * description in dd: lbus_speed, lbus_width and the lbus_info string.
+ */
+static void update_lbus_info(struct hfi1_devdata *dd)
+{
+       u16 lnksta;
+
+       pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKSTA, &lnksta);
+
+       dd->lbus_speed = extract_speed(lnksta);
+       dd->lbus_width = extract_width(lnksta);
+
+       snprintf(dd->lbus_info, sizeof(dd->lbus_info),
+                "PCIe,%uMHz,x%u", dd->lbus_speed, dd->lbus_width);
+}
+
+/*
+ * Record the current PCIe link width and speed in dd, and decide whether
+ * the link can run at Gen3: both this device's Max Link Speed and, when a
+ * parent bridge exists, the bus max speed must be Gen3.
+ *
+ * Returns 0 on success or -EINVAL if the device is not PCI Express.
+ */
+int pcie_speeds(struct hfi1_devdata *dd)
+{
+       struct pci_dev *pdev = dd->pcidev;
+       struct pci_dev *bridge = pdev->bus->self;
+       int gen3_ok = 1;
+       u32 lnkcap;
+       u32 max_speed;
+
+       if (!pci_is_pcie(pdev)) {
+               dd_dev_err(dd, "Can't find PCI Express capability!\n");
+               return -EINVAL;
+       }
+
+       /* our own maximum supported link speed */
+       pcie_capability_read_dword(pdev, PCI_EXP_LNKCAP, &lnkcap);
+       max_speed = lnkcap & PCI_EXP_LNKCAP_SLS;
+       if (max_speed != GEN3_SPEED_VECTOR) {
+               dd_dev_info(dd,
+                           "This HFI is not Gen3 capable, max speed 0x%x, need 0x3\n",
+                           max_speed);
+               gen3_ok = 0;
+       }
+
+       /*
+        * bus->max_bus_speed is set from the bridge's linkcap Max Link Speed
+        */
+       if (bridge && pdev->bus->max_bus_speed != PCIE_SPEED_8_0GT) {
+               dd_dev_info(dd, "Parent PCIe bridge does not support Gen3\n");
+               gen3_ok = 0;
+       }
+
+       dd->link_gen3_capable = gen3_ok;
+
+       /* obtain the link width and current speed */
+       update_lbus_info(dd);
+
+       dd_dev_info(dd, "%s\n", dd->lbus_info);
+
+       return 0;
+}
+
+/*
+ * Set up the interrupt mode and then tune the PCIe capabilities.
+ *
+ * On return *nent holds:
+ *     - the actual number of MSI-X interrupts allocated, or
+ *     - 0 if INTx is being used instead.
+ */
+void request_msix(struct hfi1_devdata *dd, u32 *nent,
+                 struct hfi1_msix_entry *entry)
+{
+       int msix_pos = dd->pcidev->msix_cap;
+
+       if (!*nent || !msix_pos) {
+               /* nothing requested, or no MSI-X capability: use INTx */
+               *nent = 0;
+               hfi1_enable_intx(dd->pcidev);
+       } else {
+               /* ends up with either MSI-X or the INTx fallback */
+               msix_setup(dd, msix_pos, nent, entry);
+       }
+
+       tune_pcie_caps(dd);
+}
+
+/*
+ * Switch the device to legacy INTx interrupts.  Called when MSI-X is not
+ * requested, not available, or could not be enabled.
+ */
+void hfi1_enable_intx(struct pci_dev *pdev)
+{
+       /* first, turn on INTx */
+       pci_intx(pdev, 1);
+       /* then turn off MSI-X */
+       pci_disable_msix(pdev);
+}
+
+/*
+ * Restore command and BARs after a reset has wiped them out.
+ *
+ * Writes back the config-space values that hfi1_pcie_ddinit() saved in dd:
+ * command, BAR 0/1, ROM address, device/link control, device control 2,
+ * and the MSI-X, SPCIE1 and TPH2 dwords.
+ */
+void restore_pci_variables(struct hfi1_devdata *dd)
+{
+       pci_write_config_word(dd->pcidev, PCI_COMMAND, dd->pci_command);
+       pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0, dd->pcibar0);
+       pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_1, dd->pcibar1);
+       pci_write_config_dword(dd->pcidev, PCI_ROM_ADDRESS, dd->pci_rom);
+       pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, dd->pcie_devctl);
+       pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL, dd->pcie_lnkctl);
+       pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL2,
+                                  dd->pcie_devctl2);
+       pci_write_config_dword(dd->pcidev, PCI_CFG_MSIX0, dd->pci_msix0);
+       pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE1, dd->pci_lnkctl3);
+       pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2, dd->pci_tph2);
+}
+
+/*
+ * BIOS may not set PCIe bus-utilization parameters for best performance.
+ * Check and optionally adjust them to maximize our throughput.
+ *
+ * pcie_caps: tune_pcie_caps() uses bits 0-2 as the upper limit for the
+ * Max Payload Size code and bits 4-6 as the upper limit for the Max Read
+ * Request Size code (size in bytes = 128 << code).
+ */
+static int hfi1_pcie_caps;
+module_param_named(pcie_caps, hfi1_pcie_caps, int, S_IRUGO);
+MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)");
+
+/* ASPM mode selected at module load; not static, so visible to other files */
+uint aspm_mode = ASPM_MODE_DISABLED;
+module_param_named(aspm, aspm_mode, uint, S_IRUGO);
+MODULE_PARM_DESC(aspm, "PCIe ASPM: 0: disable, 1: enable, 2: dynamic");
+
+/*
+ * Raise the PCIe Max Payload Size and Max Read Request Size of both the
+ * parent port and this device, up to the limits given by the pcie_caps
+ * module parameter, and make sure extended tags are enabled.
+ *
+ * Sizes are handled as codes, where size in bytes = 128 << code.
+ * Settings are only ever increased, never lowered.
+ */
+static void tune_pcie_caps(struct hfi1_devdata *dd)
+{
+       struct pci_dev *ep = dd->pcidev;
+       struct pci_dev *rc;
+       u16 devctl;
+       u16 mpss, rc_mps, ep_mps;
+       u16 mrrs, rc_mrrs, ep_mrrs;
+
+       /*
+        * Turn on extended tags in DevCtl in case the BIOS has turned it off
+        * to improve WFR SDMA bandwidth
+        */
+       pcie_capability_read_word(ep, PCI_EXP_DEVCTL, &devctl);
+       if (!(devctl & PCI_EXP_DEVCTL_EXT_TAG)) {
+               dd_dev_info(dd, "Enabling PCIe extended tags\n");
+               pcie_capability_write_word(ep, PCI_EXP_DEVCTL,
+                                          devctl | PCI_EXP_DEVCTL_EXT_TAG);
+       }
+
+       /*
+        * The driver cannot perform the tuning if it does not have
+        * access to the upstream component.
+        */
+       rc = ep->bus->self;
+       if (!rc)
+               return;
+       if (!pci_is_root_bus(rc->bus)) {
+               dd_dev_info(dd, "Parent not root\n");
+               return;
+       }
+       if (!(pci_is_pcie(rc) && pci_is_pcie(ep)))
+               return;
+
+       /* currently configured payload codes: parent first, then us */
+       rc_mps = ffs(pcie_get_mps(rc)) - 8;
+       ep_mps = ffs(pcie_get_mps(ep)) - 8;
+
+       /* largest payload code supported by both ends ... */
+       mpss = rc->pcie_mpss;
+       if (mpss > ep->pcie_mpss)
+               mpss = ep->pcie_mpss;
+       /* ... further capped by the module parameter */
+       if (mpss > (hfi1_pcie_caps & 7))
+               mpss = hfi1_pcie_caps & 7;
+
+       /* bump whichever side is configured below the allowed payload */
+       if (mpss > rc_mps)
+               pcie_set_mps(rc, 128 << mpss);
+       if (mpss > ep_mps)
+               pcie_set_mps(ep, 128 << mpss);
+
+       /*
+        * Now the Read Request size.
+        * No field for max supported, but PCIe spec limits it to 4096,
+        * which is code '5' (log2(4096) - 7)
+        */
+       mrrs = 5;
+       if (mrrs > ((hfi1_pcie_caps >> 4) & 7))
+               mrrs = (hfi1_pcie_caps >> 4) & 7;
+       mrrs = 128 << mrrs;     /* code -> bytes */
+
+       rc_mrrs = pcie_get_readrq(rc);
+       ep_mrrs = pcie_get_readrq(ep);
+
+       if (mrrs > rc_mrrs)
+               pcie_set_readrq(rc, mrrs);
+       if (mrrs > ep_mrrs)
+               pcie_set_readrq(ep, mrrs);
+}
+
+/* End of PCIe capability tuning */
+
+/*
+ * From here through hfi1_pci_err_handler definition is invoked via
+ * PCI error infrastructure, registered via pci
+ */
+
+/*
+ * error_detected callback: map the reported channel state to a recovery
+ * request.
+ *
+ * Returns PCI_ERS_RESULT_RECOVERED for a normal (or unrecognized) state,
+ * PCI_ERS_RESULT_NEED_RESET for a frozen channel (after disabling the
+ * device), and PCI_ERS_RESULT_DISCONNECT for a permanent failure.
+ *
+ * The drvdata pointer is only set by hfi1_pcie_ddinit(), so dd may be NULL
+ * if an error arrives early; dd is only handed to the dd_dev_* logging
+ * helpers when it is set, as the permanent-failure case already did.
+ */
+static pci_ers_result_t
+pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
+{
+       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+       pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED;
+
+       switch (state) {
+       case pci_channel_io_normal:
+               if (dd)
+                       dd_dev_info(dd, "State Normal, ignoring\n");
+               break;
+
+       case pci_channel_io_frozen:
+               if (dd)
+                       dd_dev_info(dd, "State Frozen, requesting reset\n");
+               pci_disable_device(pdev);
+               ret = PCI_ERS_RESULT_NEED_RESET;
+               break;
+
+       case pci_channel_io_perm_failure:
+               if (dd) {
+                       dd_dev_info(dd, "State Permanent Failure, disabling\n");
+                       /* no more register accesses! */
+                       dd->flags &= ~HFI1_PRESENT;
+                       hfi1_disable_after_error(dd);
+               }
+                /* else early, or other problem */
+               ret =  PCI_ERS_RESULT_DISCONNECT;
+               break;
+
+       default: /* shouldn't happen */
+               if (dd)
+                       dd_dev_info(dd, "HFI1 PCI errors detected (state %d)\n",
+                                   state);
+               break;
+       }
+       return ret;
+}
+
+/*
+ * mmio_enabled callback: probe the device by reading the port's receive
+ * word counter.  A read of all ones means a reset is needed; otherwise
+ * (or when there is no devdata/port to probe) report recovered.
+ */
+static pci_ers_result_t
+pci_mmio_enabled(struct pci_dev *pdev)
+{
+       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+       pci_ers_result_t result = PCI_ERS_RESULT_RECOVERED;
+       u64 rx_words;
+
+       if (!dd || !dd->pport)
+               return result;
+
+       rx_words = read_port_cntr(dd->pport, C_RX_WORDS, CNTR_INVALID_VL);
+       if (rx_words == ~0ULL)
+               result = PCI_ERS_RESULT_NEED_RESET;
+
+       dd_dev_info(dd,
+                   "HFI1 mmio_enabled function called, read wordscntr %Lx, returning %d\n",
+                   rx_words, result);
+       return result;
+}
+
+/*
+ * slot_reset callback: nothing is done beyond logging; always reports
+ * PCI_ERS_RESULT_CAN_RECOVER.  dd may be NULL if no devdata has been
+ * attached to pdev yet, in which case the log line is skipped.
+ */
+static pci_ers_result_t
+pci_slot_reset(struct pci_dev *pdev)
+{
+       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+
+       if (dd)
+               dd_dev_info(dd, "HFI1 slot_reset function called, ignored\n");
+       return PCI_ERS_RESULT_CAN_RECOVER;
+}
+
+/*
+ * link_reset callback: nothing is done beyond logging; always reports
+ * PCI_ERS_RESULT_CAN_RECOVER.  dd may be NULL if no devdata has been
+ * attached to pdev yet, in which case the log line is skipped.
+ */
+static pci_ers_result_t
+pci_link_reset(struct pci_dev *pdev)
+{
+       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+
+       if (dd)
+               dd_dev_info(dd, "HFI1 link_reset function called, ignored\n");
+       return PCI_ERS_RESULT_CAN_RECOVER;
+}
+
+/*
+ * resume callback: clear the AER uncorrectable error status and
+ * re-initialize the device.  If no devdata has been attached to pdev
+ * there is nothing to re-initialize, so only the AER status is cleared.
+ */
+static void
+pci_resume(struct pci_dev *pdev)
+{
+       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+
+       if (dd)
+               dd_dev_info(dd, "HFI1 resume function called\n");
+       pci_cleanup_aer_uncorrect_error_status(pdev);
+       if (!dd)
+               return;
+       /*
+        * Running jobs will fail, since it's asynchronous
+        * unlike sysfs-requested reset.   Better than
+        * doing nothing.
+        */
+       hfi1_init(dd, 1); /* same as re-init after reset */
+}
+
+/*
+ * PCI error recovery callback table for this driver.  Each member points
+ * at the static handler of the matching name defined in this file.
+ */
+const struct pci_error_handlers hfi1_pci_err_handler = {
+       .error_detected = pci_error_detected,
+       .mmio_enabled = pci_mmio_enabled,
+       .link_reset = pci_link_reset,
+       .slot_reset = pci_slot_reset,
+       .resume = pci_resume,
+};
+
+/*============================================================================*/
+/* PCIe Gen3 support */
+
+/*
+ * This code is separated out because it is expected to be removed in the
+ * final shipping product.  If not, then it will be revisited and items
+ * will be moved to more standard locations.
+ */
+
+/* ASIC_PCI_SD_HOST_STATUS.FW_DNLD_STS field values */
+#define DL_STATUS_HFI0 0x1     /* hfi0 firmware download complete */
+#define DL_STATUS_HFI1 0x2     /* hfi1 firmware download complete */
+#define DL_STATUS_BOTH 0x3     /* hfi0 and hfi1 firmware download complete */
+
+/* ASIC_PCI_SD_HOST_STATUS.FW_DNLD_ERR field values */
+#define DL_ERR_NONE            0x0     /* no error */
+#define DL_ERR_SWAP_PARITY     0x1     /* parity error in SerDes interrupt */
+                                       /*   or response data */
+#define DL_ERR_DISABLED        0x2     /* hfi disabled */
+#define DL_ERR_SECURITY        0x3     /* security check failed */
+#define DL_ERR_SBUS            0x4     /* SBus status error */
+#define DL_ERR_XFR_PARITY      0x5     /* parity error during ROM transfer */
+
+/* gasket block secondary bus reset delay */
+#define SBR_DELAY_US 200000    /* 200ms */
+
+/* mask for PCIe capability register lnkctl2 target link speed */
+#define LNKCTL2_TARGET_LINK_SPEED_MASK 0xf
+
+/*
+ * Requested link generation.  do_pcie_gen3_transition() accepts 1, 2 or 3
+ * and skips the transition for any other value.
+ */
+static uint pcie_target = 3;
+module_param(pcie_target, uint, S_IRUGO);
+MODULE_PARM_DESC(pcie_target, "PCIe target speed (0 skip, 1-3 Gen1-3)");
+
+/* when non-zero, redo the transition even if already at the target speed */
+static uint pcie_force;
+module_param(pcie_force, uint, S_IRUGO);
+MODULE_PARM_DESC(pcie_force, "Force driver to do a PCIe firmware download even if already at target speed");
+
+/* retry budget for reaching the requested speed (see parameter description) */
+static uint pcie_retry = 5;
+module_param(pcie_retry, uint, S_IRUGO);
+MODULE_PARM_DESC(pcie_retry, "Driver will try this many times to reach requested speed");
+
+/*
+ * Equalization preset selection.  UNSET_PSET marks "not given on the
+ * command line"; do_pcie_gen3_transition() then substitutes the default
+ * for the device type, and also replaces any value above 10.
+ */
+#define UNSET_PSET 255
+#define DEFAULT_DISCRETE_PSET 2        /* discrete HFI */
+#define DEFAULT_MCP_PSET 4     /* MCP HFI */
+static uint pcie_pset = UNSET_PSET;
+module_param(pcie_pset, uint, S_IRUGO);
+MODULE_PARM_DESC(pcie_pset, "PCIe Eq Pset value to use, range is 0-10");
+
+/*
+ * Equalization columns: index into the second dimension of the
+ * preliminary EQ tables below.
+ */
+#define PREC 0
+#define ATTN 1
+#define POST 2
+
+/*
+ * Discrete silicon preliminary equalization values.
+ * One row per preset p0-p10.  load_eq_table() derives the programmed
+ * coefficients from the PREC and POST columns; the ATTN column is only
+ * reported in its rule-violation message.
+ */
+static const u8 discrete_preliminary_eq[11][3] = {
+       /* prec   attn   post */
+       {  0x00,  0x00,  0x12 },        /* p0 */
+       {  0x00,  0x00,  0x0c },        /* p1 */
+       {  0x00,  0x00,  0x0f },        /* p2 */
+       {  0x00,  0x00,  0x09 },        /* p3 */
+       {  0x00,  0x00,  0x00 },        /* p4 */
+       {  0x06,  0x00,  0x00 },        /* p5 */
+       {  0x09,  0x00,  0x00 },        /* p6 */
+       {  0x06,  0x00,  0x0f },        /* p7 */
+       {  0x09,  0x00,  0x09 },        /* p8 */
+       {  0x0c,  0x00,  0x00 },        /* p9 */
+       {  0x00,  0x00,  0x18 },        /* p10 */
+};
+
+/*
+ * Integrated silicon preliminary equalization values.
+ * Same layout as the discrete table: one row per preset p0-p10.
+ */
+static const u8 integrated_preliminary_eq[11][3] = {
+       /* prec   attn   post */
+       {  0x00,  0x1e,  0x07 },        /* p0 */
+       {  0x00,  0x1e,  0x05 },        /* p1 */
+       {  0x00,  0x1e,  0x06 },        /* p2 */
+       {  0x00,  0x1e,  0x04 },        /* p3 */
+       {  0x00,  0x1e,  0x00 },        /* p4 */
+       {  0x03,  0x1e,  0x00 },        /* p5 */
+       {  0x04,  0x1e,  0x00 },        /* p6 */
+       {  0x03,  0x1e,  0x06 },        /* p7 */
+       {  0x03,  0x1e,  0x04 },        /* p8 */
+       {  0x05,  0x1e,  0x00 },        /* p9 */
+       {  0x00,  0x1e,  0x0a },        /* p10 */
+};
+
+/*
+ * Helper to format the value to write to hardware: packs the pre-cursor,
+ * cursor and post-cursor coefficients into their PCIE_CFG_REG_PL102
+ * field positions.  Each argument is widened to u32 before shifting.
+ */
+#define eq_value(pre, curr, post) \
+       ((((u32)(pre)) << \
+                       PCIE_CFG_REG_PL102_GEN3_EQ_PRE_CURSOR_PSET_SHIFT) \
+       | (((u32)(curr)) << PCIE_CFG_REG_PL102_GEN3_EQ_CURSOR_PSET_SHIFT) \
+       | (((u32)(post)) << \
+               PCIE_CFG_REG_PL102_GEN3_EQ_POST_CURSOR_PSET_SHIFT))
+
+/*
+ * Load the given EQ preset table into the PCIe hardware.
+ *
+ * For each of the 11 presets the pre- and post-cursor table entries are
+ * divided by div, and the cursor coefficient is whatever remains of fs
+ * after subtracting both.  After each write the hardware's rule-violation
+ * status is checked and any offending preset is logged.
+ *
+ * Returns 0 if every preset was accepted, -EINVAL if any preset violated
+ * the coefficient rules (all presets are still written).
+ */
+static int load_eq_table(struct hfi1_devdata *dd, const u8 eq[11][3], u8 fs,
+                        u8 div)
+{
+       struct pci_dev *pdev = dd->pcidev;
+       bool violated = false;
+       u32 status;
+       u32 pset;
+       u8 pre, cursor, post;
+
+       for (pset = 0; pset < 11; pset++) {
+               const u8 *row = eq[pset];
+
+               /* select the preset to program */
+               pci_write_config_dword(pdev, PCIE_CFG_REG_PL103, pset);
+
+               /* compute and write its coefficients */
+               pre = row[PREC] / div;
+               post = row[POST] / div;
+               cursor = fs - pre - post;
+               pci_write_config_dword(pdev, PCIE_CFG_REG_PL102,
+                                      eq_value(pre, cursor, post));
+
+               /* check if these coefficients violate EQ rules */
+               pci_read_config_dword(pdev, PCIE_CFG_REG_PL105, &status);
+               if (!(status &
+                     PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK))
+                       continue;
+
+               /* print the table heading once, ahead of the first offender */
+               if (!violated) {
+                       dd_dev_err(dd,
+                                  "Gen3 EQ Table Coefficient rule violations\n");
+                       dd_dev_err(dd, "         prec   attn   post\n");
+               }
+               dd_dev_err(dd, "   p%02d:   %02x     %02x     %02x\n",
+                          pset, (u32)row[PREC], (u32)row[ATTN],
+                          (u32)row[POST]);
+               dd_dev_err(dd, "            %02x     %02x     %02x\n",
+                          (u32)pre, (u32)cursor, (u32)post);
+               violated = true;
+       }
+
+       return violated ? -EINVAL : 0;
+}
+
+/*
+ * Steps to be done after the PCIe firmware is downloaded and
+ * before the SBR for the PCIe Gen3 transition.
+ * The SBus resource is already being held by the caller.
+ */
+static void pcie_post_steps(struct hfi1_devdata *dd)
+{
+       int serdes;
+
+       set_sbus_fast_mode(dd);
+
+       /*
+        * Write to the PCIe PCSes to set the G3_LOCKED_NEXT bits to 1.
+        * This avoids a spurious framing error that can otherwise be
+        * generated by the MAC layer.
+        *
+        * Each PCS is addressed individually since no broadcast is set up.
+        */
+       for (serdes = 0; serdes < NUM_PCIE_SERDES; serdes++)
+               sbus_request(dd, pcie_pcs_addrs[dd->hfi1_id][serdes],
+                            0x03, WRITE_SBUS_RECEIVER, 0x00022132);
+
+       clear_sbus_fast_mode(dd);
+}
+
+/*
+ * Trigger a secondary bus reset (SBR) on ourselves using our parent.
+ *
+ * Based on pci_parent_bus_reset() which is not exported by the
+ * kernel core.
+ *
+ * Returns 0 after the reset has been issued, or -ENOTTY if there is no
+ * parent bridge or if any other device shares our bus.
+ */
+static int trigger_sbr(struct hfi1_devdata *dd)
+{
+       struct pci_dev *self = dd->pcidev;
+       struct pci_bus *bus = self->bus;
+       struct pci_dev *bridge = bus->self;
+       struct pci_dev *other;
+
+       /* need a parent */
+       if (!bridge) {
+               dd_dev_err(dd, "%s: no parent device\n", __func__);
+               return -ENOTTY;
+       }
+
+       /* should not be anyone else on the bus */
+       list_for_each_entry(other, &bus->devices, bus_list) {
+               if (other == self)
+                       continue;
+               dd_dev_err(dd,
+                          "%s: another device is on the same bus\n",
+                          __func__);
+               return -ENOTTY;
+       }
+
+       /*
+        * A secondary bus reset (SBR) issues a hot reset to our device.
+        * The following routine does a 1s wait after the reset is dropped
+        * per PCI Trhfa (recovery time).  PCIe 3.0 section 6.6.1 -
+        * Conventional Reset, paragraph 3, line 35 also says that a 1s
+        * delay after a reset is required.  Per spec requirements,
+        * the link is either working or not after that point.
+        */
+       pci_reset_bridge_secondary_bus(bridge);
+
+       return 0;
+}
+
+/*
+ * Write the given gasket interrupt register.
+ *
+ * index selects the 8-byte entry within ASIC_PCIE_SD_INTRPT_LIST; code
+ * and data are placed in that entry's interrupt code and data fields.
+ */
+static void write_gasket_interrupt(struct hfi1_devdata *dd, int index,
+                                  u16 code, u16 data)
+{
+       u64 entry;
+
+       entry = (u64)code << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT;
+       entry |= (u64)data << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT;
+
+       write_csr(dd, ASIC_PCIE_SD_INTRPT_LIST + (index * 8), entry);
+}
+
+/*
+ * Tell the gasket logic how to react to the reset.
+ *
+ * Builds the ASIC_PCIE_SD_HOST_CMD value one field at a time: the
+ * interrupt command bit for this HFI, the SerDes broadcast SBus receiver
+ * address, SBR mode, and the SBR delay timer.
+ */
+static void arm_gasket_logic(struct hfi1_devdata *dd)
+{
+       u64 cmd;
+
+       cmd = ((u64)1 << dd->hfi1_id) <<
+             ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT;
+       cmd |= (u64)pcie_serdes_broadcast[dd->hfi1_id] <<
+              ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT;
+       cmd |= ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK;
+       cmd |= ((u64)SBR_DELAY_US & ASIC_PCIE_SD_HOST_CMD_TIMER_MASK) <<
+              ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT;
+
+       write_csr(dd, ASIC_PCIE_SD_HOST_CMD, cmd);
+       /* read back to push the write */
+       read_csr(dd, ASIC_PCIE_SD_HOST_CMD);
+}
+
+/*
+ * CCE_PCIE_CTRL long name helpers
+ * We redefine these shorter macros to use in the code while leaving
+ * chip_registers.h to be autogenerated from the hardware spec.
+ * They are used by write_xmt_margin() to extract and rebuild the
+ * CCE_PCIE_CTRL fields.
+ */
+#define LANE_BUNDLE_MASK              CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_MASK
+#define LANE_BUNDLE_SHIFT             CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_SHIFT
+#define LANE_DELAY_MASK               CCE_PCIE_CTRL_PCIE_LANE_DELAY_MASK
+#define LANE_DELAY_SHIFT              CCE_PCIE_CTRL_PCIE_LANE_DELAY_SHIFT
+#define MARGIN_OVERWRITE_ENABLE_SHIFT CCE_PCIE_CTRL_XMT_MARGIN_OVERWRITE_ENABLE_SHIFT
+#define MARGIN_SHIFT                  CCE_PCIE_CTRL_XMT_MARGIN_SHIFT
+#define MARGIN_G1_G2_OVERWRITE_MASK   CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_MASK
+#define MARGIN_G1_G2_OVERWRITE_SHIFT  CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_SHIFT
+#define MARGIN_GEN1_GEN2_MASK         CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_MASK
+#define MARGIN_GEN1_GEN2_SHIFT        CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_SHIFT
+
+ /*
+  * Write xmt_margin for full-swing (WFR-B) or half-swing (WFR-C).
+  *
+  * Only the integrated device has CCE_PCIE_CTRL rewritten; for any other
+  * device the register is read and logged but left untouched.  fname is
+  * the caller's name, used only in the debug message.
+  */
+static void write_xmt_margin(struct hfi1_devdata *dd, const char *fname)
+{
+       u64 ctrl = read_csr(dd, CCE_PCIE_CTRL);
+       u64 margin;
+       u64 margin_oe;
+       u64 delay;
+       u64 bundle;
+
+       /*
+        * For Discrete, use full-swing.
+        *  - PCIe TX defaults to full-swing.
+        *    Leave this register as default.
+        * For Integrated, use half-swing
+        *  - Copy xmt_margin and xmt_margin_oe
+        *    from Gen1/Gen2 to Gen3.
+        */
+       if (dd->pcidev->device == PCI_DEVICE_ID_INTEL1) { /* integrated */
+               if (is_ax(dd)) {
+                       /*
+                        * For A0, EFUSE values are not set.  Override with
+                        * the correct values.  xmt_margin and
+                        * OverwriteEnable should be the same for Gen1/Gen2
+                        * and Gen3.
+                        */
+                       margin = 0x5;
+                       margin_oe = 0x1;
+                       delay = 0xF; /* Delay 240ns. */
+                       bundle = 0x0; /* Set to 1 lane. */
+               } else {
+                       /* start from the fields currently in the register */
+                       margin = (ctrl >> MARGIN_GEN1_GEN2_SHIFT)
+                                 & MARGIN_GEN1_GEN2_MASK;
+                       margin_oe = (ctrl >> MARGIN_G1_G2_OVERWRITE_SHIFT)
+                                    & MARGIN_G1_G2_OVERWRITE_MASK;
+                       delay = (ctrl >> LANE_DELAY_SHIFT) & LANE_DELAY_MASK;
+                       bundle = (ctrl >> LANE_BUNDLE_SHIFT)
+                                 & LANE_BUNDLE_MASK;
+               }
+
+               /* rebuild the register; Gen3 gets the Gen1/Gen2 margin */
+               ctrl = (margin << MARGIN_GEN1_GEN2_SHIFT)
+                       | (margin_oe << MARGIN_G1_G2_OVERWRITE_SHIFT)
+                       | (margin << MARGIN_SHIFT)
+                       | (margin_oe << MARGIN_OVERWRITE_ENABLE_SHIFT)
+                       | (delay << LANE_DELAY_SHIFT)
+                       | (bundle << LANE_BUNDLE_SHIFT);
+
+               write_csr(dd, CCE_PCIE_CTRL, ctrl);
+       }
+
+       dd_dev_dbg(dd, "%s: program XMT margin, CcePcieCtrl 0x%llx\n",
+                  fname, ctrl);
+}
+
+/*
+ * Do all the steps needed to transition the PCIe link to Gen3 speed.
+ */
+int do_pcie_gen3_transition(struct hfi1_devdata *dd)
+{
+       struct pci_dev *parent = dd->pcidev->bus->self;
+       u64 fw_ctrl;
+       u64 reg, therm;
+       u32 reg32, fs, lf;
+       u32 status, err;
+       int ret;
+       int do_retry, retry_count = 0;
+       uint default_pset;
+       u16 target_vector, target_speed;
+       u16 lnkctl2, vendor;
+       u8 div;
+       const u8 (*eq)[3];
+       int return_error = 0;
+
+       /* PCIe Gen3 is for the ASIC only */
+       if (dd->icode != ICODE_RTL_SILICON)
+               return 0;
+
+       if (pcie_target == 1) {                 /* target Gen1 */
+               target_vector = GEN1_SPEED_VECTOR;
+               target_speed = 2500;
+       } else if (pcie_target == 2) {          /* target Gen2 */
+               target_vector = GEN2_SPEED_VECTOR;
+               target_speed = 5000;
+       } else if (pcie_target == 3) {          /* target Gen3 */
+               target_vector = GEN3_SPEED_VECTOR;
+               target_speed = 8000;
+       } else {
+               /* off or invalid target - skip */
+               dd_dev_info(dd, "%s: Skipping PCIe transition\n", __func__);
+               return 0;
+       }
+
+       /* if already at target speed, done (unless forced) */
+       if (dd->lbus_speed == target_speed) {
+               dd_dev_info(dd, "%s: PCIe already at gen%d, %s\n", __func__,
+                           pcie_target,
+                           pcie_force ? "re-doing anyway" : "skipping");
+               if (!pcie_force)
+                       return 0;
+       }
+
+       /*
+        * The driver cannot do the transition if it has no access to the
+        * upstream component
+        */
+       if (!parent) {
+               dd_dev_info(dd, "%s: No upstream, Can't do gen3 transition\n",
+                           __func__);
+               return 0;
+       }
+
+       /*
+        * Do the Gen3 transition.  Steps are those of the PCIe Gen3
+        * recipe.
+        */
+
+       /* step 1: pcie link working in gen1/gen2 */
+
+       /* step 2: if either side is not capable of Gen3, done */
+       if (pcie_target == 3 && !dd->link_gen3_capable) {
+               dd_dev_err(dd, "The PCIe link is not Gen3 capable\n");
+               ret = -ENOSYS;
+               goto done_no_mutex;
+       }
+
+       /* hold the SBus resource across the firmware download and SBR */
+       ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT);
+       if (ret) {
+               dd_dev_err(dd, "%s: unable to acquire SBus resource\n",
+                          __func__);
+               return ret;
+       }
+
+       /* make sure thermal polling is not causing interrupts */
+       therm = read_csr(dd, ASIC_CFG_THERM_POLL_EN);
+       if (therm) {
+               write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x0);
+               msleep(100);
+               dd_dev_info(dd, "%s: Disabled therm polling\n",
+                           __func__);
+       }
+
+retry:
+       /* the SBus download will reset the spico for thermal */
+
+       /* step 3: download SBus Master firmware */
+       /* step 4: download PCIe Gen3 SerDes firmware */
+       dd_dev_info(dd, "%s: downloading firmware\n", __func__);
+       ret = load_pcie_firmware(dd);
+       if (ret) {
+               /* do not proceed if the firmware cannot be downloaded */
+               return_error = 1;
+               goto done;
+       }
+
+       /* step 5: set up device parameter settings */
+       dd_dev_info(dd, "%s: setting PCIe registers\n", __func__);
+
+       /*
+        * PcieCfgSpcie1 - Link Control 3
+        * Leave at reset value.  No need to set PerfEq - link equalization
+        * will be performed automatically after the SBR when the target
+        * speed is 8GT/s.
+        */
+
+       /* clear all 16 per-lane error bits (PCIe: Lane Error Status) */
+       pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, 0xffff);
+
+       /* step 5a: Set Synopsys Port Logic registers */
+
+       /*
+        * PcieCfgRegPl2 - Port Force Link
+        *
+        * Set the low power field to 0x10 to avoid unnecessary power
+        * management messages.  All other fields are zero.
+        */
+       reg32 = 0x10ul << PCIE_CFG_REG_PL2_LOW_PWR_ENT_CNT_SHIFT;
+       pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL2, reg32);
+
+       /*
+        * PcieCfgRegPl100 - Gen3 Control
+        *
+        * turn off PcieCfgRegPl100.Gen3ZRxDcNonCompl
+        * turn on PcieCfgRegPl100.EqEieosCnt
+        * Everything else zero.
+        */
+       reg32 = PCIE_CFG_REG_PL100_EQ_EIEOS_CNT_SMASK;
+       pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL100, reg32);
+
+       /*
+        * PcieCfgRegPl101 - Gen3 EQ FS and LF
+        * PcieCfgRegPl102 - Gen3 EQ Presets to Coefficients Mapping
+        * PcieCfgRegPl103 - Gen3 EQ Preset Index
+        * PcieCfgRegPl105 - Gen3 EQ Status
+        *
+        * Give initial EQ settings.
+        */
+       if (dd->pcidev->device == PCI_DEVICE_ID_INTEL0) { /* discrete */
+               /* 1000mV, FS=24, LF = 8 */
+               fs = 24;
+               lf = 8;
+               div = 3;
+               eq = discrete_preliminary_eq;
+               default_pset = DEFAULT_DISCRETE_PSET;
+       } else {
+               /* 400mV, FS=29, LF = 9 */
+               fs = 29;
+               lf = 9;
+               div = 1;
+               eq = integrated_preliminary_eq;
+               default_pset = DEFAULT_MCP_PSET;
+       }
+       pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL101,
+                              (fs <<
+                               PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT) |
+                              (lf <<
+                               PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT));
+       ret = load_eq_table(dd, eq, fs, div);
+       if (ret)
+               goto done;
+
+       /*
+        * PcieCfgRegPl106 - Gen3 EQ Control
+        *
+        * Set Gen3EqPsetReqVec, leave other fields 0.
+        */
+       if (pcie_pset == UNSET_PSET)
+               pcie_pset = default_pset;
+       if (pcie_pset > 10) {   /* valid range is 0-10, inclusive */
+               dd_dev_err(dd, "%s: Invalid Eq Pset %u, setting to %d\n",
+                          __func__, pcie_pset, default_pset);
+               pcie_pset = default_pset;
+       }
+       dd_dev_info(dd, "%s: using EQ Pset %u\n", __func__, pcie_pset);
+       pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL106,
+                              ((1 << pcie_pset) <<
+                       PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT) |
+                       PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK |
+                       PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK);
+
+       /*
+        * step 5b: Do post firmware download steps via SBus
+        */
+       dd_dev_info(dd, "%s: doing pcie post steps\n", __func__);
+       pcie_post_steps(dd);
+
+       /*
+        * step 5c: Program gasket interrupts
+        */
+       /* set the Rx Bit Rate to REFCLK ratio */
+       write_gasket_interrupt(dd, 0, 0x0006, 0x0050);
+       /* disable pCal for PCIe Gen3 RX equalization */
+       write_gasket_interrupt(dd, 1, 0x0026, 0x5b01);
+       /*
+        * Enable iCal for PCIe Gen3 RX equalization, and set which
+        * evaluation of RX_EQ_EVAL will launch the iCal procedure.
+        */
+       write_gasket_interrupt(dd, 2, 0x0026, 0x5202);
+       /* terminate list */
+       write_gasket_interrupt(dd, 3, 0x0000, 0x0000);
+
+       /*
+        * step 5d: program XMT margin
+        */
+       write_xmt_margin(dd, __func__);
+
+       /*
+        * step 5e: disable active state power management (ASPM). It
+        * will be enabled if required later
+        */
+       dd_dev_info(dd, "%s: clearing ASPM\n", __func__);
+       aspm_hw_disable_l1(dd);
+
+       /*
+        * step 5f: clear DirectSpeedChange
+        * PcieCfgRegPl67.DirectSpeedChange must be zero to prevent the
+        * change in the speed target from starting before we are ready.
+        * This field defaults to 0 and we are not changing it, so nothing
+        * needs to be done.
+        */
+
+       /* step 5g: Set target link speed */
+       /*
+        * Set target link speed to be target on both device and parent.
+        * On setting the parent: Some system BIOSs "helpfully" set the
+        * parent target speed to Gen2 to match the ASIC's initial speed.
+        * We can set the target Gen3 because we have already checked
+        * that it is Gen3 capable earlier.
+        */
+       dd_dev_info(dd, "%s: setting parent target link speed\n", __func__);
+       pcie_capability_read_word(parent, PCI_EXP_LNKCTL2, &lnkctl2);
+       dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__,
+                   (u32)lnkctl2);
+       /* only write to parent if target is not as high as ours */
+       if ((lnkctl2 & LNKCTL2_TARGET_LINK_SPEED_MASK) < target_vector) {
+               lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK;
+               lnkctl2 |= target_vector;
+               dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__,
+                           (u32)lnkctl2);
+               pcie_capability_write_word(parent, PCI_EXP_LNKCTL2, lnkctl2);
+       } else {
+               dd_dev_info(dd, "%s: ..target speed is OK\n", __func__);
+       }
+
+       dd_dev_info(dd, "%s: setting target link speed\n", __func__);
+       pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL2, &lnkctl2);
+       dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__,
+                   (u32)lnkctl2);
+       lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK;
+       lnkctl2 |= target_vector;
+       dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__,
+                   (u32)lnkctl2);
+       pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL2, lnkctl2);
+
+       /* step 5h: arm gasket logic */
+       /* hold DC in reset across the SBR */
+       write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK);
+       (void)read_csr(dd, CCE_DC_CTRL); /* DC reset hold */
+       /* save firmware control across the SBR */
+       fw_ctrl = read_csr(dd, MISC_CFG_FW_CTRL);
+
+       dd_dev_info(dd, "%s: arming gasket logic\n", __func__);
+       arm_gasket_logic(dd);
+
+       /*
+        * step 6: quiesce PCIe link
+        * The chip has already been reset, so there will be no traffic
+        * from the chip.  Linux has no easy way to enforce that it will
+        * not try to access the device, so we just need to hope it doesn't
+        * do it while we are doing the reset.
+        */
+
+       /*
+        * step 7: initiate the secondary bus reset (SBR)
+        * step 8: hardware brings the links back up
+        * step 9: wait for link speed transition to be complete
+        */
+       dd_dev_info(dd, "%s: calling trigger_sbr\n", __func__);
+       ret = trigger_sbr(dd);
+       if (ret)
+               goto done;
+
+       /* step 10: decide what to do next */
+
+       /* check if we can read PCI space */
+       ret = pci_read_config_word(dd->pcidev, PCI_VENDOR_ID, &vendor);
+       if (ret) {
+               dd_dev_info(dd,
+                           "%s: read of VendorID failed after SBR, err %d\n",
+                           __func__, ret);
+               return_error = 1;
+               goto done;
+       }
+       if (vendor == 0xffff) {
+               dd_dev_info(dd, "%s: VendorID is all 1s after SBR\n", __func__);
+               return_error = 1;
+               ret = -EIO;
+               goto done;
+       }
+
+       /* restore PCI space registers we know were reset */
+       dd_dev_info(dd, "%s: calling restore_pci_variables\n", __func__);
+       restore_pci_variables(dd);
+       /* restore firmware control */
+       write_csr(dd, MISC_CFG_FW_CTRL, fw_ctrl);
+
+       /*
+        * Check the gasket block status.
+        *
+        * This is the first CSR read after the SBR.  If the read returns
+        * all 1s (fails), the link did not make it back.
+        *
+        * Once we're sure we can read and write, clear the DC reset after
+        * the SBR.  Then check for any per-lane errors. Then look over
+        * the status.
+        */
+       reg = read_csr(dd, ASIC_PCIE_SD_HOST_STATUS);
+       dd_dev_info(dd, "%s: gasket block status: 0x%llx\n", __func__, reg);
+       if (reg == ~0ull) {     /* PCIe read failed/timeout */
+               dd_dev_err(dd, "SBR failed - unable to read from device\n");
+               return_error = 1;
+               ret = -ENOSYS;
+               goto done;
+       }
+
+       /* clear the DC reset */
+       write_csr(dd, CCE_DC_CTRL, 0);
+
+       /* Set the LED off */
+       setextled(dd, 0);
+
+       /* check for any per-lane errors */
+       pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, &reg32);
+       dd_dev_info(dd, "%s: per-lane errors: 0x%x\n", __func__, reg32);
+
+       /* extract status, look for our HFI */
+       status = (reg >> ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_SHIFT)
+                       & ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_MASK;
+       if ((status & (1 << dd->hfi1_id)) == 0) {
+               dd_dev_err(dd,
+                          "%s: gasket status 0x%x, expecting 0x%x\n",
+                          __func__, status, 1 << dd->hfi1_id);
+               ret = -EIO;
+               goto done;
+       }
+
+       /* extract error */
+       err = (reg >> ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_SHIFT)
+               & ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_MASK;
+       if (err) {
+               dd_dev_err(dd, "%s: gasket error %d\n", __func__, err);
+               ret = -EIO;
+               goto done;
+       }
+
+       /* update our link information cache */
+       update_lbus_info(dd);
+       dd_dev_info(dd, "%s: new speed and width: %s\n", __func__,
+                   dd->lbus_info);
+
+       if (dd->lbus_speed != target_speed) { /* not target */
+               /* maybe retry */
+               do_retry = retry_count < pcie_retry;
+               dd_dev_err(dd, "PCIe link speed did not switch to Gen%d%s\n",
+                          pcie_target, do_retry ? ", retrying" : "");
+               retry_count++;
+               if (do_retry) {
+                       msleep(100); /* allow time to settle */
+                       goto retry;
+               }
+               ret = -EIO;
+       }
+
+done:
+       if (therm) {
+               write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1);
+               msleep(100);
+               dd_dev_info(dd, "%s: Re-enable therm polling\n",
+                           __func__);
+       }
+       release_chip_resource(dd, CR_SBUS);
+done_no_mutex:
+       /* return no error if it is OK to be at current speed */
+       if (ret && !return_error) {
+               dd_dev_err(dd, "Proceeding at current speed PCIe speed\n");
+               ret = 0;
+       }
+
+       dd_dev_info(dd, "%s: done\n", __func__);
+       return ret;
+}
diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c
new file mode 100644 (file)
index 0000000..d5edb1a
--- /dev/null
@@ -0,0 +1,2072 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/delay.h>
+#include "hfi.h"
+#include "qp.h"
+#include "trace.h"
+
+#define SC_CTXT_PACKET_EGRESS_TIMEOUT 350 /* in chip cycles */
+
+#define SC(name) SEND_CTXT_##name /* shorthand: SC(X) expands to SEND_CTXT_X */
+/*
+ * Send Context functions
+ *
+ * Forward declaration of a file-local helper.
+ */
+static void sc_wait_for_packet_egress(struct send_context *sc, int pause);
+
+/*
+ * Assert the CM reset bit in SEND_CTRL on top of the provided sendctrl
+ * value, then poll the register until the bit reads back as clear.
+ * No locking is done here; that is left to the caller.
+ */
+void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl)
+{
+       write_csr(dd, SEND_CTRL, sendctrl | SEND_CTRL_CM_RESET_SMASK);
+
+       /* re-read after each 1us delay until the reset bit drops */
+       do {
+               udelay(1);
+               sendctrl = read_csr(dd, SEND_CTRL);
+       } while (sendctrl & SEND_CTRL_CM_RESET_SMASK);
+}
+
+/*
+ * Fallback definitions of the SEND_CTRL unsupported-VL field, used only
+ * when they are not already defined (they are defined in header release
+ * 48 and higher).  SMASK is the field mask shifted into position.
+ */
+#ifndef SEND_CTRL_UNSUPPORTED_VL_SHIFT
+#define SEND_CTRL_UNSUPPORTED_VL_SHIFT 3
+#define SEND_CTRL_UNSUPPORTED_VL_MASK 0xffull
+#define SEND_CTRL_UNSUPPORTED_VL_SMASK (SEND_CTRL_UNSUPPORTED_VL_MASK \
+               << SEND_CTRL_UNSUPPORTED_VL_SHIFT)
+#endif
+
+/*
+ * Global control of PIO send.
+ *
+ * Read SEND_CTRL, apply the change selected by op (one of the PSC_*
+ * values handled below) and write the result back, all while holding
+ * dd->sendctrl_lock with interrupts saved/restored.
+ *
+ * PSC_CM_RESET is handed to __cm_reset(), which writes the register
+ * itself, so no write-back is done for it.  An unrecognized op is
+ * logged; SEND_CTRL is then written back with the value just read.
+ */
+void pio_send_control(struct hfi1_devdata *dd, int op)
+{
+       u64 reg, mask;
+       unsigned long flags;
+       int write = 1;  /* write sendctrl back */
+       int flush = 0;  /* re-read sendctrl to make sure it is flushed */
+
+       spin_lock_irqsave(&dd->sendctrl_lock, flags);
+
+       reg = read_csr(dd, SEND_CTRL);
+       switch (op) {
+       case PSC_GLOBAL_ENABLE:
+               reg |= SEND_CTRL_SEND_ENABLE_SMASK;
+       /* Fall through */
+       case PSC_DATA_VL_ENABLE:
+               /*
+                * Disallow sending on VLs not enabled: set the unsupported
+                * bit for every VL numbered num_vls and above, clear it for
+                * the VLs below num_vls.
+                */
+               mask = (((~0ull) << num_vls) & SEND_CTRL_UNSUPPORTED_VL_MASK) <<
+                               SEND_CTRL_UNSUPPORTED_VL_SHIFT;
+               reg = (reg & ~SEND_CTRL_UNSUPPORTED_VL_SMASK) | mask;
+               break;
+       case PSC_GLOBAL_DISABLE:
+               reg &= ~SEND_CTRL_SEND_ENABLE_SMASK;
+               break;
+       case PSC_GLOBAL_VLARB_ENABLE:
+               reg |= SEND_CTRL_VL_ARBITER_ENABLE_SMASK;
+               break;
+       case PSC_GLOBAL_VLARB_DISABLE:
+               reg &= ~SEND_CTRL_VL_ARBITER_ENABLE_SMASK;
+               break;
+       case PSC_CM_RESET:
+               __cm_reset(dd, reg);
+               write = 0; /* CSR already written (and flushed) */
+               break;
+       case PSC_DATA_VL_DISABLE:
+               reg |= SEND_CTRL_UNSUPPORTED_VL_SMASK; /* all VLs unsupported */
+               flush = 1;
+               break;
+       default:
+               dd_dev_err(dd, "%s: invalid control %d\n", __func__, op);
+               break;
+       }
+
+       if (write) {
+               write_csr(dd, SEND_CTRL, reg);
+               if (flush)
+                       (void)read_csr(dd, SEND_CTRL); /* flush write */
+       }
+
+       spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+}
+
+/* number of send context memory pools */
+#define NUM_SC_POOLS 2
+
+/*
+ * Send Context Size (SCS) wildcards: a negative size selects a memory
+ * pool (-1 => pool 0, -2 => pool 1), see wildcard_to_pool()
+ */
+#define SCS_POOL_0 -1
+#define SCS_POOL_1 -2
+
+/*
+ * Send Context Count (SCC) wildcards: a negative count is replaced by a
+ * computed value in init_sc_pools_and_sizes()
+ */
+#define SCC_PER_VL -1
+#define SCC_PER_CPU  -2
+#define SCC_PER_KRCVQ  -3
+
+/* Send Context Size (SCS) constants */
+#define SCS_ACK_CREDITS  32
+#define SCS_VL15_CREDITS 102   /* 3 pkts of 2048B data + 128B header */
+
+#define PIO_THRESHOLD_CEILING 4096 /* cap on piothreshold for kernel sizing */
+
+#define PIO_WAIT_BATCH_SIZE 5
+
+/*
+ * Default send context sizes and counts, indexed by send context type.
+ * Negative entries are the SCS_ (size) and SCC_ (count) wildcards; they
+ * are resolved into dd->sc_sizes[] by init_sc_pools_and_sizes().
+ */
+static struct sc_config_sizes sc_config_sizes[SC_MAX] = {
+       [SC_KERNEL] = { .size  = SCS_POOL_0,    /* even divide, pool 0 */
+                       .count = SCC_PER_VL },  /* INIT_SC_PER_VL per VL */
+       [SC_ACK]    = { .size  = SCS_ACK_CREDITS,
+                       .count = SCC_PER_KRCVQ },
+       [SC_USER]   = { .size  = SCS_POOL_0,    /* even divide, pool 0 */
+                       .count = SCC_PER_CPU }, /* one per CPU */
+       [SC_VL15]   = { .size  = SCS_VL15_CREDITS,
+                       .count = 1 },
+
+};
+
+/*
+ * Send context memory pool configuration.  A pool is described either
+ * by a share of the memory or by an absolute block count; a negative
+ * value marks that field as unused.
+ */
+struct mem_pool_config {
+       int centipercent;       /* % of memory, in 100ths of 1% */
+       int absolute_blocks;    /* absolute block count */
+};
+
+/* default memory pool configuration: 100% in pool 0 */
+static struct mem_pool_config sc_mem_pool_config[NUM_SC_POOLS] = {
+       /* centi%, abs blocks */
+       {  10000,     -1 },             /* pool 0: 100.00%, no block count */
+       {      0,     -1 },             /* pool 1: 0%, no block count */
+};
+
+/* memory pool information, used when calculating final sizes */
+struct mem_pool_info {
+       int centipercent;       /*
+                                * 100th of 1% of memory to use, -1 if blocks
+                                * already set
+                                */
+       int count;              /* count of contexts in the pool */
+       int blocks;             /* number of blocks assigned to the pool */
+       int size;               /* per-context size, in blocks */
+};
+
+/*
+ * Translate a pool wildcard into a pool index.
+ *
+ * Wildcards are the negative numbers -1, -2, ... and correspond to
+ * pools 0, 1, ... respectively.  A non-negative input is not a
+ * wildcard and yields -1.
+ */
+static int wildcard_to_pool(int wc)
+{
+       /* negative => wildcard: -1 maps to 0, -2 maps to 1, and so on */
+       return (wc < 0) ? -wc - 1 : -1;
+}
+
+/*
+ * Printable names of the send context types.  Each entry is keyed by
+ * its SC_* type, as in sc_config_sizes[], so the table does not depend
+ * on the numeric order of those constants.
+ */
+static const char *sc_type_names[SC_MAX] = {
+       [SC_KERNEL] = "kernel",
+       [SC_ACK]    = "ack",
+       [SC_USER]   = "user",
+       [SC_VL15]   = "vl15"
+};
+
+/* Return the printable name of a send context type, "unknown" if invalid. */
+static const char *sc_type_name(int index)
+{
+       if (index >= 0 && index < SC_MAX)
+               return sc_type_names[index];
+       return "unknown";
+}
+
+/*
+ * Read the send context memory pool configuration and send context
+ * size configuration.  Replace any wildcards and come up with final
+ * counts and sizes for the send context types.
+ *
+ * The final per-type counts and sizes are stored in dd->sc_sizes[].
+ *
+ * Return the total number of send contexts on success, or -EINVAL if
+ * the pool or context configuration is inconsistent.
+ */
+int init_sc_pools_and_sizes(struct hfi1_devdata *dd)
+{
+       struct mem_pool_info mem_pool_info[NUM_SC_POOLS] = { { 0 } };
+       int total_blocks = (dd->chip_pio_mem_size / PIO_BLOCK_SIZE) - 1;
+       int total_contexts = 0;
+       int fixed_blocks;
+       int pool_blocks;
+       int used_blocks;
+       int cp_total;           /* centipercent total */
+       int ab_total;           /* absolute block total */
+       int extra;
+       int i;
+
+       /*
+        * When SDMA is enabled, kernel context pio packet size is capped by
+        * "piothreshold". Reduce pio buffer allocation for kernel context by
+        * setting it to a fixed size. The allocation allows 3-deep buffering
+        * of the largest pio packets plus up to 128 bytes header, sufficient
+        * to maintain verbs performance.
+        *
+        * When SDMA is disabled, keep the default pooling allocation.
+        *
+        * Note that this overwrites the file-level default in
+        * sc_config_sizes[SC_KERNEL], not a per-device copy.
+        */
+       if (HFI1_CAP_IS_KSET(SDMA)) {
+               u16 max_pkt_size = (piothreshold < PIO_THRESHOLD_CEILING) ?
+                                        piothreshold : PIO_THRESHOLD_CEILING;
+               sc_config_sizes[SC_KERNEL].size =
+                       3 * (max_pkt_size + 128) / PIO_BLOCK_SIZE;
+       }
+
+       /*
+        * Step 0:
+        *      - copy the centipercents/absolute sizes from the pool config
+        *      - sanity check these values
+        *      - add up centipercents, then later check for full value
+        *      - add up absolute blocks, then later check for over-commit
+        */
+       cp_total = 0;
+       ab_total = 0;
+       for (i = 0; i < NUM_SC_POOLS; i++) {
+               int cp = sc_mem_pool_config[i].centipercent;
+               int ab = sc_mem_pool_config[i].absolute_blocks;
+
+               /*
+                * A negative value is "unused" or "invalid".  Both *can*
+                * be valid, but centipercent wins, so check that first
+                */
+               if (cp >= 0) {                  /* centipercent valid */
+                       cp_total += cp;
+               } else if (ab >= 0) {           /* absolute blocks valid */
+                       ab_total += ab;
+               } else {                        /* neither valid */
+                       dd_dev_err(
+                               dd,
+                               "Send context memory pool %d: both the block count and centipercent are invalid\n",
+                               i);
+                       return -EINVAL;
+               }
+
+               mem_pool_info[i].centipercent = cp;
+               mem_pool_info[i].blocks = ab;
+       }
+
+       /* do not use both % and absolute blocks for different pools */
+       if (cp_total != 0 && ab_total != 0) {
+               dd_dev_err(
+                       dd,
+                       "All send context memory pools must be described as either centipercent or blocks, no mixing between pools\n");
+               return -EINVAL;
+       }
+
+       /* if any percentages are present, they must add up to 100% x 100 */
+       if (cp_total != 0 && cp_total != 10000) {
+               dd_dev_err(
+                       dd,
+                       "Send context memory pool centipercent is %d, expecting 10000\n",
+                       cp_total);
+               return -EINVAL;
+       }
+
+       /* the absolute pool total cannot be more than the mem total */
+       if (ab_total > total_blocks) {
+               dd_dev_err(
+                       dd,
+                       "Send context memory pool absolute block count %d is larger than the memory size %d\n",
+                       ab_total, total_blocks);
+               return -EINVAL;
+       }
+
+       /*
+        * Step 2:
+        *      - copy from the context size config
+        *      - replace context type wildcard counts with real values
+        *      - add up non-memory pool block sizes
+        *      - add up memory pool user counts
+        */
+       fixed_blocks = 0;
+       for (i = 0; i < SC_MAX; i++) {
+               int count = sc_config_sizes[i].count;
+               int size = sc_config_sizes[i].size;
+               int pool;
+
+               /*
+                * Sanity check count: Either a positive value or
+                * one of the expected wildcards is valid.  The positive
+                * value is checked later when we compare against total
+                * memory available.
+                *
+                * SC_ACK and SC_KERNEL do not look at the configured
+                * count at all; their count is always the computed value.
+                * Once resolved, the count is clamped so that the running
+                * total never exceeds dd->chip_send_contexts.
+                */
+               if (i == SC_ACK) {
+                       count = dd->n_krcv_queues;
+               } else if (i == SC_KERNEL) {
+                       count = INIT_SC_PER_VL * num_vls;
+               } else if (count == SCC_PER_CPU) {
+                       count = dd->num_rcv_contexts - dd->n_krcv_queues;
+               } else if (count < 0) {
+                       dd_dev_err(
+                               dd,
+                               "%s send context invalid count wildcard %d\n",
+                               sc_type_name(i), count);
+                       return -EINVAL;
+               }
+               if (total_contexts + count > dd->chip_send_contexts)
+                       count = dd->chip_send_contexts - total_contexts;
+
+               total_contexts += count;
+
+               /*
+                * Sanity check pool: The conversion will return a pool
+                * number or -1 if a fixed (non-negative) value.  The fixed
+                * value is checked later when we compare against
+                * total memory available.
+                */
+               pool = wildcard_to_pool(size);
+               if (pool == -1) {                       /* non-wildcard */
+                       fixed_blocks += size * count;
+               } else if (pool < NUM_SC_POOLS) {       /* valid wildcard */
+                       mem_pool_info[pool].count += count;
+               } else {                                /* invalid wildcard */
+                       dd_dev_err(
+                               dd,
+                               "%s send context invalid pool wildcard %d\n",
+                               sc_type_name(i), size);
+                       return -EINVAL;
+               }
+
+               dd->sc_sizes[i].count = count;
+               dd->sc_sizes[i].size = size;
+       }
+       if (fixed_blocks > total_blocks) {
+               dd_dev_err(
+                       dd,
+                       "Send context fixed block count, %u, larger than total block count %u\n",
+                       fixed_blocks, total_blocks);
+               return -EINVAL;
+       }
+
+       /* step 3: calculate the blocks in the pools, and pool context sizes */
+       pool_blocks = total_blocks - fixed_blocks;
+       if (ab_total > pool_blocks) {
+               dd_dev_err(
+                       dd,
+                       "Send context fixed pool sizes, %u, larger than pool block count %u\n",
+                       ab_total, pool_blocks);
+               return -EINVAL;
+       }
+       /* subtract off the fixed pool blocks */
+       pool_blocks -= ab_total;
+
+       for (i = 0; i < NUM_SC_POOLS; i++) {
+               struct mem_pool_info *pi = &mem_pool_info[i];
+
+               /* % beats absolute blocks */
+               if (pi->centipercent >= 0)
+                       pi->blocks = (pool_blocks * pi->centipercent) / 10000;
+
+               if (pi->blocks == 0 && pi->count != 0) {
+                       dd_dev_err(
+                               dd,
+                               "Send context memory pool %d has %u contexts, but no blocks\n",
+                               i, pi->count);
+                       return -EINVAL;
+               }
+               if (pi->count == 0) {
+                       /* warn about wasted blocks */
+                       if (pi->blocks != 0)
+                               dd_dev_err(
+                                       dd,
+                                       "Send context memory pool %d has %u blocks, but zero contexts\n",
+                                       i, pi->blocks);
+                       pi->size = 0;
+               } else {
+                       pi->size = pi->blocks / pi->count; /* even divide */
+               }
+       }
+
+       /* step 4: fill in the context type sizes from the pool sizes */
+       used_blocks = 0;
+       for (i = 0; i < SC_MAX; i++) {
+               if (dd->sc_sizes[i].size < 0) {
+                       unsigned pool = wildcard_to_pool(dd->sc_sizes[i].size);
+
+                       WARN_ON_ONCE(pool >= NUM_SC_POOLS);
+                       dd->sc_sizes[i].size = mem_pool_info[pool].size;
+               }
+               /* make sure we are not larger than what is allowed by the HW */
+#define PIO_MAX_BLOCKS 1024
+               if (dd->sc_sizes[i].size > PIO_MAX_BLOCKS)
+                       dd->sc_sizes[i].size = PIO_MAX_BLOCKS;
+
+               /* calculate our total usage */
+               used_blocks += dd->sc_sizes[i].size * dd->sc_sizes[i].count;
+       }
+       extra = total_blocks - used_blocks;
+       if (extra != 0)
+               dd_dev_info(dd, "unused send context blocks: %d\n", extra);
+
+       return total_contexts;
+}
+
+/*
+ * Allocate the software send context tables and hand out PIO credits.
+ *
+ * Calls init_credit_return(), allocates dd->hw_to_sw and
+ * dd->send_contexts, marks every hardware context as unmapped
+ * (INVALID_SCI) and then gives each send context its type, base and
+ * credit count from dd->sc_sizes[].
+ *
+ * Return 0 on success, the init_credit_return() error, or -ENOMEM if an
+ * allocation fails.  On -ENOMEM both tables are freed, their pointers
+ * in dd are reset to NULL so that later cleanup cannot free them a
+ * second time, and free_credit_return() is called.
+ */
+int init_send_contexts(struct hfi1_devdata *dd)
+{
+       u16 base;
+       int ret, i, j, context;
+
+       ret = init_credit_return(dd);
+       if (ret)
+               return ret;
+
+       dd->hw_to_sw = kmalloc_array(TXE_NUM_CONTEXTS, sizeof(u8),
+                                       GFP_KERNEL);
+       dd->send_contexts = kcalloc(dd->num_send_contexts,
+                                       sizeof(struct send_context_info),
+                                       GFP_KERNEL);
+       if (!dd->send_contexts || !dd->hw_to_sw) {
+               /* kfree(NULL) is a no-op, so both can be freed blindly */
+               kfree(dd->hw_to_sw);
+               dd->hw_to_sw = NULL;
+               kfree(dd->send_contexts);
+               dd->send_contexts = NULL;
+               free_credit_return(dd);
+               return -ENOMEM;
+       }
+
+       /* hardware context map starts with invalid send context indices */
+       for (i = 0; i < TXE_NUM_CONTEXTS; i++)
+               dd->hw_to_sw[i] = INVALID_SCI;
+
+       /*
+        * All send contexts have their credit sizes.  Allocate credits
+        * for each context one after another from the global space.
+        */
+       context = 0;
+       base = 1;       /* NOTE(review): block 0 is skipped - confirm why */
+       for (i = 0; i < SC_MAX; i++) {
+               struct sc_config_sizes *scs = &dd->sc_sizes[i];
+
+               for (j = 0; j < scs->count; j++) {
+                       struct send_context_info *sci =
+                                               &dd->send_contexts[context];
+                       sci->type = i;
+                       sci->base = base;
+                       sci->credits = scs->size;
+
+                       context++;
+                       base += scs->size;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Claim the first unallocated send context of the requested type.
+ *
+ * On success the software index and the matching hardware context
+ * number are passed back through sw_index and hw_context and 0 is
+ * returned.  If no free context of that type exists, -ENOSPC is
+ * returned and the outputs are left untouched.
+ *
+ * Must be called with dd->sc_lock held.
+ */
+static int sc_hw_alloc(struct hfi1_devdata *dd, int type, u32 *sw_index,
+                      u32 *hw_context)
+{
+       u32 i;
+
+       for (i = 0; i < dd->num_send_contexts; i++) {
+               struct send_context_info *info = &dd->send_contexts[i];
+               u32 hw;
+
+               /* skip entries of another type or already in use */
+               if (info->type != type || info->allocated != 0)
+                       continue;
+
+               info->allocated = 1;
+               /* use a 1:1 mapping, but make them non-equal */
+               hw = dd->chip_send_contexts - i - 1;
+               dd->hw_to_sw[hw] = i;
+               *sw_index = i;
+               *hw_context = hw;
+               return 0; /* success */
+       }
+
+       dd_dev_err(dd, "Unable to locate a free type %d send context\n", type);
+       return -ENOSPC;
+}
+
+/*
+ * Release a send context: clear its allocated flag and reset the
+ * hardware-to-software map entry for hw_context to INVALID_SCI.
+ * Freeing a context that is not marked allocated is logged, but the
+ * release is still carried out.
+ *
+ * Must be called with dd->sc_lock held.
+ */
+static void sc_hw_free(struct hfi1_devdata *dd, u32 sw_index, u32 hw_context)
+{
+       struct send_context_info *info = &dd->send_contexts[sw_index];
+
+       if (!info->allocated) {
+               dd_dev_err(dd, "%s: sw_index %u not allocated? hw_context %u\n",
+                          __func__, sw_index, hw_context);
+       }
+
+       dd->hw_to_sw[hw_context] = INVALID_SCI;
+       info->allocated = 0;
+}
+
+/*
+ * Return the base context of the group that context belongs to, i.e.
+ * context with its low "group" bits cleared.
+ */
+static inline u32 group_context(u32 context, u32 group)
+{
+       u32 low_bits = ((u32)1 << group) - 1;
+
+       return context & ~low_bits;
+}
+
+/*
+ * Return the size of a group, 2^group contexts.  The shift is done on
+ * an unsigned constant so it matches the u32 return type and never
+ * shifts into the sign bit of a signed int.
+ */
+static inline u32 group_size(u32 group)
+{
+       return 1U << group;
+}
+
+/*
+ * Obtain the credit return addresses, kernel virtual and physical, for the
+ * given sc.
+ *
+ * The kernel virtual address is stored in sc->hw_free; the physical
+ * address is passed back through pa.
+ *
+ * To understand this routine:
+ * o va and pa are arrays of struct credit_return.  One for each physical
+ *   send context, per NUMA.
+ * o Each send context always looks in its relative location in a struct
+ *   credit_return for its credit return.
+ * o Each send context in a group must have its return address CSR programmed
+ *   with the same value.  Use the address of the first send context in the
+ *   group.
+ *
+ * The physical address is formed by treating the pa base as an array of
+ * struct credit_return and taking the address of element gc; only the
+ * offset is computed, nothing is dereferenced through it.
+ */
+static void cr_group_addresses(struct send_context *sc, dma_addr_t *pa)
+{
+       u32 gc = group_context(sc->hw_context, sc->group);
+       u32 index = sc->hw_context & 0x7; /* low 3 bits pick the cr[] slot */
+
+       sc->hw_free = &sc->dd->cr_base[sc->node].va[gc].cr[index];
+       *pa = (unsigned long)
+              &((struct credit_return *)sc->dd->cr_base[sc->node].pa)[gc];
+}
+
+/*
+ * Work queue handler, triggered from the error interrupt routine for
+ * kernel contexts: restart the send context that embeds this work item.
+ */
+static void sc_halted(struct work_struct *work)
+{
+       struct send_context *sc = container_of(work, struct send_context,
+                                              halt_work);
+
+       sc_restart(sc);
+}
+
+/*
+ * Compute the PIO block threshold of this send context for a given MTU.
+ * A credit return is to be triggered when one MTU plus the optional
+ * header worth of credits remain.
+ *
+ * Parameter mtu is in bytes.
+ * Parameter hdrqentsize is in DWORDs.
+ *
+ * Return value is what to write into the CSR: trigger return when
+ * unreturned credits pass this count.  If the context has no more
+ * credits than one such packet needs, the threshold is 1.
+ */
+u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize)
+{
+       /* header DWORDs -> bytes, then round the packet up to whole blocks */
+       u32 pkt_bytes = mtu + (hdrqentsize << 2);
+       u32 pkt_credits = DIV_ROUND_UP(pkt_bytes, PIO_BLOCK_SIZE);
+
+       if (sc->credits > pkt_credits)
+               return sc->credits - pkt_credits;
+
+       /* the packet needs at least all of this context's credits */
+       return 1;
+}
+
+/*
+ * Express a credit return threshold as a percentage of the credits
+ * allocated to the context (integer arithmetic, truncating division).
+ *
+ * Returns the value to write into the CSR: trigger a return when
+ * unreturned credits pass this count.
+ */
+u32 sc_percent_to_threshold(struct send_context *sc, u32 percent)
+{
+       return sc->credits * percent / 100;
+}
+
+/*
+ * Set the credit return threshold.  If the value actually changes, the
+ * cached control word and the CSR are updated under credit_ctrl_lock and
+ * a credit return is forced once the lock is dropped.
+ */
+void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold)
+{
+       unsigned long irq_state;
+       u32 cur_threshold;
+       bool changed;
+
+       spin_lock_irqsave(&sc->credit_ctrl_lock, irq_state);
+
+       /* extract the threshold field from the cached control value */
+       cur_threshold = (sc->credit_ctrl >> SC(CREDIT_CTRL_THRESHOLD_SHIFT)) &
+                       SC(CREDIT_CTRL_THRESHOLD_MASK);
+       changed = new_threshold != cur_threshold;
+
+       if (changed) {
+               /* replace only the threshold field, keep the other bits */
+               sc->credit_ctrl =
+                       (sc->credit_ctrl & ~SC(CREDIT_CTRL_THRESHOLD_SMASK)) |
+                       ((new_threshold & SC(CREDIT_CTRL_THRESHOLD_MASK)) <<
+                        SC(CREDIT_CTRL_THRESHOLD_SHIFT));
+               write_kctxt_csr(sc->dd, sc->hw_context,
+                               SC(CREDIT_CTRL), sc->credit_ctrl);
+       }
+
+       spin_unlock_irqrestore(&sc->credit_ctrl_lock, irq_state);
+
+       /* force a credit return on change to avoid a possible stall */
+       if (changed)
+               sc_return_credits(sc);
+}
+
+/*
+ * set_pio_integrity
+ *
+ * Set the CHECK_ENABLE register for 'sc': default mask for its type, or 0.
+ */
+void set_pio_integrity(struct send_context *sc)
+{
+       struct hfi1_devdata *dd = sc->dd;
+       u64 checks;
+
+       /*
+        * Checks are dropped when HFI1_CAP_NO_INTEGRITY is set, or when
+        * the device is in snoop mode.
+        */
+       if (unlikely(HFI1_CAP_IS_KSET(NO_INTEGRITY)) ||
+           dd->hfi1_snoop.mode_flag == HFI1_PORT_SNOOP_MODE)
+               checks = 0;
+       else
+               checks = hfi1_pkt_default_send_ctxt_mask(dd, sc->type);
+
+       write_kctxt_csr(dd, sc->hw_context, SC(CHECK_ENABLE), checks);
+}
+
+static u32 get_buffers_allocated(struct send_context *sc)
+{
+       u32 total = 0;  /* sum of the counters of every possible CPU */
+       int c;
+
+       for_each_possible_cpu(c)
+               total = total + *per_cpu_ptr(sc->buffers_allocated, c);
+       return total;
+}
+
+static void reset_buffers_allocated(struct send_context *sc)
+{
+       int c;  /* CPU whose counter is being cleared */
+
+       for_each_possible_cpu(c)
+               *per_cpu_ptr(sc->buffers_allocated, c) = 0;
+}
+
+/*
+ * Allocate a NUMA relative send context structure of the given type along
+ * with a HW context.  Returns NULL if frozen or if any allocation fails.
+ */
+struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
+                             uint hdrqentsize, int numa)
+{
+       struct send_context_info *sci;
+       struct send_context *sc = NULL;
+       dma_addr_t pa;
+       unsigned long flags;
+       u64 reg;
+       u32 thresh;
+       u32 sw_index;
+       u32 hw_context;
+       int ret;
+       u8 opval, opmask;
+
+       /* do not allocate while frozen */
+       if (dd->flags & HFI1_FROZEN)
+               return NULL;
+
+       sc = kzalloc_node(sizeof(*sc), GFP_KERNEL, numa);
+       if (!sc)
+               return NULL;
+
+       sc->buffers_allocated = alloc_percpu(u32);
+       if (!sc->buffers_allocated) {
+               kfree(sc);
+               dd_dev_err(dd,
+                          "Cannot allocate buffers_allocated per cpu counters\n"
+                         );
+               return NULL;
+       }
+
+       spin_lock_irqsave(&dd->sc_lock, flags);
+       ret = sc_hw_alloc(dd, type, &sw_index, &hw_context);
+       if (ret) {
+               spin_unlock_irqrestore(&dd->sc_lock, flags);
+               free_percpu(sc->buffers_allocated);
+               kfree(sc);
+               return NULL;
+       }
+
+       sci = &dd->send_contexts[sw_index];
+       sci->sc = sc;
+
+       sc->dd = dd;
+       sc->node = numa;
+       sc->type = type;
+       spin_lock_init(&sc->alloc_lock);
+       spin_lock_init(&sc->release_lock);
+       spin_lock_init(&sc->credit_ctrl_lock);
+       INIT_LIST_HEAD(&sc->piowait);
+       INIT_WORK(&sc->halt_work, sc_halted);
+       init_waitqueue_head(&sc->halt_wait);
+
+       /* grouping is always single context for now */
+       sc->group = 0;
+
+       sc->sw_index = sw_index;
+       sc->hw_context = hw_context;
+       cr_group_addresses(sc, &pa);    /* sets sc->hw_free and pa */
+       sc->credits = sci->credits;
+
+/* PIO Send Memory Address details */
+#define PIO_ADDR_CONTEXT_MASK 0xfful
+#define PIO_ADDR_CONTEXT_SHIFT 16
+       sc->base_addr = dd->piobase + ((hw_context & PIO_ADDR_CONTEXT_MASK)
+                                       << PIO_ADDR_CONTEXT_SHIFT);
+
+       /* set base and credits */
+       reg = ((sci->credits & SC(CTRL_CTXT_DEPTH_MASK))
+                                       << SC(CTRL_CTXT_DEPTH_SHIFT))
+               | ((sci->base & SC(CTRL_CTXT_BASE_MASK))
+                                       << SC(CTRL_CTXT_BASE_SHIFT));
+       write_kctxt_csr(dd, hw_context, SC(CTRL), reg);
+
+       set_pio_integrity(sc);
+
+       /* unmask all errors */
+       write_kctxt_csr(dd, hw_context, SC(ERR_MASK), (u64)-1);
+
+       /* set the default partition key */
+       write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY),
+                       (SC(CHECK_PARTITION_KEY_VALUE_MASK) &
+                        DEFAULT_PKEY) <<
+                       SC(CHECK_PARTITION_KEY_VALUE_SHIFT));
+
+       /* per context type checks */
+       if (type == SC_USER) {
+               opval = USER_OPCODE_CHECK_VAL;
+               opmask = USER_OPCODE_CHECK_MASK;
+       } else {
+               opval = OPCODE_CHECK_VAL_DISABLED;
+               opmask = OPCODE_CHECK_MASK_DISABLED;
+       }
+
+       /* set the send context check opcode mask and value */
+       write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE),
+                       ((u64)opmask << SC(CHECK_OPCODE_MASK_SHIFT)) |
+                       ((u64)opval << SC(CHECK_OPCODE_VALUE_SHIFT)));
+
+       /* set up credit return */
+       reg = pa & SC(CREDIT_RETURN_ADDR_ADDRESS_SMASK);
+       write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), reg);
+
+       /*
+        * Calculate the initial credit return threshold.
+        *
+        * For Ack contexts, set a threshold for half the credits.
+        * For User contexts use the given percentage.  This has been
+        * sanitized on driver start-up.
+        * For Kernel contexts, use the default MTU plus a header
+        * or half the credits, whichever is smaller. This should
+        * work for both the 3-deep buffering allocation and the
+        * pooling allocation.
+        */
+       if (type == SC_ACK) {
+               thresh = sc_percent_to_threshold(sc, 50);
+       } else if (type == SC_USER) {
+               thresh = sc_percent_to_threshold(sc,
+                                                user_credit_return_threshold);
+       } else { /* kernel */
+               thresh = min(sc_percent_to_threshold(sc, 50),
+                            sc_mtu_to_threshold(sc, hfi1_max_mtu,
+                                                hdrqentsize));
+       }
+       reg = thresh << SC(CREDIT_CTRL_THRESHOLD_SHIFT);
+       /* add in early return */
+       if (type == SC_USER && HFI1_CAP_IS_USET(EARLY_CREDIT_RETURN))
+               reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK);
+       else if (HFI1_CAP_IS_KSET(EARLY_CREDIT_RETURN)) /* kernel, ack */
+               reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK);
+
+       /* set up write-through credit_ctrl: cached copy plus the CSR itself */
+       sc->credit_ctrl = reg;
+       write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), reg);
+
+       /* User send contexts should not allow sending on VL15 */
+       if (type == SC_USER) {
+               reg = 1ULL << 15;
+               write_kctxt_csr(dd, hw_context, SC(CHECK_VL), reg);
+       }
+
+       spin_unlock_irqrestore(&dd->sc_lock, flags);
+
+       /*
+        * Allocate shadow ring to track outstanding PIO buffers _after_
+        * unlocking.  We don't know the size until the lock is held and
+        * we can't allocate while the lock is held.  No one is using
+        * the context yet, so allocate it now.
+        *
+        * User contexts do not get a shadow ring.
+        */
+       if (type != SC_USER) {
+               /*
+                * Size the shadow ring 1 larger than the number of credits
+                * so head == tail can mean empty.
+                */
+               sc->sr_size = sci->credits + 1;
+               sc->sr = kzalloc_node(sizeof(union pio_shadow_ring) *
+                               sc->sr_size, GFP_KERNEL, numa);
+               if (!sc->sr) {
+                       sc_free(sc);    /* releases the HW context, frees sc */
+                       return NULL;
+               }
+       }
+
+       hfi1_cdbg(PIO,
+                 "Send context %u(%u) %s group %u credits %u credit_ctrl 0x%llx threshold %u\n",
+                 sw_index,
+                 hw_context,
+                 sc_type_name(type),
+                 sc->group,
+                 sc->credits,
+                 sc->credit_ctrl,
+                 thresh);
+
+       return sc;
+}
+
+/* free a send context: disable it, release its HW context, free memory */
+void sc_free(struct send_context *sc)
+{
+       struct hfi1_devdata *dd;
+       unsigned long flags;
+       u32 sw_index;
+       u32 hw_context;
+
+       if (!sc)
+               return;
+
+       sc->flags |= SCF_IN_FREE;       /* ensure no restarts */
+       dd = sc->dd;
+       if (!list_empty(&sc->piowait))
+               dd_dev_err(dd, "piowait list not empty!\n");
+       sw_index = sc->sw_index;
+       hw_context = sc->hw_context;
+       sc_disable(sc); /* make sure the HW is disabled */
+       flush_work(&sc->halt_work);     /* wait out a pending sc_halted() */
+
+       spin_lock_irqsave(&dd->sc_lock, flags);
+       dd->send_contexts[sw_index].sc = NULL;
+
+       /* clear/disable all registers set in sc_alloc */
+       write_kctxt_csr(dd, hw_context, SC(CTRL), 0);
+       write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), 0);
+       write_kctxt_csr(dd, hw_context, SC(ERR_MASK), 0);
+       write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY), 0);
+       write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE), 0);
+       write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), 0);
+       write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), 0);
+
+       /* release the index and context for re-use */
+       sc_hw_free(dd, sw_index, hw_context);
+       spin_unlock_irqrestore(&dd->sc_lock, flags);
+
+       kfree(sc->sr);  /* NULL when no shadow ring was allocated */
+       free_percpu(sc->buffers_allocated);
+       kfree(sc);
+}
+
+/* disable the context and complete (via callback) all pending buffers */
+void sc_disable(struct send_context *sc)
+{
+       u64 reg;
+       unsigned long flags;
+       struct pio_buf *pbuf;
+
+       if (!sc)
+               return;
+
+       /* do all steps, even if already disabled */
+       spin_lock_irqsave(&sc->alloc_lock, flags);
+       reg = read_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL));
+       reg &= ~SC(CTRL_CTXT_ENABLE_SMASK);
+       sc->flags &= ~SCF_ENABLED;      /* sc_buffer_alloc() now returns NULL */
+       sc_wait_for_packet_egress(sc, 1);
+       write_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL), reg);
+       spin_unlock_irqrestore(&sc->alloc_lock, flags);
+
+       /*
+        * Flush any waiters.  Once the context is disabled,
+        * credit return interrupts are stopped (although there
+        * could be one in-process when the context is disabled).
+        * Wait one microsecond for any lingering interrupts, then
+        * proceed with the flush.
+        */
+       udelay(1);
+       spin_lock_irqsave(&sc->release_lock, flags);
+       if (sc->sr) {   /* this context has a shadow ring */
+               while (sc->sr_tail != sc->sr_head) {
+                       pbuf = &sc->sr[sc->sr_tail].pbuf;
+                       if (pbuf->cb)
+                               (*pbuf->cb)(pbuf->arg, PRC_SC_DISABLE);
+                       sc->sr_tail++;
+                       if (sc->sr_tail >= sc->sr_size)
+                               sc->sr_tail = 0;        /* wrap around */
+               }
+       }
+       spin_unlock_irqrestore(&sc->release_lock, flags);
+}
+
+/* extract SendEgressCtxtStatus.PacketOccupancy from status value r */
+#define packet_occupancy(r) \
+       (((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK)\
+       >> SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT)
+
+/* is egress halted on the context? */
+#define egress_halted(r) \
+       ((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK)
+
+/* wait for packets to egress (or a halt); if pause, wait for credits too */
+static void sc_wait_for_packet_egress(struct send_context *sc, int pause)
+{
+       struct hfi1_devdata *dd = sc->dd;
+       u64 reg = 0;
+       u64 reg_prev;
+       u32 loop = 0;
+
+       while (1) {
+               reg_prev = reg; /* occupancy seen on the previous pass */
+               reg = read_csr(dd, sc->hw_context * 8 +
+                              SEND_EGRESS_CTXT_STATUS);
+               /* done if egress is stopped */
+               if (egress_halted(reg))
+                       break;
+               reg = packet_occupancy(reg);    /* reg now holds the count */
+               if (reg == 0)
+                       break;
+               /* counter is reset if occupancy count changes */
+               if (reg != reg_prev)
+                       loop = 0;
+               if (loop > 500) {       /* too many polls without progress */
+                       /* timed out - bounce the link */
+                       dd_dev_err(dd,
+                                  "%s: context %u(%u) timeout waiting for packets to egress, remaining count %u, bouncing link\n",
+                                  __func__, sc->sw_index,
+                                  sc->hw_context, (u32)reg);
+                       queue_work(dd->pport->hfi1_wq,
+                                  &dd->pport->link_bounce_work);
+                       break;
+               }
+               loop++;
+               udelay(1);
+       }
+
+       if (pause)
+               /* Add additional delay to ensure chip returns all credits */
+               pause_for_credit_return(dd);
+}
+
+void sc_wait(struct hfi1_devdata *dd)
+{
+       struct send_context *cur;
+       int idx;
+
+       /* wait for egress (no credit pause) on every populated slot */
+       for (idx = 0; idx < dd->num_send_contexts; idx++) {
+               cur = dd->send_contexts[idx].sc;
+               if (cur)
+                       sc_wait_for_packet_egress(cur, 0);
+       }
+}
+
+/*
+ * Restart a context after it has been halted due to error.
+ *
+ * Returns -EINVAL if not halted or being freed, -ETIME if the halt is never
+ * asserted (step 1), otherwise the result of sc_enable().  A slow step 2
+ * is reported once and the wait continues.  It is expected that
+ * allocations (enabled flag bit) have been shut off already (only applies
+ * to kernel contexts).
+ */
+int sc_restart(struct send_context *sc)
+{
+       struct hfi1_devdata *dd = sc->dd;
+       u64 reg;
+       u32 loop;
+       int count;
+
+       /* bounce off if not halted, or being free'd */
+       if (!(sc->flags & SCF_HALTED) || (sc->flags & SCF_IN_FREE))
+               return -EINVAL;
+
+       dd_dev_info(dd, "restarting send context %u(%u)\n", sc->sw_index,
+                   sc->hw_context);
+
+       /*
+        * Step 1: Wait for the context to actually halt.
+        *
+        * The error interrupt is asynchronous to actually setting halt
+        * on the context.
+        */
+       loop = 0;
+       while (1) {
+               reg = read_kctxt_csr(dd, sc->hw_context, SC(STATUS));
+               if (reg & SC(STATUS_CTXT_HALTED_SMASK))
+                       break;
+               if (loop > 100) {
+                       dd_dev_err(dd, "%s: context %u(%u) not halting, skipping\n",
+                                  __func__, sc->sw_index, sc->hw_context);
+                       return -ETIME;
+               }
+               loop++;
+               udelay(1);
+       }
+
+       /*
+        * Step 2: Ensure no users are still trying to write to PIO.
+        *
+        * For kernel contexts, we have already turned off buffer allocation.
+        * Now wait for the buffer count to go to zero.
+        *
+        * For user contexts, the user handling code has cut off write access
+        * to the context's PIO pages before calling this routine and will
+        * restore write access after this routine returns.
+        */
+       if (sc->type != SC_USER) {
+               /* kernel context */
+               loop = 0;
+               while (1) {
+                       count = get_buffers_allocated(sc);
+                       if (count == 0)
+                               break;
+                       if (loop == 101) { /* complain once, not every 1us */
+                               dd_dev_err(dd,
+                                          "%s: context %u(%u) timeout waiting for PIO buffers to zero, remaining %d\n",
+                                          __func__, sc->sw_index,
+                                          sc->hw_context, count);
+                       }
+                       loop++;
+                       udelay(1);
+               }
+       }
+
+       /*
+        * Step 3: Wait for all packets to egress.
+        * This is done while disabling the send context
+        *
+        * Step 4: Disable the context
+        *
+        * This is a superset of the halt.  After the disable, the
+        * errors can be cleared.
+        */
+       sc_disable(sc);
+
+       /*
+        * Step 5: Enable the context
+        *
+        * This enable will clear the halted flag and per-send context
+        * error flags.
+        */
+       return sc_enable(sc);
+}
+
+/*
+ * Freeze handling for PIO.  Call only once the TXE block is fully frozen:
+ * every frozen kernel-side send context is disabled here.  The freeze has
+ * already stopped those contexts.
+ */
+void pio_freeze(struct hfi1_devdata *dd)
+{
+       int idx;
+
+       for (idx = 0; idx < dd->num_send_contexts; idx++) {
+               struct send_context *cur = dd->send_contexts[idx].sc;
+
+               /*
+                * Only allocated, frozen, non-user contexts are disabled.
+                * User send contexts will be disabled when the process
+                * calls into the driver to reset its context.
+                */
+               if (cur && (cur->flags & SCF_FROZEN) && cur->type != SC_USER) {
+                       /* the context is already stopped by the freeze, */
+                       /* so a disable is all that is needed */
+                       sc_disable(cur);
+               }
+       }
+}
+
+/*
+ * Re-enable the kernel send contexts that were frozen.  Preconditions:
+ * all PIO send contexts have been disabled and the SPC freeze has been
+ * cleared, so enabling is the last remaining step.  User (PSM) contexts
+ * are not touched here; they are processed when PSM calls into the kernel
+ * to acknowledge the freeze.
+ */
+void pio_kernel_unfreeze(struct hfi1_devdata *dd)
+{
+       int idx;
+
+       for (idx = 0; idx < dd->num_send_contexts; idx++) {
+               struct send_context *cur = dd->send_contexts[idx].sc;
+
+               /* skip empty slots, unfrozen contexts and user contexts */
+               if (cur && (cur->flags & SCF_FROZEN) && cur->type != SC_USER) {
+                       sc_enable(cur); /* will clear the sc frozen flag */
+               }
+       }
+}
+
+/*
+ * Wait for the SendPioInitCtxt.PioInitInProgress bit to clear.
+ * Returns:
+ *     0          - init finished without error
+ *     -ETIMEDOUT - if we wait too long
+ *     -EIO       - if there was an error
+ */
+static int pio_init_wait_progress(struct hfi1_devdata *dd)
+{
+       u64 status;
+       int polls, limit;
+
+       /* poll budget: the longest possible HW init time / delay */
+       limit = (dd->icode == ICODE_FPGA_EMULATION) ? 120 : 5;
+       for (polls = 0; ; polls++) {
+               status = read_csr(dd, SEND_PIO_INIT_CTXT);
+               if (!(status & SEND_PIO_INIT_CTXT_PIO_INIT_IN_PROGRESS_SMASK))
+                       break;
+               if (polls >= limit)
+                       return -ETIMEDOUT;
+               udelay(5);
+       }
+
+       return (status & SEND_PIO_INIT_CTXT_PIO_INIT_ERR_SMASK) ? -EIO : 0;
+}
+
+/*
+ * Put every send context back into its power-on state.  Only used during
+ * manual init, so no locking against sc_enable is required.
+ */
+void pio_reset_all(struct hfi1_devdata *dd)
+{
+       const char *problem;
+       int rc;
+
+       /* wait for an idle init engine: ignore timeouts, clear errors */
+       if (pio_init_wait_progress(dd) == -EIO) {
+               write_csr(dd, SEND_PIO_ERR_CLEAR,
+                         SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK);
+       }
+
+       /* request initialization of all contexts, then wait for it */
+       write_csr(dd, SEND_PIO_INIT_CTXT,
+                 SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK);
+       udelay(2);
+       rc = pio_init_wait_progress(dd);
+       if (rc >= 0)
+               return;
+
+       problem = (rc == -ETIMEDOUT) ? "is stuck" : "had an error";
+       dd_dev_err(dd,
+                  "PIO send context init %s while initializing all PIO blocks\n",
+                  problem);
+}
+
+/* enable the context; returns 0, or a negative errno on failure */
+int sc_enable(struct send_context *sc)
+{
+       u64 sc_ctrl, reg, pio;
+       struct hfi1_devdata *dd;
+       unsigned long flags;
+       int ret = 0;
+
+       if (!sc)
+               return -EINVAL;
+       dd = sc->dd;
+
+       /*
+        * Obtain the allocator lock to guard against any allocation
+        * attempts (which should not happen prior to context being
+        * enabled). On the release/disable side we don't need to
+        * worry about locking since the releaser will not do anything
+        * if the context accounting values have not changed.
+        */
+       spin_lock_irqsave(&sc->alloc_lock, flags);
+       sc_ctrl = read_kctxt_csr(dd, sc->hw_context, SC(CTRL));
+       if ((sc_ctrl & SC(CTRL_CTXT_ENABLE_SMASK)))
+               goto unlock; /* already enabled */
+
+       /* IMPORTANT: only clear free and fill if transitioning 0 -> 1 */
+
+       *sc->hw_free = 0;
+       sc->free = 0;
+       sc->alloc_free = 0;
+       sc->fill = 0;
+       sc->sr_head = 0;
+       sc->sr_tail = 0;
+       sc->flags = 0;  /* drops every flag, halted and frozen included */
+       /* the alloc lock ensures no fast path allocation */
+       reset_buffers_allocated(sc);
+
+       /*
+        * Clear all per-context errors.  Some of these will be set when
+        * we are re-enabling after a context halt.  Now that the context
+        * is disabled, the halt will not clear until after the PIO init
+        * engine runs below.
+        */
+       reg = read_kctxt_csr(dd, sc->hw_context, SC(ERR_STATUS));
+       if (reg)
+               write_kctxt_csr(dd, sc->hw_context, SC(ERR_CLEAR), reg);
+
+       /*
+        * The HW PIO initialization engine can handle only one init
+        * request at a time. Serialize access to each device's engine.
+        */
+       spin_lock(&dd->sc_init_lock);
+       /*
+        * Since access to this code block is serialized and
+        * each access waits for the initialization to complete
+        * before releasing the lock, the PIO initialization engine
+        * should not be in use, so we don't have to wait for the
+        * InProgress bit to go down.
+        */
+       pio = ((sc->hw_context & SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_MASK) <<
+              SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_SHIFT) |
+               SEND_PIO_INIT_CTXT_PIO_SINGLE_CTXT_INIT_SMASK;
+       write_csr(dd, SEND_PIO_INIT_CTXT, pio);
+       /*
+        * Wait until the engine is done.  Give the chip the required time
+        * so, hopefully, we read the register just once.
+        */
+       udelay(2);
+       ret = pio_init_wait_progress(dd);
+       spin_unlock(&dd->sc_init_lock);
+       if (ret) {
+               dd_dev_err(dd,
+                          "sctxt%u(%u): Context not enabled due to init failure %d\n",
+                          sc->sw_index, sc->hw_context, ret);
+               goto unlock;
+       }
+
+       /*
+        * All is well. Enable the context.
+        */
+       sc_ctrl |= SC(CTRL_CTXT_ENABLE_SMASK);
+       write_kctxt_csr(dd, sc->hw_context, SC(CTRL), sc_ctrl);
+       /*
+        * Read SendCtxtCtrl to force the write out and prevent a timing
+        * hazard where a PIO write may reach the context before the enable.
+        */
+       read_kctxt_csr(dd, sc->hw_context, SC(CTRL));
+       sc->flags |= SCF_ENABLED;       /* sc_buffer_alloc() may proceed */
+
+unlock:
+       spin_unlock_irqrestore(&sc->alloc_lock, flags);
+
+       return ret;     /* 0 also when the context was already enabled */
+}
+
+/* ask the hardware to return credits for this context right away */
+void sc_return_credits(struct send_context *sc)
+{
+       struct hfi1_devdata *dd;
+
+       if (!sc)
+               return;
+       dd = sc->dd;
+
+       /* the credit return is scheduled by the bit going from 0 to 1 */
+       write_kctxt_csr(dd, sc->hw_context, SC(CREDIT_FORCE),
+                       SC(CREDIT_FORCE_FORCE_RETURN_SMASK));
+       /* read back so the write is flushed and the return is scheduled */
+       read_kctxt_csr(dd, sc->hw_context, SC(CREDIT_FORCE));
+       /* back to 0 so the next call produces a fresh 0 -> 1 transition */
+       write_kctxt_csr(dd, sc->hw_context, SC(CREDIT_FORCE), 0);
+}
+
+/* let in-flight packets on the context drain (NULL sc is a no-op) */
+void sc_flush(struct send_context *sc)
+{
+       /* nothing to drain without a context */
+       if (sc) {
+               sc_wait_for_packet_egress(sc, 1); /* 1: pause for credits */
+       }
+}
+
+/* would drop unsent packets; not implemented, only logs (NULL is a no-op) */
+void sc_drop(struct send_context *sc)
+{
+       /* stub: report which context the caller wanted dropped */
+       if (sc) {
+               dd_dev_info(sc->dd, "%s: context %u(%u) - not implemented\n",
+                           __func__, sc->sw_index, sc->hw_context);
+       }
+}
+
+/*
+ * Start the software reaction to a context halt or SPC freeze:
+ *     - mark the context as halted or frozen (flag is OR-ed into sc->flags)
+ *     - stop buffer allocations
+ *
+ * Called from the error interrupt.  Other work is deferred until
+ * out of the interrupt.
+ */
+void sc_stop(struct send_context *sc, int flag)
+{
+       unsigned long flags;
+
+       /* mark the context - NOTE(review): done before taking alloc_lock */
+       sc->flags |= flag;
+
+       /* stop buffer allocations */
+       spin_lock_irqsave(&sc->alloc_lock, flags);
+       sc->flags &= ~SCF_ENABLED;
+       spin_unlock_irqrestore(&sc->alloc_lock, flags);
+       wake_up(&sc->halt_wait);        /* wake anyone sleeping on halt_wait */
+}
+
+#define BLOCK_DWORDS (PIO_BLOCK_SIZE / sizeof(u32)) /* dwords per block */
+#define dwords_to_blocks(x) DIV_ROUND_UP(x, BLOCK_DWORDS)
+
+/*
+ * The send context buffer "allocator".
+ *
+ * @sc: the PIO send context we are allocating from
+ * @dw_len: length of whole packet - including PBC - in dwords
+ * @cb: optional callback to call when the buffer is finished sending
+ * @arg: argument for cb
+ *
+ * Return a pointer to a PIO buffer, or NULL if disabled or out of room.
+ */
+struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,
+                               pio_release_cb cb, void *arg)
+{
+       struct pio_buf *pbuf = NULL;
+       unsigned long flags;
+       unsigned long avail;
+       unsigned long blocks = dwords_to_blocks(dw_len);
+       unsigned long start_fill;
+       int trycount = 0;
+       u32 head, next;
+
+       spin_lock_irqsave(&sc->alloc_lock, flags);
+       if (!(sc->flags & SCF_ENABLED)) {
+               spin_unlock_irqrestore(&sc->alloc_lock, flags);
+               goto done;
+       }
+
+retry:
+       avail = (unsigned long)sc->credits - (sc->fill - sc->alloc_free);
+       if (blocks > avail) {
+               /* not enough room */
+               if (unlikely(trycount)) { /* already tried to get more room */
+                       spin_unlock_irqrestore(&sc->alloc_lock, flags);
+                       goto done;
+               }
+               /* copy from receiver cache line and recalculate */
+               sc->alloc_free = ACCESS_ONCE(sc->free);
+               avail =
+                       (unsigned long)sc->credits -
+                       (sc->fill - sc->alloc_free);
+               if (blocks > avail) {
+                       /* still no room, actively update (lock dropped) */
+                       spin_unlock_irqrestore(&sc->alloc_lock, flags);
+                       sc_release_update(sc);
+                       spin_lock_irqsave(&sc->alloc_lock, flags);
+                       sc->alloc_free = ACCESS_ONCE(sc->free);
+                       trycount++;
+                       goto retry;
+               }
+       }
+
+       /* there is enough room */
+
+       preempt_disable();      /* NOTE(review): not re-enabled in here */
+       this_cpu_inc(*sc->buffers_allocated);
+
+       /* read this once */
+       head = sc->sr_head;
+
+       /* "allocate" the buffer */
+       start_fill = sc->fill;
+       sc->fill += blocks;
+
+       /*
+        * Fill the parts that the releaser looks at before moving the head.
+        * The only necessary piece is the sent_at field.  The credits
+        * we have just allocated cannot have been returned yet, so the
+        * cb and arg will not be looked at for a "while".  Put them
+        * on this side of the memory barrier anyway.
+        */
+       pbuf = &sc->sr[head].pbuf;
+       pbuf->sent_at = sc->fill;
+       pbuf->cb = cb;
+       pbuf->arg = arg;
+       pbuf->sc = sc;  /* could be filled in at sc->sr init time */
+       /* the smp_wmb() below orders these stores before the head update */
+
+       /* calculate next head index, do not store */
+       next = head + 1;
+       if (next >= sc->sr_size)
+               next = 0;
+       /*
+        * update the head - must be last! - the releaser can look at fields
+        * in pbuf once we move the head
+        */
+       smp_wmb();
+       sc->sr_head = next;
+       spin_unlock_irqrestore(&sc->alloc_lock, flags);
+
+       /* finish filling in the buffer outside the lock */
+       pbuf->start = sc->base_addr + ((start_fill % sc->credits)
+                                                       * PIO_BLOCK_SIZE);
+       pbuf->size = sc->credits * PIO_BLOCK_SIZE;
+       pbuf->end = sc->base_addr + pbuf->size;
+       pbuf->block_count = blocks;
+       pbuf->qw_written = 0;
+       pbuf->carry_bytes = 0;
+       pbuf->carry.val64 = 0;
+done:
+       return pbuf;    /* still NULL on the disabled / no-room paths */
+}
+
+/*
+ * There are at least two entities that can turn on credit return
+ * interrupts and they can overlap.  Avoid problems by implementing
+ * a count scheme that is enforced by a lock.  The lock is needed because
+ * the count and CSR write must be paired.
+ */
+
+/*
+ * Take a reference on credit return interrupts.  The first reference turns
+ * the interrupt on in the CSR; later ones only bump the count.
+ */
+void sc_add_credit_return_intr(struct send_context *sc)
+{
+       unsigned long irq_state;
+
+       /* count and CSR must change together, hence one critical section */
+       spin_lock_irqsave(&sc->credit_ctrl_lock, irq_state);
+       if (!sc->credit_intr_count) {
+               sc->credit_ctrl |= SC(CREDIT_CTRL_CREDIT_INTR_SMASK);
+               write_kctxt_csr(sc->dd, sc->hw_context,
+                               SC(CREDIT_CTRL), sc->credit_ctrl);
+       }
+       sc->credit_intr_count += 1;
+       spin_unlock_irqrestore(&sc->credit_ctrl_lock, irq_state);
+}
+
+/*
+ * Drop a reference on credit return interrupts.  When the last reference
+ * goes away the interrupt is turned off in the CSR.
+ */
+void sc_del_credit_return_intr(struct send_context *sc)
+{
+       unsigned long irq_state;
+
+       /* unbalanced call: warn, the count is still decremented below */
+       WARN_ON(sc->credit_intr_count == 0);
+
+       /* count and CSR must change together, hence one critical section */
+       spin_lock_irqsave(&sc->credit_ctrl_lock, irq_state);
+       if (--sc->credit_intr_count == 0) {
+               sc->credit_ctrl &= ~SC(CREDIT_CTRL_CREDIT_INTR_SMASK);
+               write_kctxt_csr(sc->dd, sc->hw_context,
+                               SC(CREDIT_CTRL), sc->credit_ctrl);
+       }
+       spin_unlock_irqrestore(&sc->credit_ctrl_lock, irq_state);
+}
+
+/*
+ * Request (needint != 0) or drop (needint == 0) a credit return interrupt
+ * reference on the send context.  The caller must be careful: every
+ * needint call must be paired with a !needint call.
+ *
+ * A request is followed by mmiowb() and sc_return_credits().
+ */
+void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint)
+{
+       if (!needint) {
+               sc_del_credit_return_intr(sc);
+               trace_hfi1_wantpiointr(sc, needint, sc->credit_ctrl);
+               return;
+       }
+
+       sc_add_credit_return_intr(sc);
+       trace_hfi1_wantpiointr(sc, needint, sc->credit_ctrl);
+       mmiowb();
+       sc_return_credits(sc);
+}
+
+/**
+ * sc_piobufavail - callback when a PIO buffer is available
+ * @sc: the send context
+ *
+ * This is called from the interrupt handler when a PIO buffer is
+ * available after hfi1_verbs_send() returned an error that no buffers were
+ * available. Disable the interrupt if there are no more QPs waiting.
+ *
+ * At most PIO_WAIT_BATCH_SIZE waiters are taken off @sc->piowait per call.
+ * If waiters remain after a batch, the credit return interrupt is requested
+ * again.  The collected QPs are woken only after the iowait lock is dropped.
+ */
+static void sc_piobufavail(struct send_context *sc)
+{
+       struct hfi1_devdata *dd = sc->dd;
+       struct hfi1_ibdev *dev = &dd->verbs_dev;
+       struct list_head *list;
+       struct rvt_qp *qps[PIO_WAIT_BATCH_SIZE];
+       struct rvt_qp *qp;
+       struct hfi1_qp_priv *priv;
+       unsigned long flags;
+       unsigned i, n = 0;
+
+       /* only kernel and VL15 send contexts are serviced here */
+       if (dd->send_contexts[sc->sw_index].type != SC_KERNEL &&
+           dd->send_contexts[sc->sw_index].type != SC_VL15)
+               return;
+       list = &sc->piowait;
+       /*
+        * Note: checking that the piowait list is empty and clearing
+        * the buffer available interrupt needs to be atomic or we
+        * could end up with QPs on the wait list with the interrupt
+        * disabled.
+        */
+       write_seqlock_irqsave(&dev->iowait_lock, flags);
+       while (!list_empty(list)) {
+               struct iowait *wait;
+
+               /* batch array is full; leave the rest on the list */
+               if (n == ARRAY_SIZE(qps))
+                       break;
+               wait = list_first_entry(list, struct iowait, list);
+               qp = iowait_to_qp(wait);
+               priv = qp->priv;
+               list_del_init(&priv->s_iowait.list);
+               /* refcount held until actual wake up */
+               qps[n++] = qp;
+       }
+       /*
+        * If there had been waiters and there are more
+        * insure that we redo the force to avoid a potential hang.
+        */
+       if (n) {
+               /* drop the interrupt request, then re-request if needed */
+               hfi1_sc_wantpiobuf_intr(sc, 0);
+               if (!list_empty(list))
+                       hfi1_sc_wantpiobuf_intr(sc, 1);
+       }
+       write_sequnlock_irqrestore(&dev->iowait_lock, flags);
+
+       /* wake the dequeued QPs outside the lock */
+       for (i = 0; i < n; i++)
+               hfi1_qp_wakeup(qps[i],
+                              RVT_S_WAIT_PIO | RVT_S_WAIT_PIO_DRAIN);
+}
+
+/*
+ * Translate a send credit update (the raw hardware free word) into a bit
+ * mask of PRC_* reasons.  More than one reason bit may be set.
+ *
+ * NOTE(review): the "due to force" return reason is reported as
+ * PRC_SC_DISABLE although a PRC_FORCE code exists - confirm this is
+ * intentional.
+ */
+static inline int fill_code(u64 hw_free)
+{
+       int reasons = 0;
+
+       reasons |= (hw_free & CR_STATUS_SMASK) ? PRC_STATUS_ERR : 0;
+       reasons |= (hw_free & CR_CREDIT_RETURN_DUE_TO_PBC_SMASK) ?
+                       PRC_PBC : 0;
+       reasons |= (hw_free & CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SMASK) ?
+                       PRC_THRESHOLD : 0;
+       reasons |= (hw_free & CR_CREDIT_RETURN_DUE_TO_ERR_SMASK) ?
+                       PRC_FILL_ERR : 0;
+       reasons |= (hw_free & CR_CREDIT_RETURN_DUE_TO_FORCE_SMASK) ?
+                       PRC_SC_DISABLE : 0;
+       return reasons;
+}
+
+/*
+ * Wrap-safe "a < b" for the free-running credit counters: use the jiffies
+ * compare to get the wrap right.
+ */
+#define sent_before(a, b) time_before(a, b)    /* a < b */
+
+/*
+ * The send context buffer "releaser".
+ *
+ * Reads the hardware free counter for @sc, advances the software free
+ * count by the number of credits returned since the last update, and walks
+ * the shadow ring from tail towards the head snapshot, invoking the release
+ * callback of every buffer whose sent_at point has been reached.  Finishes
+ * by calling sc_piobufavail().  A NULL @sc is ignored.
+ */
+void sc_release_update(struct send_context *sc)
+{
+       struct pio_buf *pbuf;
+       u64 hw_free;
+       u32 head, tail;
+       unsigned long old_free;
+       unsigned long free;
+       unsigned long extra;
+       unsigned long flags;
+       int code;
+
+       if (!sc)
+               return;
+
+       spin_lock_irqsave(&sc->release_lock, flags);
+       /* update free */
+       hw_free = le64_to_cpu(*sc->hw_free);            /* volatile read */
+       old_free = sc->free;
+       /* counter delta since old_free, modulo the counter field width */
+       extra = (((hw_free & CR_COUNTER_SMASK) >> CR_COUNTER_SHIFT)
+                       - (old_free & CR_COUNTER_MASK))
+                               & CR_COUNTER_MASK;
+       free = old_free + extra;
+       trace_hfi1_piofree(sc, extra);
+
+       /* call sent buffer callbacks */
+       code = -1;                              /* code not yet set */
+       head = ACCESS_ONCE(sc->sr_head);        /* snapshot the head */
+       tail = sc->sr_tail;
+       while (head != tail) {
+               pbuf = &sc->sr[tail].pbuf;
+
+               if (sent_before(free, pbuf->sent_at)) {
+                       /* not sent yet */
+                       break;
+               }
+               if (pbuf->cb) {
+                       if (code < 0) /* fill in code on first user */
+                               code = fill_code(hw_free);
+                       (*pbuf->cb)(pbuf->arg, code);
+               }
+
+               /* advance the tail, wrapping at the end of the shadow ring */
+               tail++;
+               if (tail >= sc->sr_size)
+                       tail = 0;
+       }
+       sc->sr_tail = tail;
+       /* make sure tail is updated before free */
+       smp_wmb();
+       sc->free = free;
+       spin_unlock_irqrestore(&sc->release_lock, flags);
+       /* called after release_lock is dropped */
+       sc_piobufavail(sc);
+}
+
+/*
+ * Send context group releaser.  Argument is the send context that caused
+ * the interrupt.  Called from the send context interrupt handler.
+ *
+ * Call release on all contexts in the group.  Group members whose hardware
+ * to software mapping is invalid are reported and skipped.
+ *
+ * This routine takes the sc_lock without an irqsave because it is only
+ * called from an interrupt handler.  Adjust if that changes.
+ */
+void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context)
+{
+       struct send_context *sc;
+       u32 sw_index;
+       u32 gc, gc_end;
+
+       spin_lock(&dd->sc_lock);
+       sw_index = dd->hw_to_sw[hw_context];
+       if (unlikely(sw_index >= dd->num_send_contexts)) {
+               dd_dev_err(dd, "%s: invalid hw (%u) to sw (%u) mapping\n",
+                          __func__, hw_context, sw_index);
+               goto done;
+       }
+       sc = dd->send_contexts[sw_index].sc;
+       if (unlikely(!sc))
+               goto done;
+
+       /* walk every hardware context in the interrupting context's group */
+       gc = group_context(hw_context, sc->group);
+       gc_end = gc + group_size(sc->group);
+       for (; gc < gc_end; gc++) {
+               sw_index = dd->hw_to_sw[gc];
+               if (unlikely(sw_index >= dd->num_send_contexts)) {
+                       /* report the group member that has the bad mapping */
+                       dd_dev_err(dd,
+                                  "%s: invalid hw (%u) to sw (%u) mapping\n",
+                                  __func__, gc, sw_index);
+                       continue;
+               }
+               /* sc_release_update() ignores a NULL send context */
+               sc_release_update(dd->send_contexts[sw_index].sc);
+       }
+done:
+       spin_unlock(&dd->sc_lock);
+}
+
+/*
+ * pio_select_send_context_vl() - select send context
+ * @dd: devdata
+ * @selector: a spreading factor
+ * @vl: this vl
+ *
+ * Returns the send context chosen by @selector within the map entry for
+ * @vl.  Falls back to VL0's send context when @vl is out of range, when no
+ * map is installed, or when the selected map slot is empty.
+ * The mapping fields are protected by RCU.
+ */
+struct send_context *pio_select_send_context_vl(struct hfi1_devdata *dd,
+                                               u32 selector, u8 vl)
+{
+       struct pio_vl_map *vlmap;
+       struct pio_map_elem *elem;
+       struct send_context *picked = NULL;
+
+       /*
+        * NOTE This should only happen if SC->VL changed after the initial
+        * checks on the QP/AH
+        */
+       if (unlikely(vl >= num_vls))
+               return dd->vld[0].sc;
+
+       rcu_read_lock();
+       vlmap = rcu_dereference(dd->pio_map);
+       if (!unlikely(!vlmap)) {
+               elem = vlmap->map[vl & vlmap->mask];
+               picked = elem->ksc[selector & elem->mask];
+       }
+       rcu_read_unlock();
+
+       if (!picked)
+               picked = dd->vld[0].sc;
+       return picked;
+}
+
+/*
+ * pio_select_send_context_sc() - select send context
+ * @dd: devdata
+ * @selector: a spreading factor
+ * @sc5: the 5 bit sc
+ *
+ * Converts @sc5 to a vl with sc_to_vlt() and returns the send context that
+ * pio_select_send_context_vl() picks for that vl and @selector.
+ */
+struct send_context *pio_select_send_context_sc(struct hfi1_devdata *dd,
+                                               u32 selector, u8 sc5)
+{
+       return pio_select_send_context_vl(dd, selector, sc_to_vlt(dd, sc5));
+}
+
+/*
+ * Free the indicated map struct: the per-vl elements for the actual vls,
+ * then the map itself.  A NULL map is accepted.
+ */
+static void pio_map_free(struct pio_vl_map *m)
+{
+       int vl;
+
+       if (m) {
+               for (vl = 0; vl < m->actual_vls; vl++)
+                       kfree(m->map[vl]);
+       }
+       kfree(m);
+}
+
+/*
+ * RCU callback: recover the map that embeds the rcu_head and free it.
+ */
+static void pio_map_rcu_callback(struct rcu_head *list)
+{
+       pio_map_free(container_of(list, struct pio_vl_map, list));
+}
+
+/*
+ * pio_map_init - called when #vls change
+ * @dd: hfi1_devdata
+ * @port: port number (not used by this routine)
+ * @num_vls: number of vls
+ * @vl_scontexts: per vl send context mapping (optional)
+ *
+ * This routine changes the mapping based on the number of vls.
+ *
+ * vl_scontexts is used to specify a non-uniform vl/send context
+ * loading. NULL implies auto computing the loading and giving each
+ * VL a uniform distribution of send contexts per VL.
+ *
+ * The auto algorithm computes the sc_per_vl and the number of extra
+ * send contexts. Any extra send contexts are added from the last VL
+ * on down
+ *
+ * rcu locking is used here to control access to the mapping fields.
+ *
+ * If either the num_vls or num_send_contexts are non-power of 2, the
+ * array sizes in the struct pio_vl_map and the struct pio_map_elem are
+ * rounded up to the next highest power of 2 and the first entry is
+ * reused in a round robin fashion.
+ *
+ * If an error occurs the map change is not done and the mapping is not
+ * changed.
+ *
+ * Return: 0 on success, -ENOMEM if an allocation fails.
+ */
+int pio_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_scontexts)
+{
+       int i, j;
+       int extra, sc_per_vl;
+       int scontext = 1;
+       int num_kernel_send_contexts = 0;
+       u8 lvl_scontexts[OPA_MAX_VLS];
+       struct pio_vl_map *oldmap, *newmap;
+
+       if (!vl_scontexts) {
+               for (i = 0; i < dd->num_send_contexts; i++)
+                       if (dd->send_contexts[i].type == SC_KERNEL)
+                               num_kernel_send_contexts++;
+               /* truncate divide */
+               sc_per_vl = num_kernel_send_contexts / num_vls;
+               /* extras */
+               extra = num_kernel_send_contexts % num_vls;
+               vl_scontexts = lvl_scontexts;
+               /* add extras from last vl down */
+               for (i = num_vls - 1; i >= 0; i--, extra--)
+                       vl_scontexts[i] = sc_per_vl + (extra > 0 ? 1 : 0);
+       }
+       /* build new map */
+       newmap = kzalloc(sizeof(*newmap) +
+                        roundup_pow_of_two(num_vls) *
+                        sizeof(struct pio_map_elem *),
+                        GFP_KERNEL);
+       if (!newmap)
+               goto bail;
+       newmap->actual_vls = num_vls;
+       newmap->vls = roundup_pow_of_two(num_vls);
+       newmap->mask = (1 << ilog2(newmap->vls)) - 1;
+       for (i = 0; i < newmap->vls; i++) {
+               if (i < newmap->actual_vls) {
+                       /* save for wrap around */
+                       int first_scontext = scontext;
+                       int sz = roundup_pow_of_two(vl_scontexts[i]);
+
+                       /* only allocate once */
+                       newmap->map[i] = kzalloc(sizeof(*newmap->map[i]) +
+                                                sz * sizeof(struct
+                                                            send_context *),
+                                                GFP_KERNEL);
+                       if (!newmap->map[i])
+                               goto bail;
+                       newmap->map[i]->mask = (1 << ilog2(sz)) - 1;
+                       /* assign send contexts */
+                       for (j = 0; j < sz; j++) {
+                               if (dd->kernel_send_context[scontext])
+                                       newmap->map[i]->ksc[j] =
+                                       dd->kernel_send_context[scontext];
+                               if (++scontext >= first_scontext +
+                                                 vl_scontexts[i])
+                                       /* wrap back to first send context */
+                                       scontext = first_scontext;
+                       }
+                       /*
+                        * Step to the next vl's first send context.  Only
+                        * done for real vls: vl_scontexts[] has no valid
+                        * entries at or beyond num_vls.
+                        */
+                       scontext = first_scontext + vl_scontexts[i];
+               } else {
+                       /* just re-use entry without allocating */
+                       newmap->map[i] = newmap->map[i % num_vls];
+               }
+       }
+       /* newmap in hand, save old map */
+       spin_lock_irq(&dd->pio_map_lock);
+       oldmap = rcu_dereference_protected(dd->pio_map,
+                                          lockdep_is_held(&dd->pio_map_lock));
+
+       /* publish newmap */
+       rcu_assign_pointer(dd->pio_map, newmap);
+
+       spin_unlock_irq(&dd->pio_map_lock);
+       /* success, free any old map after grace period */
+       if (oldmap)
+               call_rcu(&oldmap->list, pio_map_rcu_callback);
+       return 0;
+bail:
+       /* free any partial allocation */
+       pio_map_free(newmap);
+       return -ENOMEM;
+}
+
+/*
+ * Tear down the PIO map and the kernel send context array.
+ *
+ * The map pointer is cleared under pio_map_lock and the map itself is
+ * freed only after synchronize_rcu(), so a reader that fetched the pointer
+ * inside an RCU read-side section never touches freed memory.
+ */
+void free_pio_map(struct hfi1_devdata *dd)
+{
+       struct pio_vl_map *oldmap;
+
+       /* Free PIO map if allocated */
+       if (rcu_access_pointer(dd->pio_map)) {
+               spin_lock_irq(&dd->pio_map_lock);
+               oldmap = rcu_dereference_protected(dd->pio_map,
+                                       lockdep_is_held(&dd->pio_map_lock));
+               /* unpublish first so no new reader can find the map */
+               RCU_INIT_POINTER(dd->pio_map, NULL);
+               spin_unlock_irq(&dd->pio_map_lock);
+               /* wait for existing readers before freeing */
+               synchronize_rcu();
+               pio_map_free(oldmap);
+       }
+       kfree(dd->kernel_send_context);
+       dd->kernel_send_context = NULL;
+}
+
+/*
+ * Allocate, initialize and enable the kernel send contexts: one for VL15,
+ * one per data VL, and additional ones up to INIT_SC_PER_VL per VL.  Then
+ * build the initial PIO map.
+ *
+ * Return: 0 on success, -ENOMEM on any failure.  On failure the send
+ * contexts allocated here are freed and dd->kernel_send_context is
+ * released and set to NULL.
+ */
+int init_pervl_scs(struct hfi1_devdata *dd)
+{
+       int i;
+       u64 mask, all_vl_mask = (u64)0x80ff; /* VLs 0-7, 15 */
+       u64 data_vls_mask = (u64)0x00ff; /* VLs 0-7 */
+       u32 ctxt;
+       struct hfi1_pportdata *ppd = dd->pport;
+
+       dd->vld[15].sc = sc_alloc(dd, SC_VL15,
+                                 dd->rcd[0]->rcvhdrqentsize, dd->node);
+       if (!dd->vld[15].sc)
+               return -ENOMEM;
+
+       hfi1_init_ctxt(dd->vld[15].sc);
+       dd->vld[15].mtu = enum_to_mtu(OPA_MTU_2048);
+
+       /* zeroed so the error path can sc_free() slots that were never set */
+       dd->kernel_send_context = kzalloc_node(dd->num_send_contexts *
+                                       sizeof(struct send_context *),
+                                       GFP_KERNEL, dd->node);
+       if (!dd->kernel_send_context)
+               goto freesc15;
+
+       dd->kernel_send_context[0] = dd->vld[15].sc;
+
+       for (i = 0; i < num_vls; i++) {
+               /*
+                * Since this function does not deal with a specific
+                * receive context but we need the RcvHdrQ entry size,
+                * use the size from rcd[0]. It is guaranteed to be
+                * valid at this point and will remain the same for all
+                * receive contexts.
+                */
+               dd->vld[i].sc = sc_alloc(dd, SC_KERNEL,
+                                        dd->rcd[0]->rcvhdrqentsize, dd->node);
+               if (!dd->vld[i].sc)
+                       goto nomem;
+               dd->kernel_send_context[i + 1] = dd->vld[i].sc;
+               hfi1_init_ctxt(dd->vld[i].sc);
+               /* non VL15 start with the max MTU */
+               dd->vld[i].mtu = hfi1_max_mtu;
+       }
+       for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++) {
+               dd->kernel_send_context[i + 1] =
+               sc_alloc(dd, SC_KERNEL, dd->rcd[0]->rcvhdrqentsize, dd->node);
+               if (!dd->kernel_send_context[i + 1])
+                       goto nomem;
+               hfi1_init_ctxt(dd->kernel_send_context[i + 1]);
+       }
+
+       sc_enable(dd->vld[15].sc);
+       ctxt = dd->vld[15].sc->hw_context;
+       mask = all_vl_mask & ~(1LL << 15);
+       write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
+       dd_dev_info(dd,
+                   "Using send context %u(%u) for VL15\n",
+                   dd->vld[15].sc->sw_index, ctxt);
+
+       for (i = 0; i < num_vls; i++) {
+               sc_enable(dd->vld[i].sc);
+               ctxt = dd->vld[i].sc->hw_context;
+               mask = all_vl_mask & ~(data_vls_mask);
+               write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
+       }
+       for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++) {
+               sc_enable(dd->kernel_send_context[i + 1]);
+               ctxt = dd->kernel_send_context[i + 1]->hw_context;
+               mask = all_vl_mask & ~(data_vls_mask);
+               write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
+       }
+
+       if (pio_map_init(dd, ppd->port - 1, num_vls, NULL))
+               goto nomem;
+       return 0;
+
+nomem:
+       for (i = 0; i < num_vls; i++) {
+               sc_free(dd->vld[i].sc);
+               dd->vld[i].sc = NULL;
+       }
+
+       for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++)
+               sc_free(dd->kernel_send_context[i + 1]);
+
+       kfree(dd->kernel_send_context);
+       dd->kernel_send_context = NULL;
+
+freesc15:
+       sc_free(dd->vld[15].sc);
+       return -ENOMEM;
+}
+
+/*
+ * Allocate the credit return areas: one array entry per online NUMA node,
+ * each with a zeroed coherent DMA range of TXE_NUM_CONTEXTS credit_return
+ * structures.  The device node is pointed at each NUMA node in turn for
+ * its allocation and restored to dd->node afterwards.
+ *
+ * Return: 0 on success, -EINVAL if the online nodes are not numbered
+ * compactly, -ENOMEM on allocation failure.  Ranges allocated before a
+ * failure are left in dd->cr_base.
+ */
+int init_credit_return(struct hfi1_devdata *dd)
+{
+       int nodes = num_online_nodes();
+       int node;
+
+       /* enforce the expectation that the numas are compact */
+       for (node = 0; node < nodes; node++) {
+               if (node_online(node))
+                       continue;
+               dd_dev_err(dd, "NUMA nodes are not compact\n");
+               return -EINVAL;
+       }
+
+       dd->cr_base = kcalloc(nodes, sizeof(struct credit_return_base),
+                             GFP_KERNEL);
+       if (!dd->cr_base) {
+               dd_dev_err(dd, "Unable to allocate credit return base\n");
+               return -ENOMEM;
+       }
+
+       for (node = 0; node < nodes; node++) {
+               int bytes = TXE_NUM_CONTEXTS * sizeof(struct credit_return);
+
+               set_dev_node(&dd->pcidev->dev, node);
+               dd->cr_base[node].va = dma_zalloc_coherent(&dd->pcidev->dev,
+                                                          bytes,
+                                                          &dd->cr_base[node].pa,
+                                                          GFP_KERNEL);
+               if (dd->cr_base[node].va)
+                       continue;
+
+               /* put the device back on its home node before bailing */
+               set_dev_node(&dd->pcidev->dev, dd->node);
+               dd_dev_err(dd,
+                          "Unable to allocate credit return DMA range for NUMA %d\n",
+                          node);
+               return -ENOMEM;
+       }
+       set_dev_node(&dd->pcidev->dev, dd->node);
+
+       return 0;
+}
+
+/*
+ * Release every credit return DMA range that was allocated, then the
+ * dd->cr_base array itself.  Does nothing if dd->cr_base is NULL.
+ *
+ * NOTE(review): the loop bound is num_online_nodes() at free time -
+ * presumably the same value seen at allocation time; confirm.
+ */
+void free_credit_return(struct hfi1_devdata *dd)
+{
+       int node, nodes;
+
+       if (!dd->cr_base)
+               return;
+
+       nodes = num_online_nodes();
+       for (node = 0; node < nodes; node++) {
+               struct credit_return_base *crb = &dd->cr_base[node];
+
+               if (!crb->va)
+                       continue;
+               dma_free_coherent(&dd->pcidev->dev,
+                                 TXE_NUM_CONTEXTS *
+                                 sizeof(struct credit_return),
+                                 crb->va, crb->pa);
+       }
+       kfree(dd->cr_base);
+       dd->cr_base = NULL;
+}
diff --git a/drivers/infiniband/hw/hfi1/pio.h b/drivers/infiniband/hw/hfi1/pio.h
new file mode 100644 (file)
index 0000000..464cbd2
--- /dev/null
@@ -0,0 +1,328 @@
+#ifndef _PIO_H
+#define _PIO_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/* send context types */
+#define SC_KERNEL 0
+#define SC_VL15   1
+#define SC_ACK    2
+#define SC_USER   3    /* must be the last one: it may take all left */
+#define SC_MAX    4    /* count of send context types */
+
+/* invalid send context index */
+#define INVALID_SCI 0xff
+
+/*
+ * PIO buffer release callback function
+ * @arg: the argument stored with the buffer (struct pio_buf.arg)
+ * @code: bit mask of PRC_* release codes
+ */
+typedef void (*pio_release_cb)(void *arg, int code);
+
+/* PIO release codes - in bits, as more than one could apply */
+#define PRC_OK         0       /* no known error */
+#define PRC_STATUS_ERR 0x01    /* credit return due to status error */
+#define PRC_PBC                0x02    /* credit return due to PBC */
+#define PRC_THRESHOLD  0x04    /* credit return due to threshold */
+#define PRC_FILL_ERR   0x08    /* credit return due to fill error */
+#define PRC_FORCE      0x10    /* credit return due to credit force */
+#define PRC_SC_DISABLE 0x20    /* clean-up after a context disable */
+
+/* byte helper: one 64-bit value viewed as two 32-bit words or 8 bytes */
+union mix {
+       u64 val64;
+       u32 val32[2];
+       u8  val8[8];
+};
+
+/*
+ * an allocated PIO buffer
+ *
+ * Note that @end and @size describe the whole owning send context, not
+ * just this buffer; @block_count is the size of the buffer itself.
+ */
+struct pio_buf {
+       struct send_context *sc;/* back pointer to owning send context */
+       pio_release_cb cb;      /* called when the buffer is released */
+       void *arg;              /* argument for cb */
+       void __iomem *start;    /* buffer start address */
+       void __iomem *end;      /* context end address */
+       unsigned long size;     /* context size, in bytes */
+       unsigned long sent_at;  /* buffer is sent when <= free */
+       u32 block_count;        /* size of buffer, in blocks */
+       u32 qw_written;         /* QW written so far */
+       u32 carry_bytes;        /* number of valid bytes in carry */
+       union mix carry;        /* pending unwritten bytes */
+};
+
+/*
+ * cache line aligned pio buffer array
+ *
+ * Each shadow ring entry holds one pio_buf; the unused[] member pads the
+ * union to at least 16 u64s (128 bytes).
+ */
+union pio_shadow_ring {
+       struct pio_buf pbuf;
+       u64 unused[16];         /* cache line spacer */
+} ____cacheline_aligned;
+
+/*
+ * per-NUMA send context
+ *
+ * Fields are grouped by user, and each group starts on its own cache line
+ * (____cacheline_aligned_in_smp): the allocator fields, the releaser
+ * fields, the PIO wait list, and the credit control state.
+ */
+struct send_context {
+       /* read-only after init */
+       struct hfi1_devdata *dd;                /* device */
+       void __iomem *base_addr;        /* start of PIO memory */
+       union pio_shadow_ring *sr;      /* shadow ring */
+
+       volatile __le64 *hw_free;       /* HW free counter */
+       struct work_struct halt_work;   /* halted context work queue entry */
+       unsigned long flags;            /* flags */
+       int node;                       /* context home node */
+       int type;                       /* context type */
+       u32 sw_index;                   /* software index number */
+       u32 hw_context;                 /* hardware context number */
+       u32 credits;                    /* number of blocks in context */
+       u32 sr_size;                    /* size of the shadow ring */
+       u32 group;                      /* credit return group */
+       /* allocator fields */
+       spinlock_t alloc_lock ____cacheline_aligned_in_smp;
+       unsigned long fill;             /* official alloc count */
+       unsigned long alloc_free;       /* copy of free (less cache thrash) */
+       u32 sr_head;                    /* shadow ring head */
+       /* releaser fields */
+       spinlock_t release_lock ____cacheline_aligned_in_smp;
+       unsigned long free;             /* official free count */
+       u32 sr_tail;                    /* shadow ring tail */
+       /* list for PIO waiters */
+       struct list_head piowait  ____cacheline_aligned_in_smp;
+       /* credit_ctrl_lock pairs credit_intr_count with the CSR write */
+       spinlock_t credit_ctrl_lock ____cacheline_aligned_in_smp;
+       u64 credit_ctrl;                /* cache for credit control */
+       u32 credit_intr_count;          /* count of credit intr users */
+       u32 __percpu *buffers_allocated;/* count of buffers allocated */
+       wait_queue_head_t halt_wait;    /* wait until kernel sees interrupt */
+};
+
+/* send context flags */
+#define SCF_ENABLED 0x01
+#define SCF_IN_FREE 0x02
+#define SCF_HALTED  0x04
+#define SCF_FROZEN  0x08
+
+/* per software index bookkeeping for a send context slot */
+struct send_context_info {
+       struct send_context *sc;        /* allocated working context */
+       u16 allocated;                  /* has this been allocated? */
+       u16 type;                       /* context type */
+       u16 base;                       /* base in PIO array */
+       u16 credits;                    /* size in PIO array */
+};
+
+/*
+ * DMA credit return, index is always (context & 0x7)
+ *
+ * Eight little-endian 64-bit credit return slots.
+ */
+struct credit_return {
+       volatile __le64 cr[8];
+};
+
+/* NUMA indexed credit return array: one DMA range per NUMA node */
+struct credit_return_base {
+       struct credit_return *va;       /* kernel virtual address of range */
+       dma_addr_t pa;                  /* DMA address of the same range */
+};
+
+/*
+ * send context configuration sizes (one per type)
+ * NOTE(review): presumably indexed by the SC_* context type - confirm.
+ */
+struct sc_config_sizes {
+       short int size;
+       short int count;
+};
+
+/*
+ * The diagram below details the relationship of the mapping structures
+ *
+ * Since the mapping now allows for non-uniform send contexts per vl, the
+ * number of send contexts for a vl is either the vl_scontexts[vl] or
+ * a computation based on num_kernel_send_contexts/num_vls:
+ *
+ * For example:
+ * nactual = vl_scontexts ? vl_scontexts[vl] : num_kernel_send_contexts/num_vls
+ *
+ * n = roundup to next highest power of 2 using nactual
+ *
+ * In the case where num_kernel_send_contexts/num_vls doesn't divide
+ * evenly, the extras are added from the last vl downward.
+ *
+ * For the case where n > nactual, the send contexts are assigned
+ * in a round robin fashion wrapping back to the first send context
+ * for a particular vl.
+ *
+ *               dd->pio_map
+ *                    |                                   pio_map_elem[0]
+ *                    |                                +--------------------+
+ *                    v                                |       mask         |
+ *               pio_vl_map                            |--------------------|
+ *      +--------------------------+                   | ksc[0] -> sc 1     |
+ *      |    list (RCU)            |                   |--------------------|
+ *      |--------------------------|                 ->| ksc[1] -> sc 2     |
+ *      |    mask                  |              --/  |--------------------|
+ *      |--------------------------|            -/     |        *           |
+ *      |    actual_vls (max 8)    |          -/       |--------------------|
+ *      |--------------------------|       --/         | ksc[n] -> sc n     |
+ *      |    vls (max 8)           |     -/            +--------------------+
+ *      |--------------------------|  --/
+ *      |    map[0]                |-/
+ *      |--------------------------|                   +--------------------+
+ *      |    map[1]                |---                |       mask         |
+ *      |--------------------------|   \----           |--------------------|
+ *      |           *              |        \--        | ksc[0] -> sc 1+n   |
+ *      |           *              |           \----   |--------------------|
+ *      |           *              |                \->| ksc[1] -> sc 2+n   |
+ *      |--------------------------|                   |--------------------|
+ *      |   map[vls - 1]           |-                  |         *          |
+ *      +--------------------------+ \-                |--------------------|
+ *                                     \-              | ksc[m] -> sc m+n   |
+ *                                       \             +--------------------+
+ *                                        \-
+ *                                          \
+ *                                           \-        +--------------------+
+ *                                             \-      |       mask         |
+ *                                               \     |--------------------|
+ *                                                \-   | ksc[0] -> sc 1+m+n |
+ *                                                  \- |--------------------|
+ *                                                    >| ksc[1] -> sc 2+m+n |
+ *                                                     |--------------------|
+ *                                                     |         *          |
+ *                                                     |--------------------|
+ *                                                     | ksc[o] -> sc o+m+n |
+ *                                                     +--------------------+
+ *
+ */
+
+/* Initial number of send contexts per VL */
+#define INIT_SC_PER_VL 2
+
+/*
+ * struct pio_map_elem - mapping for a vl
+ * @mask - selector mask
+ * @ksc - array of kernel send contexts for this vl
+ *
+ * The mask is used to "mod" the selector to
+ * produce index into the trailing array of
+ * kscs; @ksc is a flexible array sized at allocation.
+ */
+struct pio_map_elem {
+       u32 mask;
+       struct send_context *ksc[];
+};
+
+/*
+ * struct pio_vl_map - mapping for all vls
+ * @list - rcu head for free callback
+ * @mask - vl mask to "mod" the vl to produce an index to map array
+ * @actual_vls - number of vls
+ * @vls - numbers of vls rounded to next power of 2
+ * @map - array of pio_map_elem entries
+ *
+ * This is the parent mapping structure. The trailing members of the
+ * struct point to pio_map_elem entries, which in turn point to an
+ * array of kscs for that vl.  @map is a flexible array with @vls entries.
+ */
+struct pio_vl_map {
+       struct rcu_head list;
+       u32 mask;
+       u8 actual_vls;
+       u8 vls;
+       struct pio_map_elem *map[];
+};
+
+int pio_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls,
+                u8 *vl_scontexts);
+void free_pio_map(struct hfi1_devdata *dd);
+struct send_context *pio_select_send_context_vl(struct hfi1_devdata *dd,
+                                               u32 selector, u8 vl);
+struct send_context *pio_select_send_context_sc(struct hfi1_devdata *dd,
+                                               u32 selector, u8 sc5);
+
+/* send context functions */
+int init_credit_return(struct hfi1_devdata *dd);
+void free_credit_return(struct hfi1_devdata *dd);
+int init_sc_pools_and_sizes(struct hfi1_devdata *dd);
+int init_send_contexts(struct hfi1_devdata *dd);
+int init_pervl_scs(struct hfi1_devdata *dd);
+struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
+                             uint hdrqentsize, int numa);
+void sc_free(struct send_context *sc);
+int sc_enable(struct send_context *sc);
+void sc_disable(struct send_context *sc);
+int sc_restart(struct send_context *sc);
+void sc_return_credits(struct send_context *sc);
+void sc_flush(struct send_context *sc);
+void sc_drop(struct send_context *sc);
+void sc_stop(struct send_context *sc, int bit);
+struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,
+                               pio_release_cb cb, void *arg);
+void sc_release_update(struct send_context *sc);
+void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context);
+void sc_add_credit_return_intr(struct send_context *sc);
+void sc_del_credit_return_intr(struct send_context *sc);
+void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold);
+u32 sc_percent_to_threshold(struct send_context *sc, u32 percent);
+u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize);
+void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint);
+void sc_wait(struct hfi1_devdata *dd);
+void set_pio_integrity(struct send_context *sc);
+
+/* support functions */
+void pio_reset_all(struct hfi1_devdata *dd);
+void pio_freeze(struct hfi1_devdata *dd);
+void pio_kernel_unfreeze(struct hfi1_devdata *dd);
+
+/* global PIO send control operations */
+#define PSC_GLOBAL_ENABLE 0
+#define PSC_GLOBAL_DISABLE 1
+#define PSC_GLOBAL_VLARB_ENABLE 2
+#define PSC_GLOBAL_VLARB_DISABLE 3
+#define PSC_CM_RESET 4
+#define PSC_DATA_VL_ENABLE 5
+#define PSC_DATA_VL_DISABLE 6
+
+void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl);
+void pio_send_control(struct hfi1_devdata *dd, int op);
+
+/* PIO copy routines */
+void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc,
+             const void *from, size_t count);
+void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc,
+                       const void *from, size_t nbytes);
+void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes);
+void seg_pio_copy_end(struct pio_buf *pbuf);
+
+#endif /* _PIO_H */
diff --git a/drivers/infiniband/hw/hfi1/pio_copy.c b/drivers/infiniband/hw/hfi1/pio_copy.c
new file mode 100644 (file)
index 0000000..8c25e1b
--- /dev/null
@@ -0,0 +1,867 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "hfi.h"
+
+/* additive distance between non-SOP and SOP space */
+#define SOP_DISTANCE (TXE_PIO_SIZE / 2)
+#define PIO_BLOCK_MASK (PIO_BLOCK_SIZE - 1)
+/* number of QUADWORDs in a block */
+#define PIO_BLOCK_QWS (PIO_BLOCK_SIZE / sizeof(u64))
+
+/**
+ * pio_copy - copy data block to MMIO space
+ * @dd: hfi1 device data (not referenced by this routine)
+ * @pbuf: a number of blocks allocated within a PIO send context
+ * @pbc: PBC to send
+ * @from: source, must be 8 byte aligned
+ * @count: number of DWORD (32-bit) quantities to copy from source
+ *
+ * Copy data from source to PIO Send Buffer memory, 8 bytes at a time.
+ * Must always write full BLOCK_SIZE bytes blocks.  The first block must
+ * be written to the corresponding SOP=1 address.
+ *
+ * The PBC is written as the first QWORD, followed by the source data.  An
+ * odd trailing DWORD is zero-extended to a QWORD and the last block is
+ * padded out with zero QWORDs.
+ *
+ * On exit the send context's per-CPU buffers_allocated count is
+ * decremented and preempt_enable() is called.
+ *
+ * Known:
+ * o pbuf->start always starts on a block boundary
+ * o pbuf can wrap only at a block boundary
+ */
+void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc,
+             const void *from, size_t count)
+{
+       void __iomem *dest = pbuf->start + SOP_DISTANCE;
+       void __iomem *send = dest + PIO_BLOCK_SIZE;
+       void __iomem *dend;                     /* 8-byte data end */
+
+       /* write the PBC */
+       writeq(pbc, dest);
+       dest += sizeof(u64);
+
+       /* calculate where the QWORD data ends - in SOP=1 space */
+       dend = dest + ((count >> 1) * sizeof(u64));
+
+       if (dend < send) {
+               /*
+                * all QWORD data is within the SOP block, does *not*
+                * reach the end of the SOP block
+                */
+
+               while (dest < dend) {
+                       writeq(*(u64 *)from, dest);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+               /*
+                * No boundary checks are needed here:
+                * 0. We're not on the SOP block boundary
+                * 1. The possible DWORD dangle will still be within
+                *    the SOP block
+                * 2. We cannot wrap except on a block boundary.
+                */
+       } else {
+               /* QWORD data extends _to_ or beyond the SOP block */
+
+               /* write 8-byte SOP chunk data */
+               while (dest < send) {
+                       writeq(*(u64 *)from, dest);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+               /* drop out of the SOP range */
+               dest -= SOP_DISTANCE;
+               dend -= SOP_DISTANCE;
+
+               /*
+                * If the wrap comes before or matches the data end,
+                * copy until the wrap, then wrap.
+                *
+                * If the data ends at the end of the SOP above and
+                * the buffer wraps, then pbuf->end == dend == dest
+                * and nothing will get written, but we will wrap in
+                * case there is a dangling DWORD.
+                */
+               if (pbuf->end <= dend) {
+                       while (dest < pbuf->end) {
+                               writeq(*(u64 *)from, dest);
+                               from += sizeof(u64);
+                               dest += sizeof(u64);
+                       }
+
+                       dest -= pbuf->size;
+                       dend -= pbuf->size;
+               }
+
+               /* write 8-byte non-SOP, non-wrap chunk data */
+               while (dest < dend) {
+                       writeq(*(u64 *)from, dest);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+       }
+       /* at this point we have wrapped if we are going to wrap */
+
+       /* write dangling u32, if any, zero-extended to a full QWORD */
+       if (count & 1) {
+               union mix val;
+
+               val.val64 = 0;
+               val.val32[0] = *(u32 *)from;
+               writeq(val.val64, dest);
+               dest += sizeof(u64);
+       }
+       /*
+        * fill in rest of block, no need to check pbuf->end
+        * as we only wrap on a block boundary
+        */
+       while (((unsigned long)dest & PIO_BLOCK_MASK) != 0) {
+               writeq(0, dest);
+               dest += sizeof(u64);
+       }
+
+       /*
+        * finished with this buffer: drop the per-CPU allocated-buffer
+        * count of the owning send context and re-enable preemption.
+        * NOTE(review): the matching preempt_disable() is not in this
+        * file - presumably done when the buffer is allocated; confirm.
+        */
+       this_cpu_dec(*pbuf->sc->buffers_allocated);
+       preempt_enable();
+}
+
+/* USE_SHIFTS is faster in user-space tests on a Xeon X5570 @ 2.93GHz */
+#define USE_SHIFTS 1
+#ifdef USE_SHIFTS
+/*
+ * Handle carry bytes using shifts and masks.
+ *
+ * NOTE: the value of the unused portion of carry is expected to always be zero.
+ */
+
+/*
+ * "zero" shift - bit shift used to zero out upper bytes.  Input is
+ * the count of LSB bytes to preserve.
+ */
+#define zshift(x) (8 * (8 - (x)))
+
+/*
+ * "merge" shift - bit shift used to merge with carry bytes.  Input is
+ * the LSB byte count to move beyond.
+ */
+#define mshift(x) (8 * (x))
+
+/*
+ * Load the nbytes bytes starting at "from" into the least significant
+ * bytes of pbuf->carry, zero every other byte, and set pbuf->carry_bytes
+ * to nbytes.  Whatever pbuf->carry held before is discarded.
+ *
+ * NOTES:
+ * o nothing is read from "from" when nbytes is zero
+ * o "from" need not be u64 aligned - the aligned u64 containing it is read
+ * o the nbytes bytes must not span a QW boundary
+ */
+static inline void read_low_bytes(struct pio_buf *pbuf, const void *from,
+                                 unsigned int nbytes)
+{
+       u64 low = 0;
+
+       if (nbytes != 0) {
+               /* split the address into its QW base and byte offset */
+               unsigned long addr = (unsigned long)from;
+               unsigned long off = addr & 0x7;
+
+               from = (void *)(addr & ~0x7l);
+
+               /* shift up to zero the upper bytes... */
+               low = *(u64 *)from << zshift(nbytes + off);
+               /* ...then back down to place the bytes at the bottom */
+               low >>= zshift(nbytes);
+       }
+
+       pbuf->carry.val64 = low;
+       pbuf->carry_bytes = nbytes;
+}
+
+/*
+ * Read nbytes bytes from "from" and put them at the next significant bytes
+ * of pbuf->carry.  Unused bytes are zeroed.  It is expected that the extra
+ * read does not overfill carry.
+ *
+ * The new bytes are OR-ed in above the pbuf->carry_bytes bytes already
+ * held, and pbuf->carry_bytes is advanced by nbytes.
+ *
+ * NOTES:
+ * o from may _not_ be u64 aligned
+ * o nbytes may span a QW boundary
+ * o every read is of a whole aligned u64, so bytes of that QW outside the
+ *   requested range are read and then shifted away
+ */
+static inline void read_extra_bytes(struct pio_buf *pbuf,
+                                   const void *from, unsigned int nbytes)
+{
+       unsigned long off = (unsigned long)from & 0x7;
+       unsigned int room, xbytes;
+
+       /* align our pointer */
+       from = (void *)((unsigned long)from & ~0x7l);
+
+       /* check count first - don't read anything if count is zero */
+       while (nbytes) {
+               /* find the number of bytes in this u64 */
+               room = 8 - off; /* this u64 has room for this many bytes */
+               xbytes = min(room, nbytes);
+
+               /*
+                * shift down to zero lower bytes, shift up to zero upper
+                * bytes, shift back down to move into place
+                */
+               pbuf->carry.val64 |= (((*(u64 *)from)
+                                       >> mshift(off))
+                                       << zshift(xbytes))
+                                       >> zshift(xbytes + pbuf->carry_bytes);
+               off = 0;        /* any following u64 is used from byte 0 */
+               pbuf->carry_bytes += xbytes;
+               nbytes -= xbytes;
+               from += sizeof(u64);
+       }
+}
+
+/*
+ * Drop the zbytes most significant valid bytes of pbuf->carry, zeroing
+ * them, and reduce pbuf->carry_bytes accordingly.
+ *
+ * NOTES:
+ * o zbytes <= old_bytes
+ */
+static inline void zero_extra_bytes(struct pio_buf *pbuf, unsigned int zbytes)
+{
+       unsigned int keep;
+
+       /* nothing to drop, leave carry untouched */
+       if (!zbytes)
+               return;
+
+       keep = pbuf->carry_bytes - zbytes;      /* bytes that stay valid */
+
+       if (keep == 0) {
+               /* zshift() is only usable for a non-zero byte count */
+               pbuf->carry.val64 = 0;
+       } else {
+               /* shift the dropped bytes off the top, then shift back */
+               pbuf->carry.val64 = (pbuf->carry.val64 << zshift(keep))
+                                       >> zshift(keep);
+       }
+
+       pbuf->carry_bytes = keep;
+}
+
+/*
+ * Write a quad word using parts of pbuf->carry and the next 8 bytes of src.
+ * Put the unused part of the next 8 bytes of src into the LSB bytes of
+ * pbuf->carry with the upper bytes zeroed.
+ *
+ * pbuf->carry_bytes is not changed: the same number of bytes is carried
+ * out as was carried in.
+ *
+ * NOTES:
+ * o result must keep unused bytes zeroed
+ * o src must be u64 aligned
+ * o pbuf->carry_bytes must be non-zero: zshift(0) is a shift by 64 bits,
+ *   which is not a valid shift count for a u64
+ */
+static inline void merge_write8(
+       struct pio_buf *pbuf,
+       void __iomem *dest,
+       const void *src)
+{
+       u64 new, temp;
+
+       new = *(u64 *)src;
+       /* carry supplies the low bytes, the new data the bytes above it */
+       temp = pbuf->carry.val64 | (new << mshift(pbuf->carry_bytes));
+       writeq(temp, dest);
+       /* the top carry_bytes bytes of new did not fit - carry them over */
+       pbuf->carry.val64 = new >> zshift(pbuf->carry_bytes);
+}
+
+/*
+ * Write a quad word using all bytes of carry.
+ *
+ * carry is passed by value and written unconditionally; no valid byte
+ * count is consulted.
+ */
+static inline void carry8_write8(union mix carry, void __iomem *dest)
+{
+       writeq(carry.val64, dest);
+}
+
+/*
+ * Flush the valid bytes of carry to dest as a single quad word.  When
+ * carry holds no valid bytes, dest is left untouched.
+ * Returns 0 on nothing written, non-zero on quad word written.
+ */
+static inline int carry_write8(struct pio_buf *pbuf, void __iomem *dest)
+{
+       /* no valid carry bytes - nothing to write */
+       if (!pbuf->carry_bytes)
+               return 0;
+
+       /* bytes beyond carry_bytes are kept zeroed, so write carry as-is */
+       writeq(pbuf->carry.val64, dest);
+       return 1;
+}
+
+#else /* USE_SHIFTS */
+/*
+ * Handle carry bytes using byte copies.
+ *
+ * NOTE: the value of the unused portion of carry is left uninitialized.
+ */
+
+/*
+ * Jump copy - no-loop copy for < 8 bytes.
+ *
+ * The switch jumps to the case matching n and deliberately falls through
+ * every following case, so exactly n bytes are copied for n in 1..7.
+ * There is no default case: n == 0 or n > 7 copies nothing.
+ */
+static inline void jcopy(u8 *dest, const u8 *src, u32 n)
+{
+       switch (n) {
+       case 7:
+               *dest++ = *src++;       /* fall through */
+       case 6:
+               *dest++ = *src++;       /* fall through */
+       case 5:
+               *dest++ = *src++;       /* fall through */
+       case 4:
+               *dest++ = *src++;       /* fall through */
+       case 3:
+               *dest++ = *src++;       /* fall through */
+       case 2:
+               *dest++ = *src++;       /* fall through */
+       case 1:
+               *dest++ = *src++;
+       }
+}
+
+/*
+ * Read nbytes from "from" and place them in the low bytes
+ * of pbuf->carry.  Other bytes are left as-is.  Any previous
+ * value in pbuf->carry is lost.
+ *
+ * pbuf->carry_bytes is set to nbytes.
+ *
+ * NOTES:
+ * o do not read from from if nbytes is zero
+ * o from may _not_ be u64 aligned.
+ * o jcopy() only copies counts of 1..7, so nbytes must be < 8
+ */
+static inline void read_low_bytes(struct pio_buf *pbuf, const void *from,
+                                 unsigned int nbytes)
+{
+       jcopy(&pbuf->carry.val8[0], from, nbytes);
+       pbuf->carry_bytes = nbytes;
+}
+
+/*
+ * Read nbytes bytes from "from" and put them at the end of pbuf->carry.
+ * It is expected that the extra read does not overfill carry.
+ *
+ * The bytes are stored starting at index pbuf->carry_bytes, which is then
+ * advanced by nbytes.
+ *
+ * NOTES:
+ * o from may _not_ be u64 aligned
+ * o nbytes may span a QW boundary
+ * o jcopy() only copies counts of 1..7, so nbytes must be < 8
+ */
+static inline void read_extra_bytes(struct pio_buf *pbuf,
+                                   const void *from, unsigned int nbytes)
+{
+       jcopy(&pbuf->carry.val8[pbuf->carry_bytes], from, nbytes);
+       pbuf->carry_bytes += nbytes;
+}
+
+/*
+ * Zero extra bytes from the end of pbuf->carry.
+ *
+ * We do not care about the value of unused bytes in carry, so just
+ * reduce the byte count.
+ *
+ * NOTES:
+ * o zbytes <= old_bytes (presumably old_bytes is pbuf->carry_bytes on
+ *   entry; nothing here checks it)
+ */
+static inline void zero_extra_bytes(struct pio_buf *pbuf, unsigned int zbytes)
+{
+       pbuf->carry_bytes -= zbytes;
+}
+
+/*
+ * Write a quad word using parts of pbuf->carry and the next 8 bytes of src.
+ * Put the unused part of the next 8 bytes of src into the low bytes of
+ * pbuf->carry.
+ *
+ * pbuf->carry_bytes is not changed: the same number of bytes is carried
+ * out as was carried in.
+ *
+ * NOTES:
+ * o dest is chip MMIO space, hence the __iomem annotation, matching the
+ *   USE_SHIFTS variant and the pointers the callers pass in
+ * o pbuf->carry_bytes must be non-zero: with zero carry bytes the first
+ *   jcopy() would be asked for 8 bytes, which it does not copy
+ */
+static inline void merge_write8(
+       struct pio_buf *pbuf,
+       void __iomem *dest,
+       const void *src)
+{
+       u32 remainder = 8 - pbuf->carry_bytes;
+
+       /* top up carry to a full quad word from the start of src */
+       jcopy(&pbuf->carry.val8[pbuf->carry_bytes], src, remainder);
+       writeq(pbuf->carry.val64, dest);
+       /* the rest of this src quad word becomes the new carry */
+       jcopy(&pbuf->carry.val8[0], src + remainder, pbuf->carry_bytes);
+}
+
+/*
+ * Write a quad word using all bytes of carry.
+ *
+ * carry is passed by value and written unconditionally.  dest is chip
+ * MMIO space, hence the __iomem annotation, matching the USE_SHIFTS
+ * variant and the pointers the callers pass in.
+ */
+static inline void carry8_write8(union mix carry, void __iomem *dest)
+{
+       writeq(carry.val64, dest);
+}
+
+/*
+ * Write a quad word using all the valid bytes of carry.  If carry
+ * has zero valid bytes, nothing is written.
+ * Returns 0 on nothing written, non-zero on quad word written.
+ *
+ * The unused bytes of carry are not kept zeroed in this variant, so they
+ * are cleared here before the write.  dest is chip MMIO space, hence the
+ * __iomem annotation, matching the USE_SHIFTS variant and the pointers
+ * the callers pass in.
+ */
+static inline int carry_write8(struct pio_buf *pbuf, void __iomem *dest)
+{
+       if (pbuf->carry_bytes) {
+               u64 zero = 0;
+
+               /* zero-fill the bytes above the valid carry bytes */
+               jcopy(&pbuf->carry.val8[pbuf->carry_bytes], (u8 *)&zero,
+                     8 - pbuf->carry_bytes);
+               writeq(pbuf->carry.val64, dest);
+               return 1;
+       }
+
+       return 0;
+}
+#endif /* USE_SHIFTS */
+
+/*
+ * Segmented PIO Copy - start
+ *
+ * Start a PIO copy.
+ *
+ * Writes the PBC followed by every whole QWORD of the source.  Trailing
+ * bytes that do not fill a QWORD are not written to the chip; they are
+ * saved in pbuf->carry.  pbuf->qw_written is set to the number of QWORDs
+ * written, including the PBC.
+ *
+ * @pbuf: destination buffer
+ * @pbc: the PBC for the PIO buffer
+ * @from: data source, QWORD aligned
+ * @nbytes: bytes to copy
+ */
+void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc,
+                       const void *from, size_t nbytes)
+{
+       void __iomem *dest = pbuf->start + SOP_DISTANCE;
+       void __iomem *send = dest + PIO_BLOCK_SIZE;
+       void __iomem *dend;                     /* 8-byte data end */
+
+       writeq(pbc, dest);
+       dest += sizeof(u64);
+
+       /* calculate where the QWORD data ends - in SOP=1 space */
+       dend = dest + ((nbytes >> 3) * sizeof(u64));
+
+       if (dend < send) {
+               /*
+                * all QWORD data is within the SOP block, does *not*
+                * reach the end of the SOP block
+                */
+
+               while (dest < dend) {
+                       writeq(*(u64 *)from, dest);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+               /*
+                * No boundary checks are needed here:
+                * 0. We're not on the SOP block boundary
+                * 1. Any dangling bytes are not written here - they are
+                *    saved in pbuf->carry below
+                * 2. We cannot wrap except on a block boundary.
+                */
+       } else {
+               /* QWORD data extends _to_ or beyond the SOP block */
+
+               /* write 8-byte SOP chunk data */
+               while (dest < send) {
+                       writeq(*(u64 *)from, dest);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+               /* drop out of the SOP range */
+               dest -= SOP_DISTANCE;
+               dend -= SOP_DISTANCE;
+
+               /*
+                * If the wrap comes before or matches the data end,
+                * copy until the wrap, then wrap.
+                *
+                * If the data ends at the end of the SOP above and
+                * the buffer wraps, then pbuf->end == dend == dest
+                * and nothing will get written here.
+                */
+               if (pbuf->end <= dend) {
+                       while (dest < pbuf->end) {
+                               writeq(*(u64 *)from, dest);
+                               from += sizeof(u64);
+                               dest += sizeof(u64);
+                       }
+
+                       dest -= pbuf->size;
+                       dend -= pbuf->size;
+               }
+
+               /* write 8-byte non-SOP, non-wrap chunk data */
+               while (dest < dend) {
+                       writeq(*(u64 *)from, dest);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+       }
+       /* at this point we have wrapped if we are going to wrap */
+
+       /* ...but it doesn't matter as we're done writing */
+
+       /* save dangling bytes, if any */
+       read_low_bytes(pbuf, from, nbytes & 0x7);
+
+       /* later segments locate their destination from this count */
+       pbuf->qw_written = 1 /*PBC*/ + (nbytes >> 3);
+}
+
+/*
+ * Mid copy helper, "mixed case" - source is 64-bit aligned but carry
+ * bytes are non-zero.
+ *
+ * Whole u64s must be written to the chip, so bytes must be manually merged.
+ * Each QWORD written is built from the current carry plus the next source
+ * QWORD; what does not fit becomes the new carry.  On return
+ * pbuf->carry_bytes is (old carry_bytes + nbytes) % 8 and pbuf->qw_written
+ * has been advanced by the number of QWORDs written.
+ *
+ * @pbuf: destination buffer
+ * @from: data source, is QWORD aligned.
+ * @nbytes: bytes to copy
+ *
+ * Must handle nbytes < 8.
+ */
+static void mid_copy_mix(struct pio_buf *pbuf, const void *from, size_t nbytes)
+{
+       void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
+       void __iomem *dend;                     /* 8-byte data end */
+       unsigned long qw_to_write = (pbuf->carry_bytes + nbytes) >> 3;
+       unsigned long bytes_left = (pbuf->carry_bytes + nbytes) & 0x7;
+
+       /* calculate 8-byte data end */
+       dend = dest + (qw_to_write * sizeof(u64));
+
+       if (pbuf->qw_written < PIO_BLOCK_QWS) {
+               /*
+                * Still within SOP block.  We don't need to check for
+                * wrap because we are still in the first block and
+                * can only wrap on block boundaries.
+                */
+               void __iomem *send;             /* SOP end */
+               void __iomem *xend;
+
+               /*
+                * calculate the end of data or end of block, whichever
+                * comes first
+                */
+               send = pbuf->start + PIO_BLOCK_SIZE;
+               xend = min(send, dend);
+
+               /* shift up to SOP=1 space */
+               dest += SOP_DISTANCE;
+               xend += SOP_DISTANCE;
+
+               /* write 8-byte chunk data */
+               while (dest < xend) {
+                       merge_write8(pbuf, dest, from);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+
+               /* shift down to SOP=0 space */
+               dest -= SOP_DISTANCE;
+       }
+       /*
+        * At this point dest could be (either, both, or neither):
+        * - at dend
+        * - at the wrap
+        */
+
+       /*
+        * If the wrap comes before or matches the data end,
+        * copy until the wrap, then wrap.
+        *
+        * If dest is at the wrap, we will fall into the if,
+        * not do the loop, then wrap.  dest is computed linearly from
+        * qw_written, so it may also already be beyond pbuf->end; that
+        * case takes the same path.
+        *
+        * If the data ends at the end of the SOP above and
+        * the buffer wraps, then pbuf->end == dend == dest
+        * and nothing will get written.
+        */
+       if (pbuf->end <= dend) {
+               while (dest < pbuf->end) {
+                       merge_write8(pbuf, dest, from);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+
+               dest -= pbuf->size;
+               dend -= pbuf->size;
+       }
+
+       /* write 8-byte non-SOP, non-wrap chunk data */
+       while (dest < dend) {
+               merge_write8(pbuf, dest, from);
+               from += sizeof(u64);
+               dest += sizeof(u64);
+       }
+
+       /* adjust carry so that it holds exactly bytes_left valid bytes */
+       if (pbuf->carry_bytes < bytes_left) {
+               /* need to read more */
+               read_extra_bytes(pbuf, from, bytes_left - pbuf->carry_bytes);
+       } else {
+               /* remove invalid bytes */
+               zero_extra_bytes(pbuf, pbuf->carry_bytes - bytes_left);
+       }
+
+       pbuf->qw_written += qw_to_write;
+}
+
+/*
+ * Mid copy helper, "straight case" - source pointer is 64-bit aligned
+ * with no carry bytes.
+ *
+ * Every whole QWORD of the source is written directly.  Trailing bytes
+ * that do not fill a QWORD are saved in pbuf->carry, and pbuf->qw_written
+ * is advanced by the number of QWORDs written.
+ *
+ * @pbuf: destination buffer
+ * @from: data source, is QWORD aligned
+ * @nbytes: bytes to copy
+ *
+ * Must handle nbytes < 8.
+ */
+static void mid_copy_straight(struct pio_buf *pbuf,
+                             const void *from, size_t nbytes)
+{
+       void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
+       void __iomem *dend;                     /* 8-byte data end */
+
+       /* calculate 8-byte data end */
+       dend = dest + ((nbytes >> 3) * sizeof(u64));
+
+       if (pbuf->qw_written < PIO_BLOCK_QWS) {
+               /*
+                * Still within SOP block.  We don't need to check for
+                * wrap because we are still in the first block and
+                * can only wrap on block boundaries.
+                */
+               void __iomem *send;             /* SOP end */
+               void __iomem *xend;
+
+               /*
+                * calculate the end of data or end of block, whichever
+                * comes first
+                */
+               send = pbuf->start + PIO_BLOCK_SIZE;
+               xend = min(send, dend);
+
+               /* shift up to SOP=1 space */
+               dest += SOP_DISTANCE;
+               xend += SOP_DISTANCE;
+
+               /* write 8-byte chunk data */
+               while (dest < xend) {
+                       writeq(*(u64 *)from, dest);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+
+               /* shift down to SOP=0 space */
+               dest -= SOP_DISTANCE;
+       }
+       /*
+        * At this point dest could be (either, both, or neither):
+        * - at dend
+        * - at the wrap
+        */
+
+       /*
+        * If the wrap comes before or matches the data end,
+        * copy until the wrap, then wrap.
+        *
+        * If dest is at the wrap, we will fall into the if,
+        * not do the loop, then wrap.  dest is computed linearly from
+        * qw_written, so it may also already be beyond pbuf->end; that
+        * case takes the same path.
+        *
+        * If the data ends at the end of the SOP above and
+        * the buffer wraps, then pbuf->end == dend == dest
+        * and nothing will get written.
+        */
+       if (pbuf->end <= dend) {
+               while (dest < pbuf->end) {
+                       writeq(*(u64 *)from, dest);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+
+               dest -= pbuf->size;
+               dend -= pbuf->size;
+       }
+
+       /* write 8-byte non-SOP, non-wrap chunk data */
+       while (dest < dend) {
+               writeq(*(u64 *)from, dest);
+               from += sizeof(u64);
+               dest += sizeof(u64);
+       }
+
+       /* we know carry_bytes was zero on entry to this routine */
+       read_low_bytes(pbuf, from, nbytes & 0x7);
+
+       pbuf->qw_written += nbytes >> 3;
+}
+
+/*
+ * Segmented PIO Copy - middle
+ *
+ * Must handle any number of carry bytes left by the previous segment and
+ * any source alignment with any byte count.
+ *
+ * If the carry plus the new bytes cannot fill a QWORD, the new bytes are
+ * only appended to pbuf->carry.  Otherwise a misaligned source is first
+ * advanced to a QWORD boundary (writing out the carry if it fills up on
+ * the way), and the remaining bytes are handed to mid_copy_mix() or
+ * mid_copy_straight() depending on whether carry bytes remain.
+ *
+ * @pbuf: a number of blocks allocated within a PIO send context
+ * @from: data source
+ * @nbytes: number of bytes to copy
+ */
+void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes)
+{
+       unsigned long from_align = (unsigned long)from & 0x7;
+
+       if (pbuf->carry_bytes + nbytes < 8) {
+               /* not enough bytes to fill a QW */
+               read_extra_bytes(pbuf, from, nbytes);
+               return;
+       }
+
+       if (from_align) {
+               /* misaligned source pointer - align it */
+               unsigned long to_align;
+
+               /* bytes to read to align "from" */
+               to_align = 8 - from_align;
+
+               /*
+                * If we are here, we already know that carry+nbytes will
+                * fill at least one QW.  In the first case below that
+                * means nbytes > to_align, so no check against nbytes is
+                * needed.  In the second case only the bytes needed to
+                * fill carry are guaranteed to be available; the bytes
+                * needed to finish the alignment must be checked.
+                */
+               if (pbuf->carry_bytes + to_align < 8) {
+                       /* not enough align bytes to fill a QW */
+                       read_extra_bytes(pbuf, from, to_align);
+                       from += to_align;
+                       nbytes -= to_align;
+               } else {
+                       /* bytes to fill carry */
+                       unsigned long to_fill = 8 - pbuf->carry_bytes;
+                       /* bytes left over to be read */
+                       unsigned long extra = to_align - to_fill;
+                       void __iomem *dest;
+
+                       /* fill carry... */
+                       read_extra_bytes(pbuf, from, to_fill);
+                       from += to_fill;
+                       nbytes -= to_fill;
+                       /* may not be enough valid bytes left to align */
+                       if (extra > nbytes)
+                               extra = nbytes;
+
+                       /* ...now write carry */
+                       dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
+
+                       /*
+                        * The two checks immediately below cannot both be
+                        * true, hence the else.  If we have wrapped, we
+                        * cannot still be within the first block.
+                        * Conversely, if we are still in the first block, we
+                        * cannot have wrapped.  We do the wrap check first
+                        * as that is more likely.
+                        */
+                       /* adjust if we've wrapped */
+                       if (dest >= pbuf->end)
+                               dest -= pbuf->size;
+                       /* jump to SOP range if within the first block */
+                       else if (pbuf->qw_written < PIO_BLOCK_QWS)
+                               dest += SOP_DISTANCE;
+
+                       carry8_write8(pbuf->carry, dest);
+                       pbuf->qw_written++;
+
+                       /* read any extra bytes to do final alignment */
+                       /* this will overwrite anything in pbuf->carry */
+                       read_low_bytes(pbuf, from, extra);
+                       from += extra;
+                       nbytes -= extra;
+                       /*
+                        * If no bytes are left, return early - we are done.
+                        * NOTE: This short-circuit is *required* because
+                        * "extra" may have been reduced in size and "from"
+                        * is then not aligned, as required when leaving
+                        * this if block.
+                        */
+                       if (nbytes == 0)
+                               return;
+               }
+
+               /* at this point, from is QW aligned */
+       }
+
+       if (pbuf->carry_bytes)
+               mid_copy_mix(pbuf, from, nbytes);
+       else
+               mid_copy_straight(pbuf, from, nbytes);
+}
+
+/*
+ * Segmented PIO Copy - end
+ *
+ * Write any remainder (in pbuf->carry) and finish writing the whole block.
+ * The rest of the current block is padded with zero QWORDs.  On exit the
+ * send context's per-CPU buffers_allocated count is decremented and
+ * preempt_enable() is called.
+ *
+ * @pbuf: a number of blocks allocated within a PIO send context
+ */
+void seg_pio_copy_end(struct pio_buf *pbuf)
+{
+       void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
+
+       /*
+        * The two checks immediately below cannot both be true, hence the
+        * else.  If we have wrapped, we cannot still be within the first
+        * block.  Conversely, if we are still in the first block, we
+        * cannot have wrapped.  We do the wrap check first as that is
+        * more likely.
+        */
+       /* adjust if we have wrapped */
+       if (dest >= pbuf->end)
+               dest -= pbuf->size;
+       /* jump to the SOP range if within the first block */
+       else if (pbuf->qw_written < PIO_BLOCK_QWS)
+               dest += SOP_DISTANCE;
+
+       /* write final bytes, if any */
+       if (carry_write8(pbuf, dest)) {
+               dest += sizeof(u64);
+               /*
+                * NOTE: We do not need to recalculate whether dest needs
+                * SOP_DISTANCE or not.
+                *
+                * If we are in the first block and the dangle write
+                * keeps us in the same block, dest will need
+                * to retain SOP_DISTANCE in the loop below.
+                *
+                * If we are in the first block and the dangle write pushes
+                * us to the next block, then loop below will not run
+                * and dest is not used.  Hence we do not need to update
+                * it.
+                *
+                * If we are past the first block, then SOP_DISTANCE
+                * was never added, so there is nothing to do.
+                */
+       }
+
+       /* fill in rest of block */
+       while (((unsigned long)dest & PIO_BLOCK_MASK) != 0) {
+               writeq(0, dest);
+               dest += sizeof(u64);
+       }
+
+       /*
+        * finished with this buffer: drop the per-CPU allocated-buffer
+        * count of the owning send context and re-enable preemption.
+        * NOTE(review): the matching preempt_disable() is not in this
+        * file - presumably done when the buffer is allocated; confirm.
+        */
+       this_cpu_dec(*pbuf->sc->buffers_allocated);
+       preempt_enable();
+}
diff --git a/drivers/infiniband/hw/hfi1/platform.c b/drivers/infiniband/hw/hfi1/platform.c
new file mode 100644 (file)
index 0000000..03df932
--- /dev/null
@@ -0,0 +1,909 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "hfi.h"
+#include "efivar.h"
+
+/*
+ * Read the platform configuration from the "configuration" UEFI variable
+ * and attach the returned buffer and its size to dd->platform_config.
+ *
+ * If the read fails, platform_config_load is set so that the driver falls
+ * back to request firmware, and dd->platform_config is left untouched.
+ */
+void get_platform_config(struct hfi1_devdata *dd)
+{
+       u8 *efi_data = NULL;
+       unsigned long efi_size = 0;
+
+       if (read_hfi1_efi_var(dd, "configuration", &efi_size,
+                             (void **)&efi_data)) {
+               dd_dev_info(dd,
+                           "%s: Failed to get platform config from UEFI, falling back to request firmware\n",
+                           __func__);
+               /* fall back to request firmware */
+               platform_config_load = 1;
+               return;
+       }
+
+       dd->platform_config.size = efi_size;
+       dd->platform_config.data = efi_data;
+}
+
+/*
+ * Release the platform configuration buffer when it was read from EFI.
+ *
+ * When platform_config_load is set, the data belongs to the struct
+ * firmware platform_config, which dispose_firmware releases on driver
+ * exit, so nothing is freed here.
+ */
+void free_platform_config(struct hfi1_devdata *dd)
+{
+       if (platform_config_load)
+               return;
+
+       /* was loaded from EFI, release memory allocated by read_efi_var */
+       kfree(dd->platform_config.data);
+       /* drop the stale pointer so a repeated call cannot double free */
+       dd->platform_config.data = NULL;
+}
+
+/*
+ * Fetch the port type from entry 0 of the platform configuration port
+ * table into ppd->port_type.  If the lookup reports an error the port
+ * type is set to PORT_TYPE_UNKNOWN.
+ */
+void get_port_type(struct hfi1_pportdata *ppd)
+{
+       if (get_platform_config_field(ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
+                                     PORT_TABLE_PORT_TYPE, &ppd->port_type,
+                                     4))
+               ppd->port_type = PORT_TYPE_UNKNOWN;
+}
+
+/*
+ * Write the QSFP TX control byte: 0x0 when @on is non-zero, 0xF otherwise
+ * (presumably one disable bit per lane - confirm against the QSFP spec).
+ *
+ * Returns 0 when exactly one byte was written, -EIO when nothing was
+ * written, and otherwise passes through whatever qsfp_write returned.
+ */
+int set_qsfp_tx(struct hfi1_pportdata *ppd, int on)
+{
+       u8 tx_ctrl_byte;
+       int written;
+
+       if (on)
+               tx_ctrl_byte = 0x0;
+       else
+               tx_ctrl_byte = 0xF;
+
+       written = qsfp_write(ppd, ppd->dd->hfi1_id, QSFP_TX_CTRL_BYTE_OFFS,
+                            &tx_ctrl_byte, 1);
+
+       switch (written) {
+       case 1:
+               return 0;
+       case 0:
+               /* we expected 1, so consider 0 an error */
+               return -EIO;
+       default:
+               return written;
+       }
+}
+
+/*
+ * Check the cable's power class against the maximum allowed by the
+ * platform configuration system table.
+ *
+ * Returns the lookup error if the maximum cannot be read, -EPERM if the
+ * port's offline_disabled_reason is (or becomes) the power policy reason,
+ * and 0 otherwise.
+ */
+static int qual_power(struct hfi1_pportdata *ppd)
+{
+       u8 *cache = ppd->qsfp_info.cache;
+       u32 max_class = 0;
+       u32 cable_class;
+       int ret;
+
+       ret = get_platform_config_field(ppd->dd, PLATFORM_CONFIG_SYSTEM_TABLE,
+                                       0, SYSTEM_TABLE_QSFP_POWER_CLASS_MAX,
+                                       &max_class, 4);
+       if (ret)
+               return ret;
+
+       cable_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
+       if (cable_class > max_class)
+               ppd->offline_disabled_reason =
+                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY);
+
+       /* the reason may also have been set before this call */
+       if (ppd->offline_disabled_reason !=
+                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY))
+               return 0;
+
+       dd_dev_info(ppd->dd,
+                   "%s: Port disabled due to system power restrictions\n",
+                   __func__);
+       return -EPERM;
+}
+
+/*
+ * Check the cable's nominal bit rate bytes against the link speeds that
+ * are both supported and enabled on the port.
+ *
+ * Returns -EPERM if the port's offline_disabled_reason is (or becomes)
+ * the link speed policy reason, 0 otherwise.
+ */
+static int qual_bitrate(struct hfi1_pportdata *ppd)
+{
+       u16 supported = ppd->link_speed_supported;
+       u16 enabled = ppd->link_speed_enabled;
+       u8 *cache = ppd->qsfp_info.cache;
+
+       if (((supported & OPA_LINK_SPEED_25G) &&
+            (enabled & OPA_LINK_SPEED_25G) &&
+            cache[QSFP_NOM_BIT_RATE_250_OFFS] < 0x64) ||
+           ((supported & OPA_LINK_SPEED_12_5G) &&
+            (enabled & OPA_LINK_SPEED_12_5G) &&
+            cache[QSFP_NOM_BIT_RATE_100_OFFS] < 0x7D))
+               ppd->offline_disabled_reason =
+                          HFI1_ODR_MASK(OPA_LINKDOWN_REASON_LINKSPEED_POLICY);
+
+       /* the reason may also have been set before this call */
+       if (ppd->offline_disabled_reason !=
+                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_LINKSPEED_POLICY))
+               return 0;
+
+       dd_dev_info(ppd->dd,
+                   "%s: Cable failed bitrate check, disabling port\n",
+                   __func__);
+       return -EPERM;
+}
+
+/*
+ * Update the QSFP power control byte for cables above power class 1.
+ *
+ * The cached control byte gets bit 0 set and bit 1 cleared and is written
+ * to the module; for cables above power class 4, bit 2 is set as well and
+ * the byte is written a second time.  (NOTE(review): bit meanings are
+ * presumably those of the QSFP power control byte - confirm against the
+ * SFF spec.)
+ *
+ * Returns 0 on success or when nothing needs to be done, -EIO if a write
+ * does not report exactly one byte written.
+ */
+static int set_qsfp_high_power(struct hfi1_pportdata *ppd)
+{
+       u8 *cache = ppd->qsfp_info.cache;
+       u8 power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
+       u8 ctrl;
+
+       if (power_class <= QSFP_POWER_CLASS_1)
+               return 0;
+
+       ctrl = cache[QSFP_PWR_CTRL_BYTE_OFFS];
+       ctrl |= 1;
+       ctrl &= ~(0x2);
+
+       if (qsfp_write(ppd, ppd->dd->hfi1_id, QSFP_PWR_CTRL_BYTE_OFFS,
+                      &ctrl, 1) != 1)
+               return -EIO;
+
+       if (power_class > QSFP_POWER_CLASS_4) {
+               ctrl |= (1 << 2);
+               if (qsfp_write(ppd, ppd->dd->hfi1_id,
+                              QSFP_PWR_CTRL_BYTE_OFFS, &ctrl, 1) != 1)
+                       return -EIO;
+       }
+
+       /* SFF 8679 rev 1.7 LPMode Deassert time */
+       msleep(300);
+
+       return 0;
+}
+
+/*
+ * Fold the RX CDR setting into the low nibble of *cdr_ctrl_byte.
+ *
+ * Nothing is changed unless the cable cache reports RX CDR present with
+ * bypass supported.  Cables of power class 3 or lower always get RX CDR
+ * turned on; otherwise the RX preset table entry @rx_preset_index decides
+ * whether a setting is applied and what it is.
+ */
+static void apply_rx_cdr(struct hfi1_pportdata *ppd,
+                        u32 rx_preset_index,
+                        u8 *cdr_ctrl_byte)
+{
+       /* pre-zeroed so a failed lookup reads as "do not apply" */
+       u32 rx_preset = 0;
+       u8 *cache = ppd->qsfp_info.cache;
+       int cable_power_class;
+
+       if (!((cache[QSFP_MOD_PWR_OFFS] & 0x4) &&
+             (cache[QSFP_CDR_INFO_OFFS] & 0x40)))
+               return;
+
+       /* RX CDR present, bypass supported */
+       cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
+
+       if (cable_power_class <= QSFP_POWER_CLASS_3) {
+               /* Power class <= 3, ignore config & turn RX CDR on */
+               *cdr_ctrl_byte |= 0xF;
+               return;
+       }
+
+       get_platform_config_field(
+               ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE,
+               rx_preset_index, RX_PRESET_TABLE_QSFP_RX_CDR_APPLY,
+               &rx_preset, 4);
+
+       if (!rx_preset) {
+               dd_dev_info(
+                       ppd->dd,
+                       "%s: RX_CDR_APPLY is set to disabled\n",
+                       __func__);
+               return;
+       }
+
+       /* re-zero so a failed lookup cannot reuse the APPLY flag as data */
+       rx_preset = 0;
+       get_platform_config_field(
+               ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE,
+               rx_preset_index, RX_PRESET_TABLE_QSFP_RX_CDR,
+               &rx_preset, 4);
+
+       /* Expand cdr setting to all 4 lanes */
+       rx_preset = (rx_preset | (rx_preset << 1) |
+                       (rx_preset << 2) | (rx_preset << 3));
+
+       if (rx_preset) {
+               *cdr_ctrl_byte |= rx_preset;
+       } else {
+               *cdr_ctrl_byte &= rx_preset;
+               /* Preserve current TX CDR status */
+               *cdr_ctrl_byte |= (cache[QSFP_CDR_CTRL_BYTE_OFFS] & 0xF0);
+       }
+}
+
+/*
+ * Fold the TX CDR setting into the high nibble of *cdr_ctrl_byte.
+ *
+ * Nothing is changed unless the cable cache reports TX CDR present with
+ * bypass supported.  Cables of power class 3 or lower always get TX CDR
+ * turned on; otherwise the TX preset table entry @tx_preset_index decides
+ * whether a setting is applied and what it is.
+ */
+static void apply_tx_cdr(struct hfi1_pportdata *ppd,
+                        u32 tx_preset_index,
+                        u8 *cdr_ctrl_byte)
+{
+       /* pre-zeroed so a failed lookup reads as "do not apply" */
+       u32 tx_preset = 0;
+       u8 *cache = ppd->qsfp_info.cache;
+       int cable_power_class;
+
+       if (!((cache[QSFP_MOD_PWR_OFFS] & 0x8) &&
+             (cache[QSFP_CDR_INFO_OFFS] & 0x80)))
+               return;
+
+       /* TX CDR present, bypass supported */
+       cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
+
+       if (cable_power_class <= QSFP_POWER_CLASS_3) {
+               /* Power class <= 3, ignore config & turn TX CDR on */
+               *cdr_ctrl_byte |= 0xF0;
+               return;
+       }
+
+       get_platform_config_field(
+               ppd->dd,
+               PLATFORM_CONFIG_TX_PRESET_TABLE, tx_preset_index,
+               TX_PRESET_TABLE_QSFP_TX_CDR_APPLY, &tx_preset, 4);
+
+       if (!tx_preset) {
+               dd_dev_info(
+                       ppd->dd,
+                       "%s: TX_CDR_APPLY is set to disabled\n",
+                       __func__);
+               return;
+       }
+
+       /* re-zero so a failed lookup cannot reuse the APPLY flag as data */
+       tx_preset = 0;
+       get_platform_config_field(
+               ppd->dd,
+               PLATFORM_CONFIG_TX_PRESET_TABLE,
+               tx_preset_index,
+               TX_PRESET_TABLE_QSFP_TX_CDR, &tx_preset, 4);
+
+       /* Expand cdr setting to all 4 lanes */
+       tx_preset = (tx_preset | (tx_preset << 1) |
+                       (tx_preset << 2) | (tx_preset << 3));
+
+       if (tx_preset)
+               *cdr_ctrl_byte |= (tx_preset << 4);
+       else
+               /* Preserve current/determined RX CDR status */
+               *cdr_ctrl_byte &= ((tx_preset << 4) | 0xF);
+}
+
+/*
+ * Start from the cached CDR control byte, let the RX and then the TX CDR
+ * helpers adjust it, and write the result back to the module.  The result
+ * of the write is not checked.
+ */
+static void apply_cdr_settings(
+               struct hfi1_pportdata *ppd, u32 rx_preset_index,
+               u32 tx_preset_index)
+{
+       u8 cdr_ctrl_byte = ppd->qsfp_info.cache[QSFP_CDR_CTRL_BYTE_OFFS];
+
+       /* RX first: the TX helper preserves the RX nibble it finds */
+       apply_rx_cdr(ppd, rx_preset_index, &cdr_ctrl_byte);
+       apply_tx_cdr(ppd, tx_preset_index, &cdr_ctrl_byte);
+
+       qsfp_write(ppd, ppd->dd->hfi1_id, QSFP_CDR_CTRL_BYTE_OFFS,
+                  &cdr_ctrl_byte, 1);
+}
+
+/*
+ * If bit 3 of the cached EQ info byte is set, clear the low nibble of the
+ * cached byte at (128 * 3) + 241 and write it to module address
+ * (256 * 3) + 241.  The result of the write is not checked.
+ */
+static void apply_tx_eq_auto(struct hfi1_pportdata *ppd)
+{
+       u8 *cache = ppd->qsfp_info.cache;
+       u8 tx_eq;
+
+       if (cache[QSFP_EQ_INFO_OFFS] & 0x8) {
+               /* Disable adaptive TX EQ if present */
+               tx_eq = cache[(128 * 3) + 241] & 0xF0;
+               qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 241,
+                          &tx_eq, 1);
+       }
+}
+
+/*
+ * Program the TX EQ value from TX preset table entry @tx_preset_index.
+ *
+ * Nothing is done unless bit 2 of the cached EQ info byte is set and the
+ * preset's TX_EQ_APPLY field is non-zero.  A requested value larger than
+ * the high nibble of cache[(128 * 3) + 224] is replaced by that nibble.
+ * The value is duplicated into both nibbles and written to module
+ * addresses (256 * 3) + 234 and (256 * 3) + 235; the results of the
+ * writes are not checked.
+ */
+static void apply_tx_eq_prog(struct hfi1_pportdata *ppd, u32 tx_preset_index)
+{
+       u8 *cache = ppd->qsfp_info.cache;
+       /* pre-zeroed so a failed lookup reads as "do not apply" */
+       u32 tx_preset = 0;
+       u32 max_tx_eq;
+       u8 tx_eq;
+
+       if (!(cache[QSFP_EQ_INFO_OFFS] & 0x4))
+               return;
+
+       get_platform_config_field(
+               ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE,
+               tx_preset_index, TX_PRESET_TABLE_QSFP_TX_EQ_APPLY,
+               &tx_preset, 4);
+       if (!tx_preset) {
+               dd_dev_info(
+                       ppd->dd,
+                       "%s: TX_EQ_APPLY is set to disabled\n",
+                       __func__);
+               return;
+       }
+
+       /* re-zero so a failed lookup cannot reuse the APPLY flag as data */
+       tx_preset = 0;
+       get_platform_config_field(
+                       ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE,
+                       tx_preset_index, TX_PRESET_TABLE_QSFP_TX_EQ,
+                       &tx_preset, 4);
+
+       max_tx_eq = (cache[(128 * 3) + 224] & 0xF0) >> 4;
+       if (max_tx_eq < tx_preset) {
+               dd_dev_info(
+                       ppd->dd,
+                       "%s: TX EQ %x unsupported\n",
+                       __func__, tx_preset);
+
+               /* report the value in the same scale as the request above */
+               dd_dev_info(
+                       ppd->dd,
+                       "%s: Applying EQ %x\n",
+                       __func__, max_tx_eq);
+
+               tx_preset = max_tx_eq;
+       }
+
+       tx_eq = tx_preset | (tx_preset << 4);
+       qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 234, &tx_eq, 1);
+       qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 235, &tx_eq, 1);
+}
+
+/*
+ * Program the RX emphasis value from RX preset table entry
+ * @rx_preset_index.
+ *
+ * Nothing is done unless bit 1 of the cached EQ info byte is set and the
+ * preset's RX_EMP_APPLY field is non-zero.  A requested value larger than
+ * the low nibble of cache[(128 * 3) + 224] is replaced by that nibble.
+ * The value is duplicated into both nibbles and written to module
+ * addresses (256 * 3) + 236 and (256 * 3) + 237; the results of the
+ * writes are not checked.
+ */
+static void apply_rx_eq_emp(struct hfi1_pportdata *ppd, u32 rx_preset_index)
+{
+       /* pre-zeroed so a failed lookup reads as "do not apply" */
+       u32 rx_preset = 0;
+       u32 max_rx_emp;
+       u8 rx_eq, *cache = ppd->qsfp_info.cache;
+
+       if (!(cache[QSFP_EQ_INFO_OFFS] & 0x2))
+               return;
+       get_platform_config_field(
+                       ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE,
+                       rx_preset_index, RX_PRESET_TABLE_QSFP_RX_EMP_APPLY,
+                       &rx_preset, 4);
+
+       if (!rx_preset) {
+               dd_dev_info(
+                       ppd->dd,
+                       "%s: RX_EMP_APPLY is set to disabled\n",
+                       __func__);
+               return;
+       }
+
+       /* re-zero so a failed lookup cannot reuse the APPLY flag as data */
+       rx_preset = 0;
+       get_platform_config_field(
+               ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE,
+               rx_preset_index, RX_PRESET_TABLE_QSFP_RX_EMP,
+               &rx_preset, 4);
+
+       max_rx_emp = cache[(128 * 3) + 224] & 0xF;
+       if (max_rx_emp < rx_preset) {
+               dd_dev_info(
+                       ppd->dd,
+                       "%s: Requested RX EMP %x\n",
+                       __func__, rx_preset);
+
+               dd_dev_info(
+                       ppd->dd,
+                       "%s: Applying supported EMP %x\n",
+                       __func__, max_rx_emp);
+
+               rx_preset = max_rx_emp;
+       }
+
+       rx_eq = rx_preset | (rx_preset << 4);
+
+       qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 236, &rx_eq, 1);
+       qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 237, &rx_eq, 1);
+}
+
+/*
+ * Run the three equalization helpers (adaptive TX EQ, programmable TX EQ,
+ * RX emphasis) in that order, provided bit 2 of cache[2] is clear.  When
+ * the bit is set the module is reported as lacking upper page 03 and
+ * nothing is applied.
+ */
+static void apply_eq_settings(struct hfi1_pportdata *ppd,
+                             u32 rx_preset_index, u32 tx_preset_index)
+{
+       u8 *cache = ppd->qsfp_info.cache;
+
+       if (!(cache[2] & 4)) {
+               apply_tx_eq_auto(ppd);
+               apply_tx_eq_prog(ppd, tx_preset_index);
+               apply_rx_eq_emp(ppd, rx_preset_index);
+               return;
+       }
+
+       /* no point going on w/o a page 3 */
+       dd_dev_info(ppd->dd,
+                   "%s: Upper page 03 not present\n",
+                   __func__);
+}
+
+/*
+ * Program the RX amplitude from RX preset table entry @rx_preset_index.
+ *
+ * Nothing is done when bit 2 of cache[2] is set (no upper page 03), when
+ * bit 0 of the cached EQ info byte is clear, or when the preset's
+ * RX_AMP_APPLY field is zero.  Otherwise bits 0-3 of
+ * cache[(128 * 3) + 225] are scanned in ascending order: the scan stops
+ * at the requested amplitude if its bit is set, else the highest set bit
+ * index is used.  The chosen value is duplicated into both nibbles and
+ * written to module addresses (256 * 3) + 238 and (256 * 3) + 239; the
+ * results of the writes are not checked.
+ *
+ * @tx_preset_index is not used.
+ */
+static void apply_rx_amplitude_settings(
+               struct hfi1_pportdata *ppd, u32 rx_preset_index,
+               u32 tx_preset_index)
+{
+       /* pre-zeroed so a failed lookup reads as "do not apply" */
+       u32 rx_preset = 0;
+       u8 rx_amp = 0, i = 0, preferred = 0, *cache = ppd->qsfp_info.cache;
+
+       /* no point going on w/o a page 3 */
+       if (cache[2] & 4) {
+               dd_dev_info(ppd->dd,
+                           "%s: Upper page 03 not present\n",
+                           __func__);
+               return;
+       }
+       if (!(cache[QSFP_EQ_INFO_OFFS] & 0x1)) {
+               dd_dev_info(ppd->dd,
+                           "%s: RX_AMP_APPLY is set to disabled\n",
+                           __func__);
+               return;
+       }
+
+       get_platform_config_field(ppd->dd,
+                                 PLATFORM_CONFIG_RX_PRESET_TABLE,
+                                 rx_preset_index,
+                                 RX_PRESET_TABLE_QSFP_RX_AMP_APPLY,
+                                 &rx_preset, 4);
+
+       if (!rx_preset) {
+               dd_dev_info(ppd->dd,
+                           "%s: RX_AMP_APPLY is set to disabled\n",
+                           __func__);
+               return;
+       }
+
+       /* re-zero so a failed lookup cannot reuse the APPLY flag as data */
+       rx_preset = 0;
+       get_platform_config_field(ppd->dd,
+                                 PLATFORM_CONFIG_RX_PRESET_TABLE,
+                                 rx_preset_index,
+                                 RX_PRESET_TABLE_QSFP_RX_AMP,
+                                 &rx_preset, 4);
+
+       dd_dev_info(ppd->dd,
+                   "%s: Requested RX AMP %x\n",
+                   __func__,
+                   rx_preset);
+
+       for (i = 0; i < 4; i++) {
+               if (cache[(128 * 3) + 225] & (1 << i)) {
+                       preferred = i;
+                       if (preferred == rx_preset)
+                               break;
+               }
+       }
+
+       /*
+        * Verify that preferred RX amplitude is not just a
+        * fall through of the default
+        */
+       if (!preferred && !(cache[(128 * 3) + 225] & 0x1)) {
+               dd_dev_info(ppd->dd, "No supported RX AMP, not applying\n");
+               return;
+       }
+
+       dd_dev_info(ppd->dd,
+                   "%s: Applying RX AMP %x\n", __func__, preferred);
+
+       rx_amp = preferred | (preferred << 4);
+       qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 238, &rx_amp, 1);
+       qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 239, &rx_amp, 1);
+}
+
+#define OPA_INVALID_INDEX 0xFFF
+
+/*
+ * Load @config_data into 8051 configuration field @field_id for each of
+ * the four lanes.  A failure on one lane is logged, using @message to
+ * describe the operation, and the remaining lanes are still attempted.
+ */
+static void apply_tx_lanes(struct hfi1_pportdata *ppd, u8 field_id,
+                          u32 config_data, const char *message)
+{
+       u8 i;
+       int ret = HCMD_SUCCESS;
+
+       for (i = 0; i < 4; i++) {
+               ret = load_8051_config(ppd->dd, field_id, i, config_data);
+               if (ret != HCMD_SUCCESS) {
+                       /* function name first, like every other message */
+                       dd_dev_err(
+                               ppd->dd,
+                               "%s: %s for lane %u failed\n",
+                               __func__, message, i);
+               }
+       }
+}
+
+/*
+ * Push the tuning parameters to the 8051.
+ *
+ * In order: stores @limiting_active in the external-device-config field
+ * of LINK_OPTIMIZATION_SETTINGS, stores @tuning_method in
+ * LINK_TUNING_PARAMETERS, loads @total_atten as the channel loss of all
+ * four lanes, and, when the QSFP cache is valid, stores capability bits
+ * derived from the cache in the low byte of DC_HOST_COMM_SETTINGS.
+ * Finally, unless @tx_preset_index is OPA_INVALID_INDEX, the precursor,
+ * attenuation and postcursor values of that TX preset are loaded into
+ * TX_EQ_SETTINGS for all four lanes.
+ *
+ * Failures are logged only; nothing is returned to the caller.
+ */
+static void apply_tunings(
+               struct hfi1_pportdata *ppd, u32 tx_preset_index,
+               u8 tuning_method, u32 total_atten, u8 limiting_active)
+{
+       int ret = 0;
+       u32 config_data = 0, tx_preset = 0;
+       u8 precur = 0, attn = 0, postcur = 0, external_device_config = 0;
+       u8 *cache = ppd->qsfp_info.cache;
+
+       /* Enable external device config if channel is limiting active */
+       read_8051_config(ppd->dd, LINK_OPTIMIZATION_SETTINGS,
+                        GENERAL_CONFIG, &config_data);
+       /* unsigned mask keeps the shift well defined for any shift amount */
+       config_data &= ~(0xffU << ENABLE_EXT_DEV_CONFIG_SHIFT);
+       config_data |= ((u32)limiting_active << ENABLE_EXT_DEV_CONFIG_SHIFT);
+       ret = load_8051_config(ppd->dd, LINK_OPTIMIZATION_SETTINGS,
+                              GENERAL_CONFIG, config_data);
+       if (ret != HCMD_SUCCESS)
+               dd_dev_err(
+                       ppd->dd,
+                       "%s: Failed to set enable external device config\n",
+                       __func__);
+
+       config_data = 0; /* re-init  */
+       /* Pass tuning method to 8051 */
+       read_8051_config(ppd->dd, LINK_TUNING_PARAMETERS, GENERAL_CONFIG,
+                        &config_data);
+       config_data &= ~(0xffU << TUNING_METHOD_SHIFT);
+       config_data |= ((u32)tuning_method << TUNING_METHOD_SHIFT);
+       ret = load_8051_config(ppd->dd, LINK_TUNING_PARAMETERS, GENERAL_CONFIG,
+                              config_data);
+       if (ret != HCMD_SUCCESS)
+               dd_dev_err(ppd->dd, "%s: Failed to set tuning method\n",
+                          __func__);
+
+       /* Set same channel loss for both TX and RX */
+       config_data = (total_atten << 16) | (total_atten << 24);
+       apply_tx_lanes(ppd, CHANNEL_LOSS_SETTINGS, config_data,
+                      "Setting channel loss");
+
+       /* Inform 8051 of cable capabilities */
+       if (ppd->qsfp_info.cache_valid) {
+               /*
+                * NOTE(review): both QSFP_MOD_PWR_OFFS bits land on bit 5
+                * and both QSFP_EQ_INFO_OFFS bits land on bit 2 - confirm
+                * this collapse is what the 8051 expects.
+                */
+               external_device_config =
+                       ((cache[QSFP_MOD_PWR_OFFS] & 0x4) << 3) |
+                       ((cache[QSFP_MOD_PWR_OFFS] & 0x8) << 2) |
+                       ((cache[QSFP_EQ_INFO_OFFS] & 0x2) << 1) |
+                       (cache[QSFP_EQ_INFO_OFFS] & 0x4);
+               /* re-init so a failed read cannot reuse the channel loss */
+               config_data = 0;
+               read_8051_config(ppd->dd, DC_HOST_COMM_SETTINGS,
+                                GENERAL_CONFIG, &config_data);
+               /* Clear, then set the external device config field */
+               config_data &= ~(u32)0xFF;
+               config_data |= external_device_config;
+               ret = load_8051_config(ppd->dd, DC_HOST_COMM_SETTINGS,
+                                      GENERAL_CONFIG, config_data);
+               if (ret != HCMD_SUCCESS)
+                       dd_dev_info(ppd->dd,
+                                   "%s: Failed set ext device config params\n",
+                                   __func__);
+       }
+
+       if (tx_preset_index == OPA_INVALID_INDEX) {
+               if (ppd->port_type == PORT_TYPE_QSFP && limiting_active)
+                       dd_dev_info(ppd->dd, "%s: Invalid Tx preset index\n",
+                                   __func__);
+               return;
+       }
+
+       /*
+        * Following for limiting active channels only.  tx_preset is
+        * re-zeroed before each lookup so a failure reads as 0 rather
+        * than as the previous field's value.
+        */
+       tx_preset = 0;
+       get_platform_config_field(
+               ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE, tx_preset_index,
+               TX_PRESET_TABLE_PRECUR, &tx_preset, 4);
+       precur = tx_preset;
+
+       tx_preset = 0;
+       get_platform_config_field(
+               ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE,
+               tx_preset_index, TX_PRESET_TABLE_ATTN, &tx_preset, 4);
+       attn = tx_preset;
+
+       tx_preset = 0;
+       get_platform_config_field(
+               ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE,
+               tx_preset_index, TX_PRESET_TABLE_POSTCUR, &tx_preset, 4);
+       postcur = tx_preset;
+
+       config_data = precur | (attn << 8) | (postcur << 16);
+
+       apply_tx_lanes(ppd, TX_EQ_SETTINGS, config_data,
+                      "Applying TX settings");
+}
+
+/*
+ * Tune a QSFP module that is handled as limiting active.
+ * Must be holding the QSFP i2c resource.
+ *
+ * Marks the module limiting active, writes the TX control byte with TX
+ * "off", runs the power and bitrate checks, resets the module on every
+ * second pass (tracked in qsfp_info.reset_needed), updates the power
+ * control byte, and reads the TX/RX preset indices and the local
+ * attenuation from the port table into the caller's variables.  The CDR,
+ * EQ and RX amplitude settings are then applied and the TX control byte
+ * is written with TX "on".
+ *
+ * Returns 0 on success or the first error encountered.  When a preset
+ * index lookup fails, the corresponding output is set to
+ * OPA_INVALID_INDEX before returning.
+ */
+static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
+                           u32 *ptr_rx_preset, u32 *ptr_total_atten)
+{
+       u16 supported = ppd->link_speed_supported;
+       u16 enabled = ppd->link_speed_enabled;
+       u8 *cache = ppd->qsfp_info.cache;
+       int tx_idx_field;
+       int ret;
+
+       ppd->qsfp_info.limiting_active = 1;
+
+       ret = set_qsfp_tx(ppd, 0);
+       if (!ret)
+               ret = qual_power(ppd);
+       if (!ret)
+               ret = qual_bitrate(ppd);
+       if (ret)
+               return ret;
+
+       if (!ppd->qsfp_info.reset_needed) {
+               ppd->qsfp_info.reset_needed = 1;
+       } else {
+               reset_qsfp(ppd);
+               ppd->qsfp_info.reset_needed = 0;
+               refresh_qsfp_cache(ppd, &ppd->qsfp_info);
+       }
+
+       ret = set_qsfp_high_power(ppd);
+       if (ret)
+               return ret;
+
+       /* which TX preset index applies depends on EQ info bit 2 */
+       if (cache[QSFP_EQ_INFO_OFFS] & 0x4)
+               tx_idx_field = PORT_TABLE_TX_PRESET_IDX_ACTIVE_EQ;
+       else
+               tx_idx_field = PORT_TABLE_TX_PRESET_IDX_ACTIVE_NO_EQ;
+
+       ret = get_platform_config_field(ppd->dd, PLATFORM_CONFIG_PORT_TABLE,
+                                       0, tx_idx_field, ptr_tx_preset, 4);
+       if (ret) {
+               *ptr_tx_preset = OPA_INVALID_INDEX;
+               return ret;
+       }
+
+       ret = get_platform_config_field(ppd->dd, PLATFORM_CONFIG_PORT_TABLE,
+                                       0, PORT_TABLE_RX_PRESET_IDX,
+                                       ptr_rx_preset, 4);
+       if (ret) {
+               *ptr_rx_preset = OPA_INVALID_INDEX;
+               return ret;
+       }
+
+       /* attenuation lookup results are not checked */
+       if ((supported & OPA_LINK_SPEED_25G) &&
+           (enabled & OPA_LINK_SPEED_25G)) {
+               get_platform_config_field(
+                       ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
+                       PORT_TABLE_LOCAL_ATTEN_25G, ptr_total_atten, 4);
+       } else if ((supported & OPA_LINK_SPEED_12_5G) &&
+                  (enabled & OPA_LINK_SPEED_12_5G)) {
+               get_platform_config_field(
+                       ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
+                       PORT_TABLE_LOCAL_ATTEN_12G, ptr_total_atten, 4);
+       }
+
+       apply_cdr_settings(ppd, *ptr_rx_preset, *ptr_tx_preset);
+       apply_eq_settings(ppd, *ptr_rx_preset, *ptr_tx_preset);
+       apply_rx_amplitude_settings(ppd, *ptr_rx_preset, *ptr_tx_preset);
+
+       return set_qsfp_tx(ppd, 1);
+}
+
+/*
+ * Choose the tuning path from the high nibble of the cached module
+ * technology byte.
+ *
+ * 0xA-0xB: passive tuning.  *ptr_total_atten becomes the sum of the local
+ *          and remote 25G attenuation from the port table plus the cable
+ *          attenuation from the cache; a cached value of 0 or above 36 is
+ *          replaced by the system table default.
+ * 0x0-0x9, 0xC, 0xE: active tuning through tune_active_qsfp().
+ * anything else: logged as unknown/unsupported; outputs are untouched.
+ *
+ * Returns 0 on success (including the unknown case) or the first error
+ * reported by a lookup or by tune_active_qsfp().
+ */
+static int tune_qsfp(struct hfi1_pportdata *ppd,
+                    u32 *ptr_tx_preset, u32 *ptr_rx_preset,
+                    u8 *ptr_tuning_method, u32 *ptr_total_atten)
+{
+       u16 supported = ppd->link_speed_supported;
+       u16 enabled = ppd->link_speed_enabled;
+       u32 local_atten = 0, cable_loss = 0, far_atten = 0;
+       u8 *cache = ppd->qsfp_info.cache;
+       int ret;
+
+       switch ((cache[QSFP_MOD_TECH_OFFS] & 0xF0) >> 4) {
+       case 0x0 ... 0x9: /* fallthrough */
+       case 0xC: /* fallthrough */
+       case 0xE:
+               ret = tune_active_qsfp(ppd, ptr_tx_preset, ptr_rx_preset,
+                                      ptr_total_atten);
+               if (!ret)
+                       *ptr_tuning_method = OPA_ACTIVE_TUNING;
+               return ret;
+       case 0xA ... 0xB:
+               /* passive, handled below */
+               break;
+       default:
+               /* 0xD, 0xF */
+               dd_dev_info(ppd->dd, "%s: Unknown/unsupported cable\n",
+                           __func__);
+               return 0;
+       }
+
+       ret = get_platform_config_field(ppd->dd, PLATFORM_CONFIG_PORT_TABLE,
+                                       0, PORT_TABLE_LOCAL_ATTEN_25G,
+                                       &local_atten, 4);
+       if (ret)
+               return ret;
+
+       if ((supported & OPA_LINK_SPEED_25G) &&
+           (enabled & OPA_LINK_SPEED_25G))
+               cable_loss = cache[QSFP_CU_ATTEN_12G_OFFS];
+       else if ((supported & OPA_LINK_SPEED_12_5G) &&
+                (enabled & OPA_LINK_SPEED_12_5G))
+               cable_loss = cache[QSFP_CU_ATTEN_7G_OFFS];
+
+       /* Fallback to configured attenuation if cable memory is bad */
+       if (cable_loss == 0 || cable_loss > 36) {
+               ret = get_platform_config_field(
+                       ppd->dd, PLATFORM_CONFIG_SYSTEM_TABLE, 0,
+                       SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_25G,
+                       &cable_loss, 4);
+               if (ret)
+                       return ret;
+       }
+
+       ret = get_platform_config_field(ppd->dd, PLATFORM_CONFIG_PORT_TABLE,
+                                       0, PORT_TABLE_REMOTE_ATTEN_25G,
+                                       &far_atten, 4);
+       if (ret)
+               return ret;
+
+       *ptr_total_atten = local_atten + cable_loss + far_atten;
+       *ptr_tuning_method = OPA_PASSIVE_TUNING;
+
+       return 0;
+}
+
+/*
+ * Work out the SerDes tuning for the port according to its port type and
+ * hand the result to apply_tunings().
+ *
+ * Nothing is returned: the outcome is reported through
+ * ppd->driver_link_ready (1 on success, 0 on failure), which
+ * start_link(...) checks before going on with link negotiation and
+ * initialization.  ppd->offline_disabled_reason records why a port was
+ * taken out of service.
+ */
+void tune_serdes(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u32 rx_preset_index = OPA_INVALID_INDEX;
+       u32 tx_preset_index = OPA_INVALID_INDEX;
+       u32 local_atten = 0, far_atten = 0, total_atten = 0;
+       u8 tuning_method = 0, limiting_active = 0;
+       int ret = 0;
+
+       /* defaults: link enabled, driver not ready, no offline reason */
+       ppd->link_enabled = 1;
+       ppd->driver_link_ready = 0;
+       ppd->offline_disabled_reason = HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE);
+
+       /* Skip the tuning for testing (loopback != none) and simulations */
+       if (loopback != LOOPBACK_NONE ||
+           dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+               ppd->driver_link_ready = 1;
+               return;
+       }
+
+       switch (ppd->port_type) {
+       case PORT_TYPE_DISCONNECTED:
+               ppd->offline_disabled_reason =
+                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_DISCONNECTED);
+               dd_dev_info(dd, "%s: Port disconnected, disabling port\n",
+                           __func__);
+               goto bail;
+       case PORT_TYPE_VARIABLE:
+               if (!qsfp_mod_present(ppd)) {
+                       ppd->offline_disabled_reason =
+                            HFI1_ODR_MASK(OPA_LINKDOWN_REASON_CHASSIS_CONFIG);
+                       goto bail;
+               }
+               /* module present: tuned exactly like a fixed port */
+               /* fallthrough */
+       case PORT_TYPE_FIXED:
+               /* local_atten, far_atten pre-zeroed to catch error */
+               get_platform_config_field(
+                       dd, PLATFORM_CONFIG_PORT_TABLE, 0,
+                       PORT_TABLE_LOCAL_ATTEN_25G, &local_atten, 4);
+
+               get_platform_config_field(
+                       dd, PLATFORM_CONFIG_PORT_TABLE, 0,
+                       PORT_TABLE_REMOTE_ATTEN_25G, &far_atten, 4);
+
+               total_atten = local_atten + far_atten;
+               tuning_method = OPA_PASSIVE_TUNING;
+               break;
+       case PORT_TYPE_QSFP:
+               if (!qsfp_mod_present(ppd)) {
+                       ppd->offline_disabled_reason =
+                          HFI1_ODR_MASK(
+                               OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED);
+                       goto bail;
+               }
+
+               ret = acquire_chip_resource(dd, qsfp_resource(dd), QSFP_WAIT);
+               if (ret) {
+                       dd_dev_err(dd, "%s: hfi%d: cannot lock i2c chain\n",
+                                  __func__, (int)dd->hfi1_id);
+                       goto bail;
+               }
+               refresh_qsfp_cache(ppd, &ppd->qsfp_info);
+
+               if (!ppd->qsfp_info.cache_valid) {
+                       dd_dev_err(dd,
+                                  "%s: Reading QSFP memory failed\n",
+                                  __func__);
+                       ret = -EINVAL; /* a fail indication */
+               } else {
+                       ret = tune_qsfp(ppd, &tx_preset_index,
+                                       &rx_preset_index, &tuning_method,
+                                       &total_atten);
+
+                       /*
+                        * We may have modified the QSFP memory, so
+                        * update the cache to reflect the changes
+                        */
+                       refresh_qsfp_cache(ppd, &ppd->qsfp_info);
+                       limiting_active = ppd->qsfp_info.limiting_active;
+               }
+
+               /* the resource is released on success and failure alike */
+               release_chip_resource(dd, qsfp_resource(dd));
+               if (ret)
+                       goto bail;
+               break;
+       default:
+               dd_dev_info(dd, "%s: Unknown port type\n", __func__);
+               ppd->port_type = PORT_TYPE_UNKNOWN;
+               tuning_method = OPA_UNKNOWN_TUNING;
+               total_atten = 0;
+               limiting_active = 0;
+               tx_preset_index = OPA_INVALID_INDEX;
+               break;
+       }
+
+       if (ppd->offline_disabled_reason ==
+                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE))
+               apply_tunings(ppd, tx_preset_index, tuning_method,
+                             total_atten, limiting_active);
+
+       if (!ret)
+               ppd->driver_link_ready = 1;
+
+       return;
+bail:
+       ppd->driver_link_ready = 0;
+}
diff --git a/drivers/infiniband/hw/hfi1/platform.h b/drivers/infiniband/hw/hfi1/platform.h
new file mode 100644 (file)
index 0000000..e2c2161
--- /dev/null
@@ -0,0 +1,305 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef __PLATFORM_H
+#define __PLATFORM_H
+
+#define METADATA_TABLE_FIELD_START_SHIFT               0
+#define METADATA_TABLE_FIELD_START_LEN_BITS            15
+#define METADATA_TABLE_FIELD_LEN_SHIFT                 16
+#define METADATA_TABLE_FIELD_LEN_LEN_BITS              16
+
+/* Header structure */
+#define PLATFORM_CONFIG_HEADER_RECORD_IDX_SHIFT                        0
+#define PLATFORM_CONFIG_HEADER_RECORD_IDX_LEN_BITS             6
+#define PLATFORM_CONFIG_HEADER_TABLE_LENGTH_SHIFT              16
+#define PLATFORM_CONFIG_HEADER_TABLE_LENGTH_LEN_BITS           12
+#define PLATFORM_CONFIG_HEADER_TABLE_TYPE_SHIFT                        28
+#define PLATFORM_CONFIG_HEADER_TABLE_TYPE_LEN_BITS             4
+
+/*
+ * Identifiers for the tables carried in the platform configuration data.
+ * platform_config_cache.config_tables[] and platform_config_table_limits[]
+ * are both dimensioned by PLATFORM_CONFIG_TABLE_MAX.
+ */
+enum platform_config_table_type_encoding {
+       PLATFORM_CONFIG_TABLE_RESERVED,
+       PLATFORM_CONFIG_SYSTEM_TABLE,
+       PLATFORM_CONFIG_PORT_TABLE,
+       PLATFORM_CONFIG_RX_PRESET_TABLE,
+       PLATFORM_CONFIG_TX_PRESET_TABLE,
+       PLATFORM_CONFIG_QSFP_ATTEN_TABLE,
+       PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE,
+       PLATFORM_CONFIG_TABLE_MAX
+};
+
+/*
+ * Field identifiers of the system table.  SYSTEM_TABLE_MAX is one past the
+ * last identifier and is the system-table entry of
+ * platform_config_table_limits[].
+ */
+enum platform_config_system_table_fields {
+       SYSTEM_TABLE_RESERVED,
+       SYSTEM_TABLE_NODE_STRING,
+       SYSTEM_TABLE_SYSTEM_IMAGE_GUID,
+       SYSTEM_TABLE_NODE_GUID,
+       SYSTEM_TABLE_REVISION,
+       SYSTEM_TABLE_VENDOR_OUI,
+       SYSTEM_TABLE_META_VERSION,
+       SYSTEM_TABLE_DEVICE_ID,
+       SYSTEM_TABLE_PARTITION_ENFORCEMENT_CAP,
+       SYSTEM_TABLE_QSFP_POWER_CLASS_MAX,
+       SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_12G,
+       SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_25G,
+       SYSTEM_TABLE_VARIABLE_TABLE_ENTRIES_PER_PORT,
+       SYSTEM_TABLE_MAX
+};
+
+/*
+ * Field identifiers of the port table.  PORT_TABLE_MAX is one past the
+ * last identifier and is the port-table entry of
+ * platform_config_table_limits[].
+ */
+enum platform_config_port_table_fields {
+       PORT_TABLE_RESERVED,
+       PORT_TABLE_PORT_TYPE,
+       PORT_TABLE_LOCAL_ATTEN_12G,
+       PORT_TABLE_LOCAL_ATTEN_25G,
+       PORT_TABLE_LINK_SPEED_SUPPORTED,
+       PORT_TABLE_LINK_WIDTH_SUPPORTED,
+       PORT_TABLE_AUTO_LANE_SHEDDING_ENABLED,
+       PORT_TABLE_EXTERNAL_LOOPBACK_ALLOWED,
+       PORT_TABLE_VL_CAP,
+       PORT_TABLE_MTU_CAP,
+       PORT_TABLE_TX_LANE_ENABLE_MASK,
+       PORT_TABLE_LOCAL_MAX_TIMEOUT,
+       PORT_TABLE_REMOTE_ATTEN_12G,
+       PORT_TABLE_REMOTE_ATTEN_25G,
+       PORT_TABLE_TX_PRESET_IDX_ACTIVE_NO_EQ,
+       PORT_TABLE_TX_PRESET_IDX_ACTIVE_EQ,
+       PORT_TABLE_RX_PRESET_IDX,
+       PORT_TABLE_CABLE_REACH_CLASS,
+       PORT_TABLE_MAX
+};
+
+/*
+ * Field identifiers of the RX preset table.  RX_PRESET_TABLE_MAX is one
+ * past the last identifier.
+ */
+enum platform_config_rx_preset_table_fields {
+       RX_PRESET_TABLE_RESERVED,
+       RX_PRESET_TABLE_QSFP_RX_CDR_APPLY,
+       RX_PRESET_TABLE_QSFP_RX_EMP_APPLY,
+       RX_PRESET_TABLE_QSFP_RX_AMP_APPLY,
+       RX_PRESET_TABLE_QSFP_RX_CDR,
+       RX_PRESET_TABLE_QSFP_RX_EMP,
+       RX_PRESET_TABLE_QSFP_RX_AMP,
+       RX_PRESET_TABLE_MAX
+};
+
+/*
+ * Field identifiers of the TX preset table.  TX_PRESET_TABLE_MAX is one
+ * past the last identifier.
+ */
+enum platform_config_tx_preset_table_fields {
+       TX_PRESET_TABLE_RESERVED,
+       TX_PRESET_TABLE_PRECUR,
+       TX_PRESET_TABLE_ATTN,
+       TX_PRESET_TABLE_POSTCUR,
+       TX_PRESET_TABLE_QSFP_TX_CDR_APPLY,
+       TX_PRESET_TABLE_QSFP_TX_EQ_APPLY,
+       TX_PRESET_TABLE_QSFP_TX_CDR,
+       TX_PRESET_TABLE_QSFP_TX_EQ,
+       TX_PRESET_TABLE_MAX
+};
+
+/*
+ * Field identifiers of the QSFP attenuation table.  QSFP_ATTEN_TABLE_MAX
+ * is one past the last identifier.
+ */
+enum platform_config_qsfp_attn_table_fields {
+       QSFP_ATTEN_TABLE_RESERVED,
+       QSFP_ATTEN_TABLE_TX_PRESET_IDX,
+       QSFP_ATTEN_TABLE_RX_PRESET_IDX,
+       QSFP_ATTEN_TABLE_MAX
+};
+
+/*
+ * Field identifiers of the variable settings table.
+ * VARIABLE_SETTINGS_TABLE_MAX is one past the last identifier.
+ */
+enum platform_config_variable_settings_table_fields {
+       VARIABLE_SETTINGS_TABLE_RESERVED,
+       VARIABLE_SETTINGS_TABLE_TX_PRESET_IDX,
+       VARIABLE_SETTINGS_TABLE_RX_PRESET_IDX,
+       VARIABLE_SETTINGS_TABLE_MAX
+};
+
+/*
+ * A read-only byte buffer and its length.
+ * NOTE(review): presumably the raw platform configuration image that
+ * platform.c obtains and parses - confirm against get_platform_config().
+ */
+struct platform_config {
+       size_t size;
+       const u8 *data;
+};
+
+/*
+ * Per-table-type entry of platform_config_cache.config_tables[].
+ * NOTE(review): @table / @table_metadata presumably point into the platform
+ * configuration image and @num_table counts the tables of this type -
+ * confirm against parse_platform_config().
+ */
+struct platform_config_data {
+       u32 *table;
+       u32 *table_metadata;
+       u32 num_table;
+};
+
+/*
+ * This struct acts as a quick reference into the platform_data binary image
+ * and is populated by parse_platform_config(...) depending on the specific
+ * META_VERSION
+ *
+ * config_tables[] has one slot per value of
+ * enum platform_config_table_type_encoding below PLATFORM_CONFIG_TABLE_MAX.
+ */
+struct platform_config_cache {
+       u8  cache_valid;
+       struct platform_config_data config_tables[PLATFORM_CONFIG_TABLE_MAX];
+};
+
+/*
+ * Per-table-type limit: the *_TABLE_MAX value of the field enum belonging
+ * to each table type (0 for the reserved type).  Indexed by
+ * enum platform_config_table_type_encoding.
+ */
+static const u32 platform_config_table_limits[PLATFORM_CONFIG_TABLE_MAX] = {
+       [PLATFORM_CONFIG_TABLE_RESERVED]          = 0,
+       [PLATFORM_CONFIG_SYSTEM_TABLE]            = SYSTEM_TABLE_MAX,
+       [PLATFORM_CONFIG_PORT_TABLE]              = PORT_TABLE_MAX,
+       [PLATFORM_CONFIG_RX_PRESET_TABLE]         = RX_PRESET_TABLE_MAX,
+       [PLATFORM_CONFIG_TX_PRESET_TABLE]         = TX_PRESET_TABLE_MAX,
+       [PLATFORM_CONFIG_QSFP_ATTEN_TABLE]        = QSFP_ATTEN_TABLE_MAX,
+       [PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE] = VARIABLE_SETTINGS_TABLE_MAX,
+};
+
+/* This section defines default values and encodings for the
+ * fields defined for each table above
+ */
+
+/*
+ * =====================================================
+ *  System table encodings
+ * =====================================================
+ */
+#define PLATFORM_CONFIG_MAGIC_NUM              0x3d4f5041
+#define PLATFORM_CONFIG_MAGIC_NUMBER_LEN       4
+
+/*
+ * These power classes are the same as defined in SFF 8636 spec rev 2.4
+ * describing byte 129 in table 6-16, except enumerated in a different order
+ *
+ * The encoding starts at 1; value 0 has no name here.
+ */
+enum platform_config_qsfp_power_class_encoding {
+       QSFP_POWER_CLASS_1 = 1,
+       QSFP_POWER_CLASS_2,
+       QSFP_POWER_CLASS_3,
+       QSFP_POWER_CLASS_4,
+       QSFP_POWER_CLASS_5,
+       QSFP_POWER_CLASS_6,
+       QSFP_POWER_CLASS_7
+};
+
+/*
+ * ====================================================
+ *  Port table encodings
+ * ====================================================
+ */
+/*
+ * Port type encodings.  tune_serdes() stores PORT_TYPE_UNKNOWN in
+ * ppd->port_type when it meets a type it does not handle.
+ */
+enum platform_config_port_type_encoding {
+       PORT_TYPE_UNKNOWN,
+       PORT_TYPE_DISCONNECTED,
+       PORT_TYPE_FIXED,
+       PORT_TYPE_VARIABLE,
+       PORT_TYPE_QSFP,
+       PORT_TYPE_MAX
+};
+
+/*
+ * Supported link speeds.  The values form a bitmask: 12G = 0x1, 25G = 0x2,
+ * so LINK_SPEED_SUPP_12G_25G (0x3) is the OR of the two.
+ */
+enum platform_config_link_speed_supported_encoding {
+       LINK_SPEED_SUPP_12G = 1,
+       LINK_SPEED_SUPP_25G,
+       LINK_SPEED_SUPP_12G_25G,
+       LINK_SPEED_SUPP_MAX
+};
+
+/*
+ * This is a subset (not strict) of the link downgrades
+ * supported. The link downgrades supported are expected
+ * to be supplied to the driver by another entity such as
+ * the fabric manager
+ *
+ * The sequential values form a bitmask: 1X = 0x1, 2X = 0x2, 3X = 0x4,
+ * 4X = 0x8; each combined name equals the OR of the widths it lists
+ * (e.g. LINK_WIDTH_SUPP_4X_3X_2X_1X is 0xf).
+ */
+enum platform_config_link_width_supported_encoding {
+       LINK_WIDTH_SUPP_1X = 1,
+       LINK_WIDTH_SUPP_2X,
+       LINK_WIDTH_SUPP_2X_1X,
+       LINK_WIDTH_SUPP_3X,
+       LINK_WIDTH_SUPP_3X_1X,
+       LINK_WIDTH_SUPP_3X_2X,
+       LINK_WIDTH_SUPP_3X_2X_1X,
+       LINK_WIDTH_SUPP_4X,
+       LINK_WIDTH_SUPP_4X_1X,
+       LINK_WIDTH_SUPP_4X_2X,
+       LINK_WIDTH_SUPP_4X_2X_1X,
+       LINK_WIDTH_SUPP_4X_3X,
+       LINK_WIDTH_SUPP_4X_3X_1X,
+       LINK_WIDTH_SUPP_4X_3X_2X,
+       LINK_WIDTH_SUPP_4X_3X_2X_1X,
+       LINK_WIDTH_SUPP_MAX
+};
+
+/*
+ * Virtual lane capability.  Values run sequentially from 1, so
+ * VL_CAP_VL0_<n> has the value n + 1.
+ * NOTE(review): presumably the number of VLs (VL0..VLn) - confirm.
+ */
+enum platform_config_virtual_lane_capability_encoding {
+       VL_CAP_VL0 = 1,
+       VL_CAP_VL0_1,
+       VL_CAP_VL0_2,
+       VL_CAP_VL0_3,
+       VL_CAP_VL0_4,
+       VL_CAP_VL0_5,
+       VL_CAP_VL0_6,
+       VL_CAP_VL0_7,
+       VL_CAP_VL0_8,
+       VL_CAP_VL0_9,
+       VL_CAP_VL0_10,
+       VL_CAP_VL0_11,
+       VL_CAP_VL0_12,
+       VL_CAP_VL0_13,
+       VL_CAP_VL0_14,
+       VL_CAP_MAX
+};
+
+/*
+ * Max MTU
+ *
+ * NOTE(review): the 1..7 codes look like the IB/OPA MTU enum encodings
+ * (256 .. 10240 bytes) - confirm before converting between the two.
+ */
+enum platform_config_mtu_capability_encoding {
+       MTU_CAP_256   = 1,
+       MTU_CAP_512   = 2,
+       MTU_CAP_1024  = 3,
+       MTU_CAP_2048  = 4,
+       MTU_CAP_4096  = 5,
+       MTU_CAP_8192  = 6,
+       MTU_CAP_10240 = 7
+};
+
+/*
+ * Local max timeout encodings; the names step by powers of ten from
+ * 10 ms (value 1) up to 1000 s (value 6).
+ */
+enum platform_config_local_max_timeout_encoding {
+       LOCAL_MAX_TIMEOUT_10_MS = 1,
+       LOCAL_MAX_TIMEOUT_100_MS,
+       LOCAL_MAX_TIMEOUT_1_S,
+       LOCAL_MAX_TIMEOUT_10_S,
+       LOCAL_MAX_TIMEOUT_100_S,
+       LOCAL_MAX_TIMEOUT_1000_S
+};
+
+/*
+ * Tuning method selected by tune_serdes() and handed to apply_tunings();
+ * OPA_UNKNOWN_TUNING is what tune_serdes() uses for an unknown port type.
+ */
+enum link_tuning_encoding {
+       OPA_PASSIVE_TUNING,
+       OPA_ACTIVE_TUNING,
+       OPA_UNKNOWN_TUNING
+};
+
+/* platform.c */
+void get_platform_config(struct hfi1_devdata *dd);
+void free_platform_config(struct hfi1_devdata *dd);
+void get_port_type(struct hfi1_pportdata *ppd);
+int set_qsfp_tx(struct hfi1_pportdata *ppd, int on);
+void tune_serdes(struct hfi1_pportdata *ppd);
+
+#endif                 /*__PLATFORM_H*/
diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
new file mode 100644 (file)
index 0000000..1a942ff
--- /dev/null
@@ -0,0 +1,974 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include <linux/hash.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <rdma/rdma_vt.h>
+#include <rdma/rdmavt_qp.h>
+
+#include "hfi.h"
+#include "qp.h"
+#include "trace.h"
+#include "verbs_txreq.h"
+
+unsigned int hfi1_qp_table_size = 256;
+module_param_named(qp_table_size, hfi1_qp_table_size, uint, S_IRUGO);
+MODULE_PARM_DESC(qp_table_size, "QP table size");
+
+static void flush_tx_list(struct rvt_qp *qp);
+static int iowait_sleep(
+       struct sdma_engine *sde,
+       struct iowait *wait,
+       struct sdma_txreq *stx,
+       unsigned seq);
+static void iowait_wakeup(struct iowait *wait, int reason);
+static void iowait_sdma_drained(struct iowait *wait);
+static void qp_pio_drain(struct rvt_qp *qp);
+
+/*
+ * Build a QP number from a bitmap page and a bit offset within that page:
+ * the page's index in qpt->map[] times RVT_BITS_PER_PAGE, plus @off.
+ */
+static inline unsigned mk_qpn(struct rvt_qpn_table *qpt,
+                             struct rvt_qpn_map *map, unsigned off)
+{
+       unsigned qpn = (map - qpt->map) * RVT_BITS_PER_PAGE;
+
+       qpn += off;
+       return qpn;
+}
+
+/*
+ * Convert the AETH credit code into the number of credits.
+ *
+ * 31 entries, one per credit code 0x00-0x1E.  hfi1_compute_aeth() binary
+ * searches this table, so it must stay sorted in ascending order.
+ */
+static const u16 credit_table[31] = {
+       0,                      /* 0 */
+       1,                      /* 1 */
+       2,                      /* 2 */
+       3,                      /* 3 */
+       4,                      /* 4 */
+       6,                      /* 5 */
+       8,                      /* 6 */
+       12,                     /* 7 */
+       16,                     /* 8 */
+       24,                     /* 9 */
+       32,                     /* A */
+       48,                     /* B */
+       64,                     /* C */
+       96,                     /* D */
+       128,                    /* E */
+       192,                    /* F */
+       256,                    /* 10 */
+       384,                    /* 11 */
+       512,                    /* 12 */
+       768,                    /* 13 */
+       1024,                   /* 14 */
+       1536,                   /* 15 */
+       2048,                   /* 16 */
+       3072,                   /* 17 */
+       4096,                   /* 18 */
+       6144,                   /* 19 */
+       8192,                   /* 1A */
+       12288,                  /* 1B */
+       16384,                  /* 1C */
+       24576,                  /* 1D */
+       32768                   /* 1E */
+};
+
+/*
+ * Release every tx request still queued on the QP's iowait tx_head list:
+ * each request is unlinked and handed to hfi1_put_txreq().
+ */
+static void flush_tx_list(struct rvt_qp *qp)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+       struct sdma_txreq *stx;
+
+       for (;;) {
+               if (list_empty(&priv->s_iowait.tx_head))
+                       break;
+
+               stx = list_first_entry(&priv->s_iowait.tx_head,
+                                      struct sdma_txreq, list);
+               list_del_init(&stx->list);
+               hfi1_put_txreq(container_of(stx, struct verbs_txreq, txreq));
+       }
+}
+
+/*
+ * If the QP's iowait entry is linked on a wait list, unlink it under
+ * dev->iowait_lock and drop one reference on the QP, waking qp->wait when
+ * the count reaches zero.
+ * NOTE(review): the reference dropped is presumably the one taken when the
+ * QP was queued (iowait_sleep() does atomic_inc() at that point) - confirm
+ * for the other wait lists.
+ */
+static void flush_iowait(struct rvt_qp *qp)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+       struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+       unsigned long flags;
+
+       write_seqlock_irqsave(&dev->iowait_lock, flags);
+       if (!list_empty(&priv->s_iowait.list)) {
+               list_del_init(&priv->s_iowait.list);
+               if (atomic_dec_and_test(&qp->refcount))
+                       wake_up(&qp->wait);
+       }
+       write_sequnlock_irqrestore(&dev->iowait_lock, flags);
+}
+
+/*
+ * Translate the OPA-only MTU enum values to a byte count.
+ * Returns 8192 or 10240, or -1 for any value that is not an OPA MTU.
+ */
+static inline int opa_mtu_enum_to_int(int mtu)
+{
+       if (mtu == OPA_MTU_8192)
+               return 8192;
+       if (mtu == OPA_MTU_10240)
+               return 10240;
+       return -1;
+}
+
+/*
+ * MTU enum to byte-count conversion that also understands the OPA values.
+ *
+ * This is what we would push to the core layer if we wanted to be a
+ * "first class citizen".  Instead it lives here and relies on Verbs ULPs
+ * blindly passing the MTU enum value from the PathRecord to us.
+ *
+ * A 10KB MTU is constrained to 8KB.  Values that are not OPA MTUs are
+ * handed to ib_mtu_enum_to_int().  @dev is not used.
+ */
+static inline int verbs_mtu_enum_to_int(struct ib_device *dev, enum ib_mtu mtu)
+{
+       int opa_bytes;
+
+       if (mtu == (enum ib_mtu)OPA_MTU_10240)
+               mtu = OPA_MTU_8192;
+
+       opa_bytes = opa_mtu_enum_to_int((int)mtu);
+       return opa_bytes > 0 ? opa_bytes : ib_mtu_enum_to_int(mtu);
+}
+
+/*
+ * Check that service class @sc can carry traffic for @qp.
+ *
+ * Returns -EINVAL when @sc is 0xf, when no SDMA engine maps to it although
+ * the device has HFI1_HAS_SEND_DMA set, or when no send context maps to
+ * it; 0 otherwise.
+ */
+static int qp_sc_usable(struct rvt_qp *qp, struct hfi1_devdata *dd, u8 sc)
+{
+       if (sc == 0xf)
+               return -EINVAL;
+
+       if (!qp_to_sdma_engine(qp, sc) && (dd->flags & HFI1_HAS_SEND_DMA))
+               return -EINVAL;
+
+       if (!qp_to_send_context(qp, sc))
+               return -EINVAL;
+
+       return 0;
+}
+
+/*
+ * Driver-side validation of a modify-QP request.
+ *
+ * When the primary path (IB_QP_AV) and/or the alternate path
+ * (IB_QP_ALT_PATH) is being set, the service class derived from that
+ * path's address handle must be usable, see qp_sc_usable().
+ *
+ * Returns 0 if the request is acceptable, -EINVAL otherwise.
+ */
+int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
+                        int attr_mask, struct ib_udata *udata)
+{
+       struct ib_qp *ibqp = &qp->ibqp;
+       struct hfi1_devdata *dd = dd_from_dev(to_idev(ibqp->device));
+       int err;
+
+       if (attr_mask & IB_QP_AV) {
+               err = qp_sc_usable(qp, dd,
+                                  ah_to_sc(ibqp->device, &attr->ah_attr));
+               if (err)
+                       return err;
+       }
+
+       if (attr_mask & IB_QP_ALT_PATH) {
+               err = qp_sc_usable(qp, dd,
+                                  ah_to_sc(ibqp->device, &attr->alt_ah_attr));
+               if (err)
+                       return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Recompute the QP's service class from qp->remote_ah_attr and, from it,
+ * the SDMA engine and send context the QP uses.
+ */
+static void qp_remap_send_resources(struct rvt_qp *qp)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       priv->s_sc = ah_to_sc(qp->ibqp.device, &qp->remote_ah_attr);
+       priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc);
+       priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc);
+}
+
+/*
+ * Driver-side part of modify-QP.
+ *
+ * The send resources are remapped when the primary path is set
+ * (IB_QP_AV) and again when an armed QP is moved to IB_MIG_MIGRATED; in
+ * the latter case RVT_S_AHG_CLEAR is set in s_flags first.
+ */
+void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
+                   int attr_mask, struct ib_udata *udata)
+{
+       if (attr_mask & IB_QP_AV)
+               qp_remap_send_resources(qp);
+
+       if (!(attr_mask & IB_QP_PATH_MIG_STATE))
+               return;
+       if (attr->path_mig_state != IB_MIG_MIGRATED)
+               return;
+       if (qp->s_mig_state != IB_MIG_ARMED)
+               return;
+
+       qp->s_flags |= RVT_S_AHG_CLEAR;
+       qp_remap_send_resources(qp);
+}
+
+/**
+ * hfi1_check_send_wqe - validate wqe
+ * @qp: The qp
+ * @wqe: The built wqe
+ *
+ * Validate the wqe.  This is called prior to inserting the wqe into
+ * the ring but after the wqe has been set up.
+ *
+ * RC/UC wqes may be at most 0x80000000 bytes long.  SMI, GSI and UD wqes
+ * must fit in the path MTU of their address handle; GSI and UD additionally
+ * require the handle's SL to map to a valid SC (not 0xf).
+ *
+ * Return: -EINVAL if the wqe is invalid.  For a valid wqe the result is
+ * not 0-on-success: it is 1 when wqe->length <= piothreshold and 0
+ * otherwise.
+ * NOTE(review): the caller presumably treats any negative value as the
+ * failure and uses the 1/0 result as a hint on how to progress the send -
+ * confirm against the rdmavt post-send path.
+ */
+int hfi1_check_send_wqe(struct rvt_qp *qp,
+                       struct rvt_swqe *wqe)
+{
+       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+       struct rvt_ah *ah;
+
+       switch (qp->ibqp.qp_type) {
+       case IB_QPT_RC:
+       case IB_QPT_UC:
+               if (wqe->length > 0x80000000U)
+                       return -EINVAL;
+               break;
+       case IB_QPT_SMI:
+               ah = ibah_to_rvtah(wqe->ud_wr.ah);
+               if (wqe->length > (1 << ah->log_pmtu))
+                       return -EINVAL;
+               break;
+       case IB_QPT_GSI:
+       case IB_QPT_UD:
+               ah = ibah_to_rvtah(wqe->ud_wr.ah);
+               if (wqe->length > (1 << ah->log_pmtu))
+                       return -EINVAL;
+               if (ibp->sl_to_sc[ah->attr.sl] == 0xf)
+                       return -EINVAL;
+               /* explicit break: do not rely on falling into default */
+               break;
+       default:
+               break;
+       }
+       return wqe->length <= piothreshold;
+}
+
+/**
+ * hfi1_compute_aeth - compute the AETH (syndrome + MSN)
+ * @qp: the queue pair to compute the AETH for
+ *
+ * The low bits carry qp->r_msn masked with HFI1_MSN_MASK.  The credit
+ * field is either the invalid code (QP uses an SRQ) or the largest credit
+ * code whose credit_table[] value does not exceed the number of receive
+ * WQEs currently between tail and head.
+ *
+ * Returns the AETH.
+ */
+__be32 hfi1_compute_aeth(struct rvt_qp *qp)
+{
+       u32 aeth = qp->r_msn & HFI1_MSN_MASK;
+
+       if (qp->ibqp.srq) {
+               /*
+                * Shared receive queues don't generate credits.
+                * Set the credit field to the invalid value.
+                */
+               aeth |= HFI1_AETH_CREDIT_INVAL << HFI1_AETH_CREDIT_SHIFT;
+       } else {
+               u32 min, max, x;
+               u32 credits;
+               struct rvt_rwq *wq = qp->r_rq.wq;
+               u32 head;
+               u32 tail;
+
+               /* sanity check pointers before trusting them */
+               head = wq->head;
+               if (head >= qp->r_rq.size)
+                       head = 0;
+               tail = wq->tail;
+               if (tail >= qp->r_rq.size)
+                       tail = 0;
+               /*
+                * Compute the number of credits available (RWQEs).
+                * There is a small chance that the pair of reads are
+                * not atomic, which is OK, since the fuzziness is
+                * resolved as further ACKs go out.
+                */
+               credits = head - tail;
+               if ((int)credits < 0)
+                       credits += qp->r_rq.size;
+               /*
+                * Binary search the credit table to find the code to
+                * use.
+                *
+                * max is an exclusive bound (31 is one past the last
+                * index, so x never exceeds 30) and credit_table[min] is
+                * always <= credits.  The loop stops on an exact match or
+                * when the range has narrowed to min itself, i.e. on the
+                * largest code that does not over-advertise credits.
+                */
+               min = 0;
+               max = 31;
+               for (;;) {
+                       x = (min + max) / 2;
+                       if (credit_table[x] == credits)
+                               break;
+                       if (credit_table[x] > credits) {
+                               max = x;
+                       } else {
+                               if (min == x)
+                                       break;
+                               min = x;
+                       }
+               }
+               aeth |= x << HFI1_AETH_CREDIT_SHIFT;
+       }
+       return cpu_to_be32(aeth);
+}
+
+/**
+ * _hfi1_schedule_send - schedule progress
+ * @qp: the QP
+ *
+ * Queues the QP's iowait work on the port's workqueue without looking at
+ * s_flags.  The work is targeted at the CPU of the QP's SDMA engine when
+ * it has one, otherwise at the first CPU of the device's NUMA node.
+ *
+ * It is only used in the post send, which doesn't hold
+ * the s_lock.
+ */
+void _hfi1_schedule_send(struct rvt_qp *qp)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+       int cpu;
+
+       if (priv->s_sde)
+               cpu = priv->s_sde->cpu;
+       else
+               cpu = cpumask_first(cpumask_of_node(dd->node));
+
+       iowait_schedule(&priv->s_iowait, ppd->hfi1_wq, cpu);
+}
+
+/*
+ * Wait until the QP has no PIO sends pending.  Does nothing when the QP
+ * has no send context.
+ *
+ * Each pass calls hfi1_sc_wantpiobuf_intr() with 1 under dev->iowait_lock,
+ * waits in iowait_pio_drain() with the lock dropped, then calls it with 0
+ * under the lock again.
+ * NOTE(review): the 1/0 argument presumably turns the "want PIO buffer"
+ * interrupt on and off - confirm in pio.c.
+ */
+static void qp_pio_drain(struct rvt_qp *qp)
+{
+       struct hfi1_ibdev *dev;
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       if (!priv->s_sendcontext)
+               return;
+       dev = to_idev(qp->ibqp.device);
+       while (iowait_pio_pending(&priv->s_iowait)) {
+               write_seqlock_irq(&dev->iowait_lock);
+               hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 1);
+               write_sequnlock_irq(&dev->iowait_lock);
+               /* the wait itself runs without iowait_lock held */
+               iowait_pio_drain(&priv->s_iowait);
+               write_seqlock_irq(&dev->iowait_lock);
+               hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 0);
+               write_sequnlock_irq(&dev->iowait_lock);
+       }
+}
+
+/**
+ * hfi1_schedule_send - schedule progress
+ * @qp: the QP
+ *
+ * Schedules qp progress, but only when hfi1_send_ok() allows it.
+ * The caller should hold the s_lock.
+ */
+void hfi1_schedule_send(struct rvt_qp *qp)
+{
+       if (!hfi1_send_ok(qp))
+               return;
+
+       _hfi1_schedule_send(qp);
+}
+
+/**
+ * hfi1_get_credit - update a QP's send credit state from an AETH
+ * @qp: the qp whose credit state is updated
+ * @aeth: the Acknowledge Extended Transport Header
+ *
+ * An invalid credit code moves the QP to unlimited credit.  Any other
+ * code is converted through credit_table[] and may advance qp->s_lsn.
+ * When either change happens and the QP was waiting on
+ * RVT_S_WAIT_SSN_CREDIT, that flag is cleared and the send is rescheduled.
+ *
+ * The QP s_lock should be held.
+ */
+void hfi1_get_credit(struct rvt_qp *qp, u32 aeth)
+{
+       u32 credit = (aeth >> HFI1_AETH_CREDIT_SHIFT) & HFI1_AETH_CREDIT_MASK;
+
+       /*
+        * If the credit is invalid, we can send
+        * as many packets as we like.  Otherwise, we have to
+        * honor the credit field.
+        */
+       if (credit == HFI1_AETH_CREDIT_INVAL) {
+               if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) {
+                       qp->s_flags |= RVT_S_UNLIMITED_CREDIT;
+                       if (qp->s_flags & RVT_S_WAIT_SSN_CREDIT) {
+                               qp->s_flags &= ~RVT_S_WAIT_SSN_CREDIT;
+                               hfi1_schedule_send(qp);
+                       }
+               }
+       } else if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) {
+               /* Compute new LSN (i.e., MSN + credit) */
+               credit = (aeth + credit_table[credit]) & HFI1_MSN_MASK;
+               /* only ever move the limit forward */
+               if (cmp_msn(credit, qp->s_lsn) > 0) {
+                       qp->s_lsn = credit;
+                       if (qp->s_flags & RVT_S_WAIT_SSN_CREDIT) {
+                               qp->s_flags &= ~RVT_S_WAIT_SSN_CREDIT;
+                               hfi1_schedule_send(qp);
+                       }
+               }
+       }
+}
+
+/*
+ * Wake a QP that was waiting for a resource.
+ *
+ * Under s_lock: if any bit of @flag is set in qp->s_flags, clear it and
+ * schedule the send.  Afterwards one reference on the QP is always
+ * dropped, whether or not the flag was set.
+ * NOTE(review): that reference is presumably the one taken when the QP
+ * was put on the wait list (iowait_sleep() does atomic_inc() there).
+ */
+void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+       if (qp->s_flags & flag) {
+               qp->s_flags &= ~flag;
+               trace_hfi1_qpwakeup(qp, flag);
+               hfi1_schedule_send(qp);
+       }
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+       /* Notify hfi1_destroy_qp() if it is waiting. */
+       if (atomic_dec_and_test(&qp->refcount))
+               wake_up(&qp->wait);
+}
+
+/*
+ * Handle a tx request (@stx) that could not be queued to @sde.
+ *
+ * Returns:
+ *   -EBUSY  - @stx was saved on @wait's tx_head list and, unless it was
+ *             already waiting, the QP was added to sde->dmawait with a
+ *             reference held and RVT_S_WAIT_DMA_DESC set;
+ *   -EAGAIN - sdma_progress() returned nonzero for @seq; @stx is unlinked
+ *             again (presumably so the caller can retry right away);
+ *   0       - the QP state lacks RVT_PROCESS_RECV_OK; the tx request was
+ *             released with hfi1_put_txreq().
+ *
+ * Lock order: qp->s_lock, then dev->iowait_lock.
+ */
+static int iowait_sleep(
+       struct sdma_engine *sde,
+       struct iowait *wait,
+       struct sdma_txreq *stx,
+       unsigned seq)
+{
+       struct verbs_txreq *tx = container_of(stx, struct verbs_txreq, txreq);
+       struct rvt_qp *qp;
+       struct hfi1_qp_priv *priv;
+       unsigned long flags;
+       int ret = 0;
+       struct hfi1_ibdev *dev;
+
+       qp = tx->qp;
+       priv = qp->priv;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+       if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
+               /*
+                * If we couldn't queue the DMA request, save the info
+                * and try again later rather than destroying the
+                * buffer and undoing the side effects of the copy.
+                */
+               /* Make a common routine? */
+               dev = &sde->dd->verbs_dev;
+               list_add_tail(&stx->list, &wait->tx_head);
+               write_seqlock(&dev->iowait_lock);
+               if (sdma_progress(sde, seq, stx))
+                       goto eagain;
+               /* queue the QP (and take a reference) only once */
+               if (list_empty(&priv->s_iowait.list)) {
+                       struct hfi1_ibport *ibp =
+                               to_iport(qp->ibqp.device, qp->port_num);
+
+                       ibp->rvp.n_dmawait++;
+                       qp->s_flags |= RVT_S_WAIT_DMA_DESC;
+                       list_add_tail(&priv->s_iowait.list, &sde->dmawait);
+                       trace_hfi1_qpsleep(qp, RVT_S_WAIT_DMA_DESC);
+                       atomic_inc(&qp->refcount);
+               }
+               write_sequnlock(&dev->iowait_lock);
+               qp->s_flags &= ~RVT_S_BUSY;
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+               ret = -EBUSY;
+       } else {
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+               hfi1_put_txreq(tx);
+       }
+       return ret;
+eagain:
+       /* reached with both locks held; @stx is unlinked after they drop */
+       write_sequnlock(&dev->iowait_lock);
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+       list_del_init(&stx->list);
+       return -EAGAIN;
+}
+
+/*
+ * iowait wakeup callback: wake the QP owning @wait from its
+ * RVT_S_WAIT_DMA_DESC wait.  Only SDMA_AVAIL_REASON is expected as
+ * @reason; anything else triggers a warning but the wakeup still happens.
+ */
+static void iowait_wakeup(struct iowait *wait, int reason)
+{
+       struct rvt_qp *waiter = iowait_to_qp(wait);
+
+       WARN_ON(reason != SDMA_AVAIL_REASON);
+
+       hfi1_qp_wakeup(waiter, RVT_S_WAIT_DMA_DESC);
+}
+
+/*
+ * iowait callback run once the QP's sdma work has drained.
+ *
+ * The send engine can find a QP in the error state while sdma work is
+ * still outstanding and must put off the flush until that work is done.
+ * Here the RVT_S_WAIT_DMA wait is ended, if one is pending, and the send
+ * is scheduled again.
+ */
+static void iowait_sdma_drained(struct iowait *wait)
+{
+       struct rvt_qp *qp = iowait_to_qp(wait);
+       unsigned long flags;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+       if (!(qp->s_flags & RVT_S_WAIT_DMA))
+               goto unlock;
+
+       qp->s_flags &= ~RVT_S_WAIT_DMA;
+       hfi1_schedule_send(qp);
+unlock:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+}
+
+/**
+ * qp_to_sdma_engine - map a qp to a send engine
+ * @qp: the QP
+ * @sc5: the 5 bit sc
+ *
+ * The engine is selected from the QP number shifted right by
+ * dd->qos_shift, together with @sc5.
+ *
+ * Return:
+ * A send engine for the qp, or NULL when the device has no send DMA or
+ * the qp is of SMI type.
+ */
+struct sdma_engine *qp_to_sdma_engine(struct rvt_qp *qp, u8 sc5)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+
+       if (!(dd->flags & HFI1_HAS_SEND_DMA))
+               return NULL;
+
+       if (qp->ibqp.qp_type == IB_QPT_SMI)
+               return NULL;
+
+       return sdma_select_engine_sc(dd, qp->ibqp.qp_num >> dd->qos_shift,
+                                    sc5);
+}
+
+/*
+ * qp_to_send_context - map a qp to a send context
+ * @qp: the QP
+ * @sc5: the 5 bit sc
+ *
+ * SMI qps always use the VL15 send context.  Every other qp type gets the
+ * context selected from the QP number shifted right by dd->qos_shift,
+ * together with @sc5.
+ *
+ * Return:
+ * A send context for the qp
+ */
+struct send_context *qp_to_send_context(struct rvt_qp *qp, u8 sc5)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+
+       /* SMA packets to VL15 */
+       if (qp->ibqp.qp_type == IB_QPT_SMI)
+               return dd->vld[15].sc;
+
+       return pio_select_send_context_sc(dd, qp->ibqp.qp_num >> dd->qos_shift,
+                                         sc5);
+}
+
+/*
+ * Cursor used by qp_iter_init() / qp_iter_next() / qp_iter_print() to walk
+ * every QP of a device.
+ */
+struct qp_iter {
+       struct hfi1_ibdev *dev; /* device whose QPs are walked */
+       struct rvt_qp *qp;      /* QP the cursor currently points at */
+       int specials;           /* number of special QP slots: 2 per port */
+       int n;                  /* slot index of @qp (specials, then hash) */
+};
+
+/*
+ * Allocate a QP iterator for @dev and position it on the first QP.
+ *
+ * Returns the iterator, or NULL when the allocation fails or the device
+ * has no QP at all (qp_iter_next() found nothing).  The memory comes from
+ * kzalloc().
+ */
+struct qp_iter *qp_iter_init(struct hfi1_ibdev *dev)
+{
+       struct qp_iter *cursor = kzalloc(sizeof(*cursor), GFP_KERNEL);
+
+       if (!cursor)
+               return NULL;
+
+       cursor->dev = dev;
+       cursor->specials = dev->rdi.ibdev.phys_port_cnt * 2;
+
+       /* 0 from qp_iter_next() means a first QP was found */
+       if (!qp_iter_next(cursor))
+               return cursor;
+
+       kfree(cursor);
+       return NULL;
+}
+
+/*
+ * Advance @iter to the next QP.
+ *
+ * Returns 0 with iter->qp / iter->n updated when another QP was found,
+ * 1 when the walk is complete (the iterator is left unchanged then).
+ *
+ * The QP pointers are read with rcu_dereference(), so this presumably has
+ * to run inside an RCU read-side critical section - confirm at the caller.
+ */
+int qp_iter_next(struct qp_iter *iter)
+{
+       struct hfi1_ibdev *dev = iter->dev;
+       int n = iter->n;
+       int ret = 1;
+       struct rvt_qp *pqp = iter->qp;
+       struct rvt_qp *qp;
+
+       /*
+        * The approach is to consider the special qps
+        * as an additional table entries before the
+        * real hash table.  Since the qp code sets
+        * the qp->next hash link to NULL, this works just fine.
+        *
+        * iter->specials is 2 * # ports
+        *
+        * n = 0..iter->specials is the special qp indices
+        *
+        * n = iter->specials..dev->rdi.qp_dev->qp_table_size+iter->specials are
+        * the potential hash bucket entries
+        *
+        */
+       for (; n <  dev->rdi.qp_dev->qp_table_size + iter->specials; n++) {
+               if (pqp) {
+                       /* continue along the chain of the current slot */
+                       qp = rcu_dereference(pqp->next);
+               } else {
+                       /* start of slot n: a special QP or a hash bucket */
+                       if (n < iter->specials) {
+                               struct hfi1_pportdata *ppd;
+                               struct hfi1_ibport *ibp;
+                               int pidx;
+
+                               /*
+                                * NOTE(review): port = n % ports combined
+                                * with QP0/QP1 = n & 1 covers every
+                                * (port, qp) pair only for a single port;
+                                * with two ports it would visit
+                                * (port 0, QP0) and (port 1, QP1) twice.
+                                */
+                               pidx = n % dev->rdi.ibdev.phys_port_cnt;
+                               ppd = &dd_from_dev(dev)->pport[pidx];
+                               ibp = &ppd->ibport_data;
+
+                               if (!(n & 1))
+                                       qp = rcu_dereference(ibp->rvp.qp[0]);
+                               else
+                                       qp = rcu_dereference(ibp->rvp.qp[1]);
+                       } else {
+                               qp = rcu_dereference(
+                                       dev->rdi.qp_dev->qp_table[
+                                               (n - iter->specials)]);
+                       }
+               }
+               /* a NULL qp makes the next pass start on slot n + 1 */
+               pqp = qp;
+               if (qp) {
+                       iter->qp = qp;
+                       iter->n = n;
+                       return 0;
+               }
+       }
+       return ret;
+}
+
+/*
+ * Printable QP type names, indexed by qp->ibqp.qp_type in qp_iter_print().
+ * Only five entries exist; presumably they line up with the first five
+ * ib_qp_type values -- TODO confirm against the enum.
+ */
+static const char * const qp_type_str[] = {
+       "SMI", "GSI", "RC", "UC", "UD",
+};
+
+/*
+ * Return non-zero when the send queue indices s_last, s_acked, s_cur,
+ * s_tail and s_head are all equal, zero otherwise.
+ */
+static int qp_idle(struct rvt_qp *qp)
+{
+       if (qp->s_last != qp->s_acked)
+               return 0;
+       if (qp->s_acked != qp->s_cur)
+               return 0;
+       if (qp->s_cur != qp->s_tail)
+               return 0;
+
+       return qp->s_tail == qp->s_head;
+}
+
+/*
+ * Emit one line describing the QP currently held by @iter to @s.
+ *
+ * The line carries the iterator index, an idle ("I") / busy ("B") marker
+ * from qp_idle(), the QP number, refcount, type and state, send-side
+ * bookkeeping (PSNs and queue indices), the remote QPN/LID/SL, MTU and
+ * retry settings, the SDMA engine and send context selected for the QP's
+ * service class, the send CQ head/tail and the owning pid.
+ */
+void qp_iter_print(struct seq_file *s, struct qp_iter *iter)
+{
+       struct rvt_swqe *wqe;
+       struct rvt_qp *qp = iter->qp;
+       struct hfi1_qp_priv *priv = qp->priv;
+       struct sdma_engine *sde;
+       struct send_context *send_context;
+
+       sde = qp_to_sdma_engine(qp, priv->s_sc);
+       wqe = rvt_get_swqe_ptr(qp, qp->s_last);
+       send_context = qp_to_send_context(qp, priv->s_sc);
+       /*
+        * NOTE(review): qp_type_str[] has five entries; this assumes
+        * qp->ibqp.qp_type never exceeds them -- confirm.
+        * sde and send_context may be NULL, hence the guarded index reads.
+        */
+       seq_printf(s,
+                  "N %d %s QP %x R %u %s %u %u %u f=%x %u %u %u %u %u %u PSN %x %x %x %x %x (%u %u %u %u %u %u %u) RQP %x LID %x SL %u MTU %u %u %u %u SDE %p,%u SC %p,%u SCQ %u %u PID %d\n",
+                  iter->n,
+                  qp_idle(qp) ? "I" : "B",
+                  qp->ibqp.qp_num,
+                  atomic_read(&qp->refcount),
+                  qp_type_str[qp->ibqp.qp_type],
+                  qp->state,
+                  wqe ? wqe->wr.opcode : 0,
+                  qp->s_hdrwords,
+                  qp->s_flags,
+                  iowait_sdma_pending(&priv->s_iowait),
+                  iowait_pio_pending(&priv->s_iowait),
+                  !list_empty(&priv->s_iowait.list),
+                  qp->timeout,
+                  wqe ? wqe->ssn : 0,
+                  qp->s_lsn,
+                  qp->s_last_psn,
+                  qp->s_psn, qp->s_next_psn,
+                  qp->s_sending_psn, qp->s_sending_hpsn,
+                  qp->s_last, qp->s_acked, qp->s_cur,
+                  qp->s_tail, qp->s_head, qp->s_size,
+                  qp->s_avail,
+                  qp->remote_qpn,
+                  qp->remote_ah_attr.dlid,
+                  qp->remote_ah_attr.sl,
+                  qp->pmtu,
+                  qp->s_retry,
+                  qp->s_retry_cnt,
+                  qp->s_rnr_retry_cnt,
+                  sde,
+                  sde ? sde->this_idx : 0,
+                  send_context,
+                  send_context ? send_context->sw_index : 0,
+                  ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->head,
+                  ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->tail,
+                  qp->pid);
+}
+
+/*
+ * Mark @qp as having seen communication established (RVT_R_COMM_EST) and,
+ * when the QP owner registered an event handler, deliver
+ * IB_EVENT_COMM_EST to it.
+ */
+void qp_comm_est(struct rvt_qp *qp)
+{
+       struct ib_event ev;
+
+       qp->r_flags |= RVT_R_COMM_EST;
+
+       /* nobody to notify */
+       if (!qp->ibqp.event_handler)
+               return;
+
+       ev.event = IB_EVENT_COMM_EST;
+       ev.element.qp = &qp->ibqp;
+       ev.device = qp->ibqp.device;
+       qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+}
+
+/*
+ * Allocate and initialise the hfi1 private part of @qp.
+ *
+ * Both the private structure and its s_hdr buffer are zero-allocated on
+ * node rdi->dparms.node using @gfp.  The RNR timer is set up to call
+ * hfi1_rc_rnr_retry() with @qp, and qp->s_timer is pointed at
+ * hfi1_rc_timeout().
+ *
+ * Returns the private structure, or ERR_PTR(-ENOMEM) if either
+ * allocation fails.
+ */
+void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+                   gfp_t gfp)
+{
+       struct hfi1_qp_priv *qpriv;
+
+       qpriv = kzalloc_node(sizeof(*qpriv), gfp, rdi->dparms.node);
+       if (!qpriv)
+               goto nomem;
+
+       qpriv->owner = qp;
+
+       qpriv->s_hdr = kzalloc_node(sizeof(*qpriv->s_hdr), gfp,
+                                   rdi->dparms.node);
+       if (!qpriv->s_hdr)
+               goto free_priv;
+
+       setup_timer(&qpriv->s_rnr_timer, hfi1_rc_rnr_retry, (unsigned long)qp);
+       qp->s_timer.function = hfi1_rc_timeout;
+       return qpriv;
+
+free_priv:
+       kfree(qpriv);
+nomem:
+       return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Release the private data attached to @qp: the s_hdr buffer first, then
+ * the private structure itself.  @rdi is not used.
+ */
+void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp)
+{
+       struct hfi1_qp_priv *qpriv = qp->priv;
+
+       /* s_hdr hangs off the private struct, so it has to go first */
+       kfree(qpriv->s_hdr);
+       kfree(qpriv);
+}
+
+/*
+ * Count the special QPs (slots 0 and 1 of every port) that are still
+ * set on the device behind @rdi.
+ *
+ * Despite the name nothing is freed here; the function only returns the
+ * number of non-NULL special QP pointers.
+ */
+unsigned free_all_qps(struct rvt_dev_info *rdi)
+{
+       struct hfi1_ibdev *verbs_dev =
+               container_of(rdi, struct hfi1_ibdev, rdi);
+       struct hfi1_devdata *dd =
+               container_of(verbs_dev, struct hfi1_devdata, verbs_dev);
+       unsigned in_use = 0;
+       int port;
+
+       for (port = 0; port < dd->num_pports; port++) {
+               struct hfi1_ibport *ibp = &dd->pport[port].ibport_data;
+               int slot;
+
+               rcu_read_lock();
+               for (slot = 0; slot < 2; slot++) {
+                       if (rcu_dereference(ibp->rvp.qp[slot]))
+                               in_use++;
+               }
+               rcu_read_unlock();
+       }
+
+       return in_use;
+}
+
+/*
+ * Flush @qp's iowait state via flush_iowait() and then stop its RC timers
+ * via hfi1_stop_rc_timers().
+ */
+void flush_qp_waiters(struct rvt_qp *qp)
+{
+       flush_iowait(qp);
+       hfi1_stop_rc_timers(qp);
+}
+
+/*
+ * Stop send processing for @qp: synchronously cancel the iowait work item
+ * and then synchronously delete the QP timers.
+ *
+ * Both calls are the _sync variants, so presumably this must be called
+ * from a context that may sleep -- TODO confirm.
+ */
+void stop_send_queue(struct rvt_qp *qp)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       cancel_work_sync(&priv->s_iowait.iowork);
+       hfi1_del_timers_sync(qp);
+}
+
+/*
+ * Quiesce @qp: drain its SDMA work, drain its PIO work, then flush the
+ * QP's tx list, in that order.
+ */
+void quiesce_qp(struct rvt_qp *qp)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       iowait_sdma_drain(&priv->s_iowait);
+       qp_pio_drain(qp);
+       flush_tx_list(qp);
+}
+
+/*
+ * Reinitialise the hfi1 private send state of @qp on reset.
+ *
+ * The iowait structure is initialised again with _hfi1_do_send as the
+ * work function and iowait_sleep / iowait_wakeup / iowait_sdma_drained as
+ * callbacks, r_adefered is cleared and the QP's AHG state is dropped via
+ * clear_ahg().
+ */
+void notify_qp_reset(struct rvt_qp *qp)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       iowait_init(
+               &priv->s_iowait,
+               1,
+               _hfi1_do_send,
+               iowait_sleep,
+               iowait_wakeup,
+               iowait_sdma_drained);
+       priv->r_adefered = 0;
+       clear_ahg(qp);
+}
+
+/*
+ * Switch to alternate path.
+ * The QP s_lock should be held and interrupts disabled.
+ *
+ * The alternate address handle, port and pkey index become the primary
+ * ones, the AHG state is flagged for clearing, and the service class and
+ * SDMA engine are recomputed for the new path.  IB_EVENT_PATH_MIG is then
+ * delivered to the QP's event handler, if one is registered.
+ */
+void hfi1_migrate_qp(struct rvt_qp *qp)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+       struct ib_event ev;
+
+       qp->s_mig_state = IB_MIG_MIGRATED;
+       qp->remote_ah_attr = qp->alt_ah_attr;
+       qp->port_num = qp->alt_ah_attr.port_num;
+       qp->s_pkey_index = qp->s_alt_pkey_index;
+       qp->s_flags |= RVT_S_AHG_CLEAR;
+       priv->s_sc = ah_to_sc(qp->ibqp.device, &qp->remote_ah_attr);
+       priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc);
+
+       /*
+        * The handler is optional (qp_comm_est() checks it as well), so a
+        * QP created without one must not be called through a NULL pointer.
+        */
+       if (!qp->ibqp.event_handler)
+               return;
+
+       ev.device = qp->ibqp.device;
+       ev.element.qp = &qp->ibqp;
+       ev.event = IB_EVENT_PATH_MIG;
+       qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+}
+
+/*
+ * Convert a byte MTU to its enum form via mtu_to_enum(), passing
+ * OPA_MTU_8192 as the second argument (presumably the value used when
+ * @mtu has no exact enum match -- TODO confirm in mtu_to_enum()).
+ */
+int mtu_to_path_mtu(u32 mtu)
+{
+       return mtu_to_enum(mtu, OPA_MTU_8192);
+}
+
+/*
+ * Return the MTU in bytes to use for @qp given path MTU enum @pmtu.
+ *
+ * The QP's SL is mapped to an SC through the port's sl_to_sc table and
+ * from there to a VL.  When that VL is below PER_VL_SEND_CONTEXTS the
+ * result is capped at the VL's configured MTU.
+ */
+u32 mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu)
+{
+       struct hfi1_ibdev *verbs_dev =
+               container_of(rdi, struct hfi1_ibdev, rdi);
+       struct hfi1_devdata *dd =
+               container_of(verbs_dev, struct hfi1_devdata, verbs_dev);
+       struct hfi1_ibport *ibp = &dd->pport[qp->port_num - 1].ibport_data;
+       u8 sc = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+       u8 vl = sc_to_vlt(dd, sc);
+       u32 mtu = verbs_mtu_enum_to_int(qp->ibqp.device, pmtu);
+
+       /* no per-VL limit applies to this VL */
+       if (vl >= PER_VL_SEND_CONTEXTS)
+               return mtu;
+
+       return min_t(u32, mtu, dd->vld[vl].mtu);
+}
+
+/*
+ * Pick the path MTU enum for @qp from the requested attr->path_mtu.
+ *
+ * Returns -1 when the requested enum cannot be converted to bytes.  When
+ * the request exceeds the port's ibmtu, the port's ibmtu converted back
+ * to an enum is returned instead; otherwise the request is returned
+ * unchanged.
+ */
+int get_pmtu_from_attr(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+                      struct ib_qp_attr *attr)
+{
+       struct hfi1_ibdev *verbs_dev =
+               container_of(rdi, struct hfi1_ibdev, rdi);
+       struct hfi1_devdata *dd =
+               container_of(verbs_dev, struct hfi1_devdata, verbs_dev);
+       struct hfi1_pportdata *ppd = &dd->pport[qp->port_num - 1];
+       int mtu = verbs_mtu_enum_to_int(qp->ibqp.device, attr->path_mtu);
+
+       if (mtu == -1)
+               return -1; /* values less than 0 are error */
+
+       if (mtu <= ppd->ibmtu)
+               return attr->path_mtu;
+
+       return mtu_to_enum(ppd->ibmtu, IB_MTU_2048);
+}
+
+/*
+ * Clean up hfi1 send-side state when @qp is put into the error state.
+ *
+ * Under the device iowait_lock, a QP that is queued on an iowait list
+ * and is not RVT_S_BUSY has its wait flags cleared, is removed from the
+ * list, and gives up one reference, waking qp->wait when the count
+ * reaches zero.  Afterwards, still only when the QP is not busy, the
+ * pending header length is reset, any held RDMA MR reference is released
+ * and the tx list is flushed.
+ */
+void notify_error_qp(struct rvt_qp *qp)
+{
+       struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       write_seqlock(&dev->iowait_lock);
+       if (!list_empty(&priv->s_iowait.list) && !(qp->s_flags & RVT_S_BUSY)) {
+               qp->s_flags &= ~RVT_S_ANY_WAIT_IO;
+               list_del_init(&priv->s_iowait.list);
+               /* presumably the reference taken when the QP was queued -- confirm */
+               if (atomic_dec_and_test(&qp->refcount))
+                       wake_up(&qp->wait);
+       }
+       write_sequnlock(&dev->iowait_lock);
+
+       if (!(qp->s_flags & RVT_S_BUSY)) {
+               qp->s_hdrwords = 0;
+               if (qp->s_rdma_mr) {
+                       rvt_put_mr(qp->s_rdma_mr);
+                       qp->s_rdma_mr = NULL;
+               }
+               flush_tx_list(qp);
+       }
+}
+
+/**
+ * hfi1_error_port_qps - put a port's RC/UC qps into error state
+ * @ibp: the ibport.
+ * @sl: the service level.
+ *
+ * This function places all RC/UC qps with a given service level into error
+ * state. It is generally called to force upper layer apps to abandon stale
+ * qps after an sl->sc mapping change.
+ *
+ * The whole QP hash table is walked under rcu_read_lock().  For every
+ * matching QP whose state still allows posting sends, rvt_error_qp() is
+ * called with r_lock, s_hlock and s_lock held.  If it reports that the
+ * last WQE was reached, IB_EVENT_QP_LAST_WQE_REACHED is delivered to the
+ * QP's event handler, when one is registered.
+ */
+void hfi1_error_port_qps(struct hfi1_ibport *ibp, u8 sl)
+{
+       struct rvt_qp *qp = NULL;
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       struct hfi1_ibdev *dev = &ppd->dd->verbs_dev;
+       int n;
+       int lastwqe;
+       struct ib_event ev;
+
+       rcu_read_lock();
+
+       /* Deal only with RC/UC qps that use the given SL. */
+       for (n = 0; n < dev->rdi.qp_dev->qp_table_size; n++) {
+               for (qp = rcu_dereference(dev->rdi.qp_dev->qp_table[n]); qp;
+                       qp = rcu_dereference(qp->next)) {
+                       if (qp->port_num != ppd->port)
+                               continue;
+                       if (qp->ibqp.qp_type != IB_QPT_UC &&
+                           qp->ibqp.qp_type != IB_QPT_RC)
+                               continue;
+                       if (qp->remote_ah_attr.sl != sl)
+                               continue;
+                       if (!(ib_rvt_state_ops[qp->state] & RVT_POST_SEND_OK))
+                               continue;
+
+                       spin_lock_irq(&qp->r_lock);
+                       spin_lock(&qp->s_hlock);
+                       spin_lock(&qp->s_lock);
+                       lastwqe = rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+                       spin_unlock(&qp->s_lock);
+                       spin_unlock(&qp->s_hlock);
+                       spin_unlock_irq(&qp->r_lock);
+
+                       /*
+                        * The handler is optional (qp_comm_est() checks it
+                        * too); never call through a NULL pointer.
+                        */
+                       if (lastwqe && qp->ibqp.event_handler) {
+                               ev.device = qp->ibqp.device;
+                               ev.element.qp = &qp->ibqp;
+                               ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+                               qp->ibqp.event_handler(&ev,
+                                                      qp->ibqp.qp_context);
+                       }
+               }
+       }
+
+       rcu_read_unlock();
+}
diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h
new file mode 100644 (file)
index 0000000..e7bc8d6
--- /dev/null
@@ -0,0 +1,160 @@
+#ifndef _QP_H
+#define _QP_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/hash.h>
+#include <rdma/rdmavt_qp.h>
+#include "verbs.h"
+#include "sdma.h"
+
+extern unsigned int hfi1_qp_table_size;
+
+/*
+ * clear_ahg - reset the QP's AHG state and free its AHG index, if any
+ */
+static inline void clear_ahg(struct rvt_qp *qp)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       priv->s_hdr->ahgcount = 0;
+       qp->s_flags &= ~(RVT_S_AHG_VALID | RVT_S_AHG_CLEAR);
+       if (priv->s_sde && qp->s_ahgidx >= 0)
+               sdma_ahg_free(priv->s_sde, qp->s_ahgidx);
+       qp->s_ahgidx = -1;
+}
+
+/**
+ * hfi1_compute_aeth - compute the AETH (syndrome + MSN)
+ * @qp: the queue pair to compute the AETH for
+ *
+ * Returns the AETH.
+ */
+__be32 hfi1_compute_aeth(struct rvt_qp *qp);
+
+/**
+ * hfi1_create_qp - create a queue pair for a device
+ * @ibpd: the protection domain whose device we create the queue pair for
+ * @init_attr: the attributes of the queue pair
+ * @udata: user data for libibverbs.so
+ *
+ * Returns the queue pair on success, otherwise returns an errno.
+ *
+ * Called by the ib_create_qp() core verbs function.
+ */
+struct ib_qp *hfi1_create_qp(struct ib_pd *ibpd,
+                            struct ib_qp_init_attr *init_attr,
+                            struct ib_udata *udata);
+/**
+ * hfi1_get_credit - flush the send work queue of a QP
+ * @qp: the qp whose send work queue to flush
+ * @aeth: the Acknowledge Extended Transport Header
+ *
+ * The QP s_lock should be held.
+ */
+void hfi1_get_credit(struct rvt_qp *qp, u32 aeth);
+
+/**
+ * hfi1_qp_wakeup - wake up on the indicated event
+ * @qp: the QP
+ * @flag: the flag on which the qp is stalled
+ */
+void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag);
+
+struct sdma_engine *qp_to_sdma_engine(struct rvt_qp *qp, u8 sc5);
+struct send_context *qp_to_send_context(struct rvt_qp *qp, u8 sc5);
+
+struct qp_iter;
+
+/**
+ * qp_iter_init - initialize the iterator for the qp hash list
+ * @dev: the hfi1_ibdev
+ */
+struct qp_iter *qp_iter_init(struct hfi1_ibdev *dev);
+
+/**
+ * qp_iter_next - Find the next qp in the hash list
+ * @iter: the iterator for the qp hash list
+ */
+int qp_iter_next(struct qp_iter *iter);
+
+/**
+ * qp_iter_print - print the qp information to seq_file
+ * @s: the seq_file to emit the qp information on
+ * @iter: the iterator for the qp hash list
+ */
+void qp_iter_print(struct seq_file *s, struct qp_iter *iter);
+
+/**
+ * qp_comm_est - handle trap with QP established
+ * @qp: the QP
+ */
+void qp_comm_est(struct rvt_qp *qp);
+
+void _hfi1_schedule_send(struct rvt_qp *qp);
+void hfi1_schedule_send(struct rvt_qp *qp);
+
+void hfi1_migrate_qp(struct rvt_qp *qp);
+
+/*
+ * Functions provided by hfi1 driver for rdmavt to use
+ */
+void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+                   gfp_t gfp);
+void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp);
+unsigned free_all_qps(struct rvt_dev_info *rdi);
+void notify_qp_reset(struct rvt_qp *qp);
+int get_pmtu_from_attr(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+                      struct ib_qp_attr *attr);
+void flush_qp_waiters(struct rvt_qp *qp);
+void notify_error_qp(struct rvt_qp *qp);
+void stop_send_queue(struct rvt_qp *qp);
+void quiesce_qp(struct rvt_qp *qp);
+u32 mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu);
+int mtu_to_path_mtu(u32 mtu);
+void hfi1_error_port_qps(struct hfi1_ibport *ibp, u8 sl);
+#endif /* _QP_H */
diff --git a/drivers/infiniband/hw/hfi1/qsfp.c b/drivers/infiniband/hw/hfi1/qsfp.c
new file mode 100644 (file)
index 0000000..2441669
--- /dev/null
@@ -0,0 +1,632 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+
+#include "hfi.h"
+#include "twsi.h"
+
+/*
+ * QSFP support for hfi driver, using "Two Wire Serial Interface" driver
+ * in twsi.c
+ */
+#define I2C_MAX_RETRY 4
+
+/*
+ * Raw i2c write.  No set-up or lock checking.
+ *
+ * Writes @len bytes from @bp to device @i2c_addr on chain @target,
+ * starting at @offset.  Returns the number of bytes written, or -EIO if
+ * hfi1_twsi_blk_wr() reports a failure.
+ *
+ * The mandatory inter-transaction delay is applied on every exit path,
+ * matching __i2c_read(), so that a caller retrying after a failure does
+ * not start the next transaction too early.
+ */
+static int __i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
+                      int offset, void *bp, int len)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       int ret, cnt;
+       u8 *buff = bp;
+
+       cnt = 0;
+       while (cnt < len) {
+               int wlen = len - cnt;
+
+               ret = hfi1_twsi_blk_wr(dd, target, i2c_addr, offset,
+                                      buff + cnt, wlen);
+               if (ret) {
+                       /* hfi1_twsi_blk_wr() 1 for error, else 0 */
+                       ret = -EIO;
+                       goto exit;
+               }
+               offset += wlen;
+               cnt += wlen;
+       }
+
+       ret = cnt;
+
+exit:
+       /* Must wait min 20us between qsfp i2c transactions */
+       udelay(20);
+
+       return ret;
+}
+
+/*
+ * Checked i2c write: verify that the i2c chain resource for @target is
+ * held, reset the TWSI bus, then hand over to __i2c_write().
+ *
+ * Returns -EACCES when the resource check fails, the hfi1_twsi_reset()
+ * result when the reset fails, otherwise whatever __i2c_write() returns.
+ *
+ * Caller must hold the i2c chain resource.
+ */
+int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
+             void *bp, int len)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       int rc;
+
+       if (!check_chip_resource(dd, i2c_target(target), __func__))
+               return -EACCES;
+
+       /* make sure the TWSI bus is in a sane state */
+       rc = hfi1_twsi_reset(dd, target);
+       if (!rc)
+               return __i2c_write(ppd, target, i2c_addr, offset, bp, len);
+
+       hfi1_dev_porterr(dd, ppd->port,
+                        "I2C chain %d write interface reset failed\n",
+                        target);
+       return rc;
+}
+
+/*
+ * Raw i2c read.  No set-up or lock checking.
+ *
+ * Reads @len bytes into @bp from device @i2c_addr on chain @target,
+ * starting at @offset.  A failure before any byte has been read is
+ * retried until I2C_MAX_RETRY attempts have been made.  Returns the
+ * number of bytes read, or -EIO (after logging) on failure.  The
+ * inter-transaction delay is applied on every path.
+ */
+static int __i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
+                     int offset, void *bp, int len)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       const int first_offset = offset;
+       u8 *buf = bp;
+       int done = 0;
+       int tries = 0;
+
+       while (done < len) {
+               int chunk = len - done;
+
+               if (!hfi1_twsi_blk_rd(dd, target, i2c_addr, offset,
+                                     buf + done, chunk)) {
+                       offset += chunk;
+                       done += chunk;
+                       continue;
+               }
+
+               /* Some QSFP's fail first try. Retry as experiment */
+               if (done == 0 && ++tries < I2C_MAX_RETRY)
+                       continue;
+
+               /* hfi1_twsi_blk_rd() 1 for error, else 0 */
+               hfi1_dev_porterr(dd, ppd->port,
+                                "I2C chain %d read failed, addr 0x%x, offset 0x%x, len %d\n",
+                                target, i2c_addr, first_offset, len);
+               done = -EIO;
+               break;
+       }
+
+       /* Must wait min 20us between qsfp i2c transactions */
+       udelay(20);
+
+       return done;
+}
+
+/*
+ * Checked i2c read: verify that the i2c chain resource for @target is
+ * held, reset the TWSI bus, then hand over to __i2c_read().
+ *
+ * Returns -EACCES when the resource check fails, the hfi1_twsi_reset()
+ * result when the reset fails, otherwise whatever __i2c_read() returns.
+ *
+ * Caller must hold the i2c chain resource.
+ */
+int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
+            void *bp, int len)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       int rc;
+
+       if (!check_chip_resource(dd, i2c_target(target), __func__))
+               return -EACCES;
+
+       /* make sure the TWSI bus is in a sane state */
+       rc = hfi1_twsi_reset(dd, target);
+       if (!rc)
+               return __i2c_read(ppd, target, i2c_addr, offset, bp, len);
+
+       hfi1_dev_porterr(dd, ppd->port,
+                        "I2C chain %d read interface reset failed\n",
+                        target);
+       return rc;
+}
+
+/*
+ * Write @len bytes from @bp to QSFP memory, where page n, offset m (as
+ * defined by SFF 8636) is addressed as @addr = ((256 * n) + m).
+ *
+ * Each pass selects the page for the current address, then writes up to
+ * the next QSFP_RW_BOUNDARY.  Returns the number of bytes written, or a
+ * negative value on failure (bytes already written are not reported).
+ *
+ * Caller must hold the i2c chain resource.
+ */
+int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+              int len)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       int done = 0;
+       int ret;
+
+       if (!check_chip_resource(dd, i2c_target(target), __func__))
+               return -EACCES;
+
+       /* make sure the TWSI bus is in a sane state */
+       ret = hfi1_twsi_reset(dd, target);
+       if (ret) {
+               hfi1_dev_porterr(dd, ppd->port,
+                                "QSFP chain %d write interface reset failed\n",
+                                target);
+               return ret;
+       }
+
+       while (done < len) {
+               /*
+                * Set the qsfp page based on a zero-based address
+                * and a page size of QSFP_PAGESIZE bytes.
+                */
+               u8 page = (u8)(addr / QSFP_PAGESIZE);
+               int chunk = len - done;
+
+               ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE,
+                                 QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1);
+               if (ret != 1) {
+                       hfi1_dev_porterr(dd, ppd->port,
+                                        "QSFP chain %d can't write QSFP_PAGE_SELECT_BYTE: %d\n",
+                                        target, ret);
+                       ret = -EIO;
+                       break;
+               }
+
+               /* truncate write to boundary if crossing boundary */
+               if (((addr % QSFP_RW_BOUNDARY) + chunk) > QSFP_RW_BOUNDARY)
+                       chunk = QSFP_RW_BOUNDARY - (addr % QSFP_RW_BOUNDARY);
+
+               ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE,
+                                 addr % QSFP_PAGESIZE, bp + done, chunk);
+               if (ret <= 0)   /* stop on error or nothing written */
+                       break;
+
+               done += ret;
+               addr += ret;
+       }
+
+       return ret < 0 ? ret : done;
+}
+
+/*
+ * Perform a stand-alone single QSFP write.  Acquire the resource, do the
+ * write, then release the resource.
+ *
+ * Returns the acquire_chip_resource() error if the resource cannot be
+ * obtained, otherwise the qsfp_write() result.
+ */
+int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+                  int len)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u32 resource = qsfp_resource(dd);
+       int ret = acquire_chip_resource(dd, resource, QSFP_WAIT);
+
+       if (!ret) {
+               ret = qsfp_write(ppd, target, addr, bp, len);
+               release_chip_resource(dd, resource);
+       }
+
+       return ret;
+}
+
+/*
+ * Read @len bytes into @bp from QSFP memory, where page n, offset m (as
+ * defined by SFF 8636) is addressed as @addr = ((256 * n) + m).
+ *
+ * Each pass selects the page for the current address, waits for the
+ * module to settle after that write, then reads up to the next
+ * QSFP_RW_BOUNDARY.  Returns the number of bytes read, or a negative
+ * value on failure (bytes already read are not reported).
+ *
+ * Caller must hold the i2c chain resource.
+ */
+int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+             int len)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       int done = 0;
+       int ret;
+
+       if (!check_chip_resource(dd, i2c_target(target), __func__))
+               return -EACCES;
+
+       /* make sure the TWSI bus is in a sane state */
+       ret = hfi1_twsi_reset(dd, target);
+       if (ret) {
+               hfi1_dev_porterr(dd, ppd->port,
+                                "QSFP chain %d read interface reset failed\n",
+                                target);
+               return ret;
+       }
+
+       while (done < len) {
+               /*
+                * Set the qsfp page based on a zero-based address
+                * and a page size of QSFP_PAGESIZE bytes.
+                */
+               u8 page = (u8)(addr / QSFP_PAGESIZE);
+               int chunk = len - done;
+
+               ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE,
+                                 QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1);
+               if (ret != 1) {
+                       hfi1_dev_porterr(dd, ppd->port,
+                                        "QSFP chain %d can't write QSFP_PAGE_SELECT_BYTE: %d\n",
+                                        target, ret);
+                       ret = -EIO;
+                       break;
+               }
+
+               /* truncate read to boundary if crossing boundary */
+               if (((addr % QSFP_RW_BOUNDARY) + chunk) > QSFP_RW_BOUNDARY)
+                       chunk = QSFP_RW_BOUNDARY - (addr % QSFP_RW_BOUNDARY);
+
+               /* QSFPs require a 5-10msec delay after write operations */
+               mdelay(5);
+               ret = __i2c_read(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE,
+                                addr % QSFP_PAGESIZE, bp + done, chunk);
+               if (ret <= 0)   /* stop on error or nothing read */
+                       break;
+
+               done += ret;
+               addr += ret;
+       }
+
+       return ret < 0 ? ret : done;
+}
+
+/*
+ * Perform a stand-alone single QSFP read.  Acquire the resource, do the
+ * read, then release the resource.
+ *
+ * Returns the acquire_chip_resource() error if the resource cannot be
+ * obtained, otherwise the qsfp_read() result.
+ */
+int one_qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+                 int len)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u32 resource = qsfp_resource(dd);
+       int ret = acquire_chip_resource(dd, resource, QSFP_WAIT);
+
+       if (!ret) {
+               ret = qsfp_read(ppd, target, addr, bp, len);
+               release_chip_resource(dd, resource);
+       }
+
+       return ret;
+}
+
+/*
+ * This function caches the QSFP memory range in 128 byte chunks.
+ * As an example, the next byte after address 255 is byte 128 from
+ * upper page 01H (if existing) rather than byte 0 from lower page 00H.
+ * Access page n, offset m of QSFP memory as defined by SFF 8636
+ * in the cache by reading byte ((128 * n) + m)
+ * The calls to qsfp_{read,write} in this function correctly handle the
+ * address map difference between this mapping and the mapping implemented
+ * by those functions
+ *
+ * The cache is zeroed and marked invalid up front.  It is marked valid
+ * (and cache_refresh_required cleared) only after every required read
+ * returned the full length.
+ *
+ * Returns 0 on success and a negative errno on failure: -ENODEV when no
+ * module is present, the qsfp_read() error when a read fails, and -EIO
+ * when a read comes back short.  On failure the cache is left zeroed.
+ *
+ * The caller must be holding the QSFP i2c chain resource.
+ */
+int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp)
+{
+       u32 target = ppd->dd->hfi1_id;
+       int ret;
+       unsigned long flags;
+       u8 *cache = &cp->cache[0];
+
+       /* ensure sane contents on invalid reads, for cable swaps */
+       memset(cache, 0, (QSFP_MAX_NUM_PAGES * 128));
+       spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+       ppd->qsfp_info.cache_valid = 0;
+       spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags);
+
+       if (!qsfp_mod_present(ppd)) {
+               ret = -ENODEV;
+               goto bail;
+       }
+
+       ret = qsfp_read(ppd, target, 0, cache, QSFP_PAGESIZE);
+       if (ret != QSFP_PAGESIZE) {
+               dd_dev_info(ppd->dd,
+                           "%s: Page 0 read failed, expected %d, got %d\n",
+                           __func__, QSFP_PAGESIZE, ret);
+               goto bail;
+       }
+
+       /* Is paging enabled? */
+       if (!(cache[2] & 4)) {
+               /*
+                * Paging enabled, page 03 required.  Byte 195 tells which
+                * of the other upper pages exist: bit 6 for page 01 and
+                * bit 7 for page 02.  Upper page n lives at qsfp address
+                * (256 * n) + 128 and is cached at offset 128 * (n + 1).
+                */
+               const u8 page_opts = cache[195];
+               int page;
+
+               for (page = 1; page <= 3; page++) {
+                       if (page == 1 && !(page_opts & 0x40))
+                               continue;
+                       if (page == 2 && !(page_opts & 0x80))
+                               continue;
+
+                       ret = qsfp_read(ppd, target, (256 * page) + 128,
+                                       cache + (128 * (page + 1)), 128);
+                       if (ret != 128) {
+                               dd_dev_info(ppd->dd, "%s failed\n", __func__);
+                               goto bail;
+                       }
+               }
+       }
+
+       spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+       ppd->qsfp_info.cache_valid = 1;
+       ppd->qsfp_info.cache_refresh_required = 0;
+       spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags);
+
+       return 0;
+
+bail:
+       memset(cache, 0, (QSFP_MAX_NUM_PAGES * 128));
+       /*
+        * qsfp_read() returns a byte count, so a short read arrives here
+        * with ret >= 0.  Never hand that back as success (0) or as a
+        * positive value that is not an error code.
+        */
+       if (ret >= 0)
+               ret = -EIO;
+       return ret;
+}
+
+/*
+ * Printable names for the 16 possible device technology codes.
+ * qsfp_dump() indexes this table with the upper nibble (bits 7..4) of the
+ * byte at QSFP_MOD_TECH_OFFS.
+ */
+const char * const hfi1_qsfp_devtech[16] = {
+       "850nm VCSEL", "1310nm VCSEL", "1550nm VCSEL", "1310nm FP",
+       "1310nm DFB", "1550nm DFB", "1310nm EML", "1550nm EML",
+       "Cu Misc", "1490nm DFB", "Cu NoEq", "Cu Eq",
+       "Undef", "Cu Active BothEq", "Cu FarEq", "Cu NearEq"
+};
+
+#define QSFP_DUMP_CHUNK 16 /* Holds longest string */
+#define QSFP_DEFAULT_HDR_CNT 224
+
+#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3)
+#define QSFP_HIGH_PWR(pbyte) ((pbyte) & 3)
+/* For use with QSFP_HIGH_PWR macro */
+#define QSFP_HIGH_PWR_UNUSED   0 /* Bits [1:0] = 00 implies low power module */
+
+/*
+ * Decode the power class from the power class byte [Page 00 Byte 129] in
+ * SFF 8636.
+ *
+ * Bits [7:6] carry the low power group and bits [1:0] the high power group;
+ * a zero in bits [1:0] means the high power group is unused.
+ * Returns power class as integer (1 through 7, per SFF 8636 rev 2.4)
+ */
+int get_qsfp_power_class(u8 power_byte)
+{
+       int high_pwr = QSFP_HIGH_PWR(power_byte);
+
+       /*
+        * A non-zero high power encoding (1..3) wins; adding 4 steps over
+        * the four low power classes so the result lands on 5..7.
+        */
+       if (high_pwr != QSFP_HIGH_PWR_UNUSED)
+               return high_pwr + 4;
+
+       /* power classes count from 1, their bit encodings from 0 */
+       return QSFP_PWR(power_byte) + 1;
+}
+
+/*
+ * Report whether a QSFP module is present on this port.
+ *
+ * Reads ASIC_QSFP2_IN when dd->hfi1_id is non-zero and ASIC_QSFP1_IN
+ * otherwise.  Returns 1 when the QSFP_HFI0_MODPRST_N bit is clear and 0
+ * when it is set.
+ */
+int qsfp_mod_present(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 qsfp_in;
+
+       if (dd->hfi1_id)
+               qsfp_in = read_csr(dd, ASIC_QSFP2_IN);
+       else
+               qsfp_in = read_csr(dd, ASIC_QSFP1_IN);
+
+       return (qsfp_in & QSFP_HFI0_MODPRST_N) ? 0 : 1;
+}
+
+/*
+ * This function maps QSFP memory addresses in 128 byte chunks in the following
+ * fashion per the CableInfo SMA query definition in the IBA 1.3 spec/OPA Gen 1
+ * spec
+ * For addr 000-127, lower page 00h
+ * For addr 128-255, upper page 00h
+ * For addr 256-383, upper page 01h
+ * For addr 384-511, upper page 02h
+ * For addr 512-639, upper page 03h
+ *
+ * For addresses beyond this range, it returns the invalid range of data buffer
+ * set to 0.
+ * For upper pages that are optional, if they are not valid, returns the
+ * particular range of bytes in the data buffer set to 0.
+ *
+ * On every error return all @len bytes of @data are zeroed, so the caller
+ * never sees stale buffer contents.
+ *
+ * Returns 0 when data was copied from the cache (including a request that
+ * was truncated at the end of the mapped range), -EINVAL for an invalid
+ * port number or an invalid cache, -ENODEV when no module is present and
+ * -ERANGE when @addr lies outside the mapped range.
+ */
+int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr, u32 len,
+                  u8 *data)
+{
+       struct hfi1_pportdata *ppd;
+       const u32 cache_len = QSFP_MAX_NUM_PAGES * 128;
+       /* default to clearing the whole buffer: that is what errors need */
+       u32 excess_len = len;
+       int ret = 0;
+
+       if (port_num > dd->num_pports || port_num < 1) {
+               dd_dev_info(dd, "%s: Invalid port number %u\n",
+                           __func__, port_num);
+               ret = -EINVAL;
+               goto set_zeroes;
+       }
+
+       ppd = dd->pport + (port_num - 1);
+       if (!qsfp_mod_present(ppd)) {
+               ret = -ENODEV;
+               goto set_zeroes;
+       }
+
+       if (!ppd->qsfp_info.cache_valid) {
+               ret = -EINVAL;
+               goto set_zeroes;
+       }
+
+       if (addr >= cache_len) {
+               ret = -ERANGE;
+               goto set_zeroes;
+       }
+
+       /*
+        * addr < cache_len here, so the subtraction cannot wrap; comparing
+        * this way also avoids the u32 overflow of (addr + len).
+        */
+       if (len > cache_len - addr) {
+               /* copy what the cache has, zero only the tail beyond it */
+               excess_len = len - (cache_len - addr);
+               memcpy(data, &ppd->qsfp_info.cache[addr], (len - excess_len));
+               data += (len - excess_len);
+               goto set_zeroes;
+       }
+
+       memcpy(data, &ppd->qsfp_info.cache[addr], len);
+       return 0;
+
+set_zeroes:
+       memset(data, 0, excess_len);
+       return ret;
+}
+
+/*
+ * Power strings indexed by the value get_qsfp_power_class() returns (1..7);
+ * slot 0 is a placeholder.  qsfp_dump() prints them with "%.3sW", so only
+ * the first three characters of each entry are used.
+ */
+static const char * const pwr_codes[8] = {
+       "N/AW",
+       "1.5W",
+       "2.0W",
+       "2.5W",
+       "3.5W",
+       "4.0W",
+       "4.5W",
+       "5.0W"
+};
+
+/*
+ * Format a text dump of the cached QSFP data for @ppd into @buf.
+ *
+ * Emits the decoded identification fields (power, technology, vendor, OUI,
+ * part number, revision, attenuation for copper, serial, date, lot)
+ * followed by a hex dump of the first QSFP_DEFAULT_HDR_CNT cache bytes,
+ * QSFP_DUMP_CHUNK bytes per line.  At most @len bytes of @buf are used.
+ *
+ * Returns the number of characters written; 0 if the cache is not valid.
+ */
+int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len)
+{
+       u8 *cache = &ppd->qsfp_info.cache[0];
+       u8 *atten = &cache[QSFP_ATTEN_OFFS];
+       u8 *vendor_oui = &cache[QSFP_VOUI_OFFS];
+       char lenstr[6] = " ";
+       int sofar = 0;
+       int row, col;
+       int is_cu;
+       u8 tech;
+       u8 power_byte;
+
+       if (!ppd->qsfp_info.cache_valid)
+               return 0;
+
+       tech = cache[QSFP_MOD_TECH_OFFS];
+       is_cu = QSFP_IS_CU(tech);
+
+       /* the length byte is only meaningful for copper cables */
+       if (is_cu)
+               sprintf(lenstr, "%dM ", cache[QSFP_MOD_LEN_OFFS]);
+
+       power_byte = cache[QSFP_MOD_PWR_OFFS];
+       sofar += scnprintf(buf + sofar, len - sofar, "PWR:%.3sW\n",
+                          pwr_codes[get_qsfp_power_class(power_byte)]);
+
+       sofar += scnprintf(buf + sofar, len - sofar, "TECH:%s%s\n",
+                          lenstr, hfi1_qsfp_devtech[tech >> 4]);
+
+       sofar += scnprintf(buf + sofar, len - sofar, "Vendor:%.*s\n",
+                          QSFP_VEND_LEN, &cache[QSFP_VEND_OFFS]);
+
+       sofar += scnprintf(buf + sofar, len - sofar, "OUI:%06X\n",
+                          QSFP_OUI(vendor_oui));
+
+       sofar += scnprintf(buf + sofar, len - sofar, "Part#:%.*s\n",
+                          QSFP_PN_LEN, &cache[QSFP_PN_OFFS]);
+
+       sofar += scnprintf(buf + sofar, len - sofar, "Rev:%.*s\n",
+                          QSFP_REV_LEN, &cache[QSFP_REV_OFFS]);
+
+       if (is_cu)
+               sofar += scnprintf(buf + sofar, len - sofar,
+                                  "Atten:%d, %d\n",
+                                  QSFP_ATTEN_SDR(atten),
+                                  QSFP_ATTEN_DDR(atten));
+
+       sofar += scnprintf(buf + sofar, len - sofar, "Serial:%.*s\n",
+                          QSFP_SN_LEN, &cache[QSFP_SN_OFFS]);
+
+       sofar += scnprintf(buf + sofar, len - sofar, "Date:%.*s\n",
+                          QSFP_DATE_LEN, &cache[QSFP_DATE_OFFS]);
+
+       sofar += scnprintf(buf + sofar, len - sofar, "Lot:%.*s\n",
+                          QSFP_LOT_LEN, &cache[QSFP_LOT_OFFS]);
+
+       /* raw hex dump, one QSFP_DUMP_CHUNK sized row per line */
+       for (row = 0; row < QSFP_DEFAULT_HDR_CNT; row += QSFP_DUMP_CHUNK) {
+               for (col = 0; col < QSFP_DUMP_CHUNK; col++)
+                       sofar += scnprintf(buf + sofar, len - sofar,
+                                          " %02X", cache[row + col]);
+               sofar += scnprintf(buf + sofar, len - sofar, "\n");
+       }
+
+       return sofar;
+}
diff --git a/drivers/infiniband/hw/hfi1/qsfp.h b/drivers/infiniband/hw/hfi1/qsfp.h
new file mode 100644 (file)
index 0000000..dadc66c
--- /dev/null
@@ -0,0 +1,240 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+/* QSFP support common definitions, for hfi driver */
+
+#define QSFP_DEV 0xA0
+#define QSFP_PWR_LAG_MSEC 2000
+#define QSFP_MODPRS_LAG_MSEC 20
+/* 128 byte pages, per SFF 8636 rev 2.4 */
+#define QSFP_MAX_NUM_PAGES     5
+
+/*
+ * Below are masks for QSFP pins.  Pins are the same for HFI0 and HFI1.
+ * _N means asserted low
+ */
+#define QSFP_HFI0_I2CCLK    BIT(0)
+#define QSFP_HFI0_I2CDAT    BIT(1)
+#define QSFP_HFI0_RESET_N   BIT(2)
+#define QSFP_HFI0_INT_N            BIT(3)
+#define QSFP_HFI0_MODPRST_N BIT(4)
+
+/* QSFP is paged at 256 bytes */
+#define QSFP_PAGESIZE 256
+/* Reads/writes cannot cross 128 byte boundaries */
+#define QSFP_RW_BOUNDARY 128
+
+/* number of bytes in i2c offset for QSFP devices */
+#define __QSFP_OFFSET_SIZE 1                           /* num address bytes */
+#define QSFP_OFFSET_SIZE (__QSFP_OFFSET_SIZE << 8)     /* shifted value */
+
+/* Defined fields that Intel requires of qualified cables */
+/* Byte 0 is Identifier, not checked */
+/* Byte 1 is reserved "status MSB" */
+#define QSFP_TX_CTRL_BYTE_OFFS 86
+#define QSFP_PWR_CTRL_BYTE_OFFS 93
+#define QSFP_CDR_CTRL_BYTE_OFFS 98
+
+#define QSFP_PAGE_SELECT_BYTE_OFFS 127
+/* Byte 128 is Identifier: must be 0x0c for QSFP, or 0x0d for QSFP+ */
+#define QSFP_MOD_ID_OFFS 128
+/*
+ * Byte 129 is "Extended Identifier".
+ * For bits [7:6]: 0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W
+ * For bits [1:0]: 0:Unused, 1:4W, 2:4.5W, 3:5W
+ */
+#define QSFP_MOD_PWR_OFFS 129
+/* Byte 130 is Connector type. Not Intel req'd */
+/* Bytes 131..138 are Transceiver types, bit maps for various tech, none IB */
+/* Byte 139 is encoding. code 0x01 is 8b10b. Not Intel req'd */
+/* byte 140 is nominal bit-rate, in units of 100Mbits/sec */
+#define QSFP_NOM_BIT_RATE_100_OFFS 140
+/* Byte 141 is Extended Rate Select. Not Intel req'd */
+/* Bytes 142..145 are lengths for various fiber types. Not Intel req'd */
+/* Byte 146 is length for Copper. Units of 1 meter */
+#define QSFP_MOD_LEN_OFFS 146
+/*
+ * Byte 147 is Device technology. D0..3 not Intel req'd
+ * D4..7 select from 15 choices, translated by table:
+ */
+#define QSFP_MOD_TECH_OFFS 147
+extern const char *const hfi1_qsfp_devtech[16];
+/* Active Equalization includes fiber, copper full EQ, and copper near Eq */
+#define QSFP_IS_ACTIVE(tech) ((0xA2FF >> ((tech) >> 4)) & 1)
+/* Active Equalization includes fiber, copper full EQ, and copper far Eq */
+#define QSFP_IS_ACTIVE_FAR(tech) ((0x32FF >> ((tech) >> 4)) & 1)
+/* Attenuation should be valid for copper other than full/near Eq */
+#define QSFP_HAS_ATTEN(tech) ((0x4D00 >> ((tech) >> 4)) & 1)
+/* Length is only valid if technology is "copper" */
+#define QSFP_IS_CU(tech) ((0xED00 >> ((tech) >> 4)) & 1)
+#define QSFP_TECH_1490 9
+
+/* Assemble the 3-byte vendor OUI (most significant byte first) */
+#define QSFP_OUI(oui) (((unsigned)(oui)[0] << 16) | \
+                       ((unsigned)(oui)[1] << 8) | (oui)[2])
+#define QSFP_OUI_AMPHENOL 0x415048
+#define QSFP_OUI_FINISAR  0x009065
+#define QSFP_OUI_GORE     0x002177
+
+/* Bytes 148..163 are Vendor Name, Left-justified Blank-filled */
+#define QSFP_VEND_OFFS 148
+#define QSFP_VEND_LEN 16
+/* Byte 164 is IB Extended transceiver codes Bits D0..3 are SDR,DDR,QDR,EDR */
+#define QSFP_IBXCV_OFFS 164
+/* Bytes 165..167 are Vendor OUI number */
+#define QSFP_VOUI_OFFS 165
+#define QSFP_VOUI_LEN 3
+/* Bytes 168..183 are Vendor Part Number, string */
+#define QSFP_PN_OFFS 168
+#define QSFP_PN_LEN 16
+/* Bytes 184,185 are Vendor Rev. Left Justified, Blank-filled */
+#define QSFP_REV_OFFS 184
+#define QSFP_REV_LEN 2
+/*
+ * Bytes 186,187 are Wavelength, if Optical. Not Intel req'd
+ *  If copper, they are attenuation in dB:
+ * Byte 186 is at 2.5Gb/sec (SDR), Byte 187 at 5.0Gb/sec (DDR)
+ */
+#define QSFP_ATTEN_OFFS 186
+#define QSFP_ATTEN_LEN 2
+/*
+ * Bytes 188,189 are Wavelength tolerance, if optical
+ * If copper, they are attenuation in dB:
+ * Byte 188 is at 12.5 Gb/s, Byte 189 at 25 Gb/s
+ */
+#define QSFP_CU_ATTEN_7G_OFFS 188
+#define QSFP_CU_ATTEN_12G_OFFS 189
+/* Byte 190 is Max Case Temp. Not Intel req'd */
+/* Byte 191 is LSB of sum of bytes 128..190. Not Intel req'd */
+#define QSFP_CC_OFFS 191
+#define QSFP_EQ_INFO_OFFS 193
+#define QSFP_CDR_INFO_OFFS 194
+/* Bytes 196..211 are Serial Number, String */
+#define QSFP_SN_OFFS 196
+#define QSFP_SN_LEN 16
+/* Bytes 212..219 are date-code YYMMDD (MM==1 for Jan) */
+#define QSFP_DATE_OFFS 212
+#define QSFP_DATE_LEN 6
+/* Bytes 218,219 are optional lot-code, string */
+#define QSFP_LOT_OFFS 218
+#define QSFP_LOT_LEN 2
+/* Bytes 220, 221 indicate monitoring options, Not Intel req'd */
+/* Byte 222 indicates nominal bitrate in units of 250Mbits/sec */
+#define QSFP_NOM_BIT_RATE_250_OFFS 222
+/* Byte 223 is LSB of sum of bytes 192..222 */
+#define QSFP_CC_EXT_OFFS 223
+
+/*
+ * Interrupt flag masks
+ */
+#define QSFP_DATA_NOT_READY            0x01
+
+#define QSFP_HIGH_TEMP_ALARM           0x80
+#define QSFP_LOW_TEMP_ALARM            0x40
+#define QSFP_HIGH_TEMP_WARNING         0x20
+#define QSFP_LOW_TEMP_WARNING          0x10
+
+#define QSFP_HIGH_VCC_ALARM            0x80
+#define QSFP_LOW_VCC_ALARM             0x40
+#define QSFP_HIGH_VCC_WARNING          0x20
+#define QSFP_LOW_VCC_WARNING           0x10
+
+#define QSFP_HIGH_POWER_ALARM          0x88
+#define QSFP_LOW_POWER_ALARM           0x44
+#define QSFP_HIGH_POWER_WARNING                0x22
+#define QSFP_LOW_POWER_WARNING         0x11
+
+#define QSFP_HIGH_BIAS_ALARM           0x88
+#define QSFP_LOW_BIAS_ALARM            0x44
+#define QSFP_HIGH_BIAS_WARNING         0x22
+#define QSFP_LOW_BIAS_WARNING          0x11
+
+/* Accessors for the two attenuation bytes starting at QSFP_ATTEN_OFFS */
+#define QSFP_ATTEN_SDR(attenarray) ((attenarray)[0])
+#define QSFP_ATTEN_DDR(attenarray) ((attenarray)[1])
+
+/*
+ * struct qsfp_data encapsulates state of QSFP device for one port.
+ * it will be part of port-specific data if a board supports QSFP.
+ *
+ * Since multiple board-types use QSFP, and their pport_data structs
+ * differ (in the chip-specific section), we need a pointer to its head.
+ *
+ * Avoiding premature optimization, we will have one work_struct per port,
+ * and let the qsfp_lock arbitrate access to common resources.
+ *
+ */
+struct qsfp_data {
+       /* Helps to find our way */
+       struct hfi1_pportdata *ppd;
+       struct work_struct qsfp_work;
+       /* QSFP_MAX_NUM_PAGES chunks of 128 bytes, addressed by get_cable_info() */
+       u8 cache[QSFP_MAX_NUM_PAGES * 128];
+       /* protect qsfp data */
+       spinlock_t qsfp_lock;
+       u8 check_interrupt_flags;
+       u8 reset_needed;
+       u8 limiting_active;
+       /* non-zero when cache[] may be used; get_cable_info() and qsfp_dump() check it */
+       u8 cache_valid;
+       u8 cache_refresh_required;
+};
+
+int refresh_qsfp_cache(struct hfi1_pportdata *ppd,
+                      struct qsfp_data *cp);
+int get_qsfp_power_class(u8 power_byte);
+int qsfp_mod_present(struct hfi1_pportdata *ppd);
+int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr,
+                  u32 len, u8 *data);
+
+int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
+             int offset, void *bp, int len);
+int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
+            int offset, void *bp, int len);
+int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+              int len);
+int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+             int len);
+int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+                  int len);
+int one_qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+                 int len);
diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
new file mode 100644 (file)
index 0000000..792f15e
--- /dev/null
@@ -0,0 +1,2580 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/io.h>
+#include <rdma/rdma_vt.h>
+#include <rdma/rdmavt_qp.h>
+
+#include "hfi.h"
+#include "qp.h"
+#include "verbs_txreq.h"
+#include "trace.h"
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_RC_##x
+
+/**
+ * hfi1_add_retry_timer - add/start a retry timer
+ * @qp: the QP
+ *
+ * Mark the QP with RVT_S_TIMER and arm qp->s_timer to expire after the QP
+ * timeout plus the device's busy_jiffies.
+ */
+static inline void hfi1_add_retry_timer(struct rvt_qp *qp)
+{
+       struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+
+       qp->s_flags |= RVT_S_TIMER;
+
+       /* 4.096 usec. * (1 << qp->timeout) */
+       qp->s_timer.expires = jiffies + qp->timeout_jiffies +
+                             rdi->busy_jiffies;
+       add_timer(&qp->s_timer);
+}
+
+/**
+ * hfi1_add_rnr_timer - add/start an rnr timer
+ * @qp: the QP
+ * @to: timeout in usecs
+ *
+ * Mark the QP with RVT_S_WAIT_RNR and arm the private rnr timer to expire
+ * @to microseconds from now.
+ */
+void hfi1_add_rnr_timer(struct rvt_qp *qp, u32 to)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       qp->s_flags |= RVT_S_WAIT_RNR;
+       /*
+        * The expiry must be programmed on the timer that is armed below;
+        * qp->s_timer is the separate retry timer and must not be touched.
+        */
+       priv->s_rnr_timer.expires = jiffies + usecs_to_jiffies(to);
+       add_timer(&priv->s_rnr_timer);
+}
+
+/**
+ * hfi1_mod_retry_timer - mod a retry timer
+ * @qp: the QP
+ *
+ * Mark the QP with RVT_S_TIMER and (re)program a potentially already
+ * running retry timer to expire after the QP timeout plus the device's
+ * busy_jiffies.
+ */
+static inline void hfi1_mod_retry_timer(struct rvt_qp *qp)
+{
+       struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+       unsigned long expires;
+
+       qp->s_flags |= RVT_S_TIMER;
+
+       /* 4.096 usec. * (1 << qp->timeout) */
+       expires = jiffies + qp->timeout_jiffies + rdi->busy_jiffies;
+       mod_timer(&qp->s_timer, expires);
+}
+
+/**
+ * hfi1_stop_retry_timer - stop a retry timer
+ * @qp: the QP
+ *
+ * If RVT_S_TIMER is set, clear it and delete the retry timer.
+ *
+ * Return: the result of del_timer() on the retry timer, or 0 when
+ * RVT_S_TIMER was not set.
+ */
+static inline int hfi1_stop_retry_timer(struct rvt_qp *qp)
+{
+       /* Nothing to remove if the QP is not on the retry timer */
+       if (!(qp->s_flags & RVT_S_TIMER))
+               return 0;
+
+       qp->s_flags &= ~RVT_S_TIMER;
+       return del_timer(&qp->s_timer);
+}
+
+/**
+ * hfi1_stop_rc_timers - stop all timers
+ * @qp: the QP
+ *
+ * If either RVT_S_TIMER or RVT_S_WAIT_RNR is set, clear both flags and
+ * delete both the retry timer and the rnr timer.
+ */
+void hfi1_stop_rc_timers(struct rvt_qp *qp)
+{
+       struct hfi1_qp_priv *qpriv = qp->priv;
+       const u32 timer_flags = RVT_S_TIMER | RVT_S_WAIT_RNR;
+
+       /* Neither timer flag set: nothing to do */
+       if (!(qp->s_flags & timer_flags))
+               return;
+
+       /* Remove QP from all timers */
+       qp->s_flags &= ~timer_flags;
+       del_timer(&qp->s_timer);
+       del_timer(&qpriv->s_rnr_timer);
+}
+
+/**
+ * hfi1_stop_rnr_timer - stop an rnr timer
+ * @qp: the QP
+ *
+ * If RVT_S_WAIT_RNR is set, clear it and delete the rnr timer.
+ *
+ * Return: the result of del_timer() on the rnr timer, or 0 when
+ * RVT_S_WAIT_RNR was not set.
+ */
+static inline int hfi1_stop_rnr_timer(struct rvt_qp *qp)
+{
+       struct hfi1_qp_priv *qpriv = qp->priv;
+
+       /* Nothing to remove if the QP is not waiting on the rnr timer */
+       if (!(qp->s_flags & RVT_S_WAIT_RNR))
+               return 0;
+
+       qp->s_flags &= ~RVT_S_WAIT_RNR;
+       return del_timer(&qpriv->s_rnr_timer);
+}
+
+/**
+ * hfi1_del_timers_sync - wait for any timeout routines to exit
+ * @qp: the QP
+ *
+ * Synchronously delete the retry timer first, then the rnr timer.
+ */
+void hfi1_del_timers_sync(struct rvt_qp *qp)
+{
+       struct hfi1_qp_priv *qpriv;
+
+       del_timer_sync(&qp->s_timer);
+
+       qpriv = qp->priv;
+       del_timer_sync(&qpriv->s_rnr_timer);
+}
+
+/*
+ * only opcode mask for adaptive pio
+ *
+ * One bit per listed RC opcode, at the position given by the low five
+ * bits of the opcode.  The mask is applied to the expanded OP() value,
+ * outside the macro argument, so the expression does not depend on how
+ * OP() token-pastes its argument.
+ */
+const u32 rc_only_opcode =
+       BIT(OP(SEND_ONLY) & 0x1f) |
+       BIT(OP(SEND_ONLY_WITH_IMMEDIATE) & 0x1f) |
+       BIT(OP(RDMA_WRITE_ONLY) & 0x1f) |
+       BIT(OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE) & 0x1f) |
+       BIT(OP(RDMA_READ_REQUEST) & 0x1f) |
+       BIT(OP(ACKNOWLEDGE) & 0x1f) |
+       BIT(OP(ATOMIC_ACKNOWLEDGE) & 0x1f) |
+       BIT(OP(COMPARE_SWAP) & 0x1f) |
+       BIT(OP(FETCH_ADD) & 0x1f);
+
+/*
+ * Re-initialise the SGE state @ss from @wqe and call hfi1_skip_sge() for
+ * the bytes that precede @psn, computed as the PSN delta from the start of
+ * the request times @pmtu.
+ *
+ * Returns the request length minus the skipped byte count.
+ */
+static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
+                      u32 psn, u32 pmtu)
+{
+       u32 skip = delta_psn(psn, wqe->psn) * pmtu;
+
+       /* start over from the first SGE of the work request */
+       ss->total_len = wqe->length;
+       ss->num_sge = wqe->wr.num_sge;
+       ss->sg_list = wqe->sg_list + 1;
+       ss->sge = wqe->sg_list[0];
+
+       hfi1_skip_sge(ss, skip, 0);
+
+       return wqe->length - skip;
+}
+
+/**
+ * make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
+ * @dev: the device for this QP (not referenced in the function body)
+ * @qp: a pointer to the QP
+ * @ohdr: a pointer to the IB header being constructed
+ * @ps: the xmit packet state
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ * When 0 is returned, s_ack_state has been reset to ACKNOWLEDGE and the
+ * RVT_S_RESP_PENDING, RVT_S_ACK_PENDING and RVT_S_AHG_VALID flags have
+ * been cleared.
+ * Note that we are in the responder's side of the QP context.
+ * Note the QP s_lock must be held.
+ */
+static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
+                      struct hfi1_other_headers *ohdr,
+                      struct hfi1_pkt_state *ps)
+{
+       struct rvt_ack_entry *e;
+       u32 hwords;
+       u32 len;
+       u32 bth0;
+       u32 bth2;
+       int middle = 0;
+       u32 pmtu = qp->pmtu;
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       /* Don't send an ACK if we aren't supposed to. */
+       if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
+               goto bail;
+
+       /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+       hwords = 5;
+
+       /* s_ack_state records the previous response built; pick the next */
+       switch (qp->s_ack_state) {
+       case OP(RDMA_READ_RESPONSE_LAST):
+       case OP(RDMA_READ_RESPONSE_ONLY):
+               /* the read response is complete: drop its MR reference */
+               e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+               if (e->rdma_sge.mr) {
+                       rvt_put_mr(e->rdma_sge.mr);
+                       e->rdma_sge.mr = NULL;
+               }
+               /* FALLTHROUGH */
+       case OP(ATOMIC_ACKNOWLEDGE):
+               /*
+                * We can increment the tail pointer now that the last
+                * response has been sent instead of only being
+                * constructed.
+                * The index wraps to 0 once it passes HFI1_MAX_RDMA_ATOMIC.
+                */
+               if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC)
+                       qp->s_tail_ack_queue = 0;
+               /* FALLTHROUGH */
+       case OP(SEND_ONLY):
+       case OP(ACKNOWLEDGE):
+               /* Check for no next entry in the queue. */
+               if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
+                       if (qp->s_flags & RVT_S_ACK_PENDING)
+                               goto normal;
+                       goto bail;
+               }
+
+               e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+               if (e->opcode == OP(RDMA_READ_REQUEST)) {
+                       /*
+                        * If a RDMA read response is being resent and
+                        * we haven't seen the duplicate request yet,
+                        * then stop sending the remaining responses the
+                        * responder has seen until the requester re-sends it.
+                        */
+                       len = e->rdma_sge.sge_length;
+                       if (len && !e->rdma_sge.mr) {
+                               qp->s_tail_ack_queue = qp->r_head_ack_queue;
+                               goto bail;
+                       }
+                       /* Copy SGE state in case we need to resend */
+                       ps->s_txreq->mr = e->rdma_sge.mr;
+                       if (ps->s_txreq->mr)
+                               rvt_get_mr(ps->s_txreq->mr);
+                       qp->s_ack_rdma_sge.sge = e->rdma_sge;
+                       qp->s_ack_rdma_sge.num_sge = 1;
+                       qp->s_cur_sge = &qp->s_ack_rdma_sge;
+                       /* more than one pmtu of data: start a multi-packet response */
+                       if (len > pmtu) {
+                               len = pmtu;
+                               qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
+                       } else {
+                               qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
+                               e->sent = 1;
+                       }
+                       ohdr->u.aeth = hfi1_compute_aeth(qp);
+                       hwords++;
+                       qp->s_ack_rdma_psn = e->psn;
+                       bth2 = mask_psn(qp->s_ack_rdma_psn++);
+               } else {
+                       /* COMPARE_SWAP or FETCH_ADD */
+                       qp->s_cur_sge = NULL;
+                       len = 0;
+                       qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
+                       ohdr->u.at.aeth = hfi1_compute_aeth(qp);
+                       /* 64-bit atomic result, upper 32 bits first */
+                       ohdr->u.at.atomic_ack_eth[0] =
+                               cpu_to_be32(e->atomic_data >> 32);
+                       ohdr->u.at.atomic_ack_eth[1] =
+                               cpu_to_be32(e->atomic_data);
+                       hwords += sizeof(ohdr->u.at) / sizeof(u32);
+                       bth2 = mask_psn(e->psn);
+                       e->sent = 1;
+               }
+               bth0 = qp->s_ack_state << 24;
+               break;
+
+       case OP(RDMA_READ_RESPONSE_FIRST):
+               qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
+               /* FALLTHROUGH */
+       case OP(RDMA_READ_RESPONSE_MIDDLE):
+               qp->s_cur_sge = &qp->s_ack_rdma_sge;
+               ps->s_txreq->mr = qp->s_ack_rdma_sge.sge.mr;
+               if (ps->s_txreq->mr)
+                       rvt_get_mr(ps->s_txreq->mr);
+               len = qp->s_ack_rdma_sge.sge.sge_length;
+               if (len > pmtu) {
+                       /* more data follows: state stays at MIDDLE */
+                       len = pmtu;
+                       middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+               } else {
+                       /* remaining data fits in this packet: it is the last */
+                       ohdr->u.aeth = hfi1_compute_aeth(qp);
+                       hwords++;
+                       qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+                       e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+                       e->sent = 1;
+               }
+               bth0 = qp->s_ack_state << 24;
+               bth2 = mask_psn(qp->s_ack_rdma_psn++);
+               break;
+
+       default:
+normal:
+               /*
+                * Send a regular ACK.
+                * Set the s_ack_state so we wait until after sending
+                * the ACK before setting s_ack_state to ACKNOWLEDGE
+                * (see above).
+                */
+               qp->s_ack_state = OP(SEND_ONLY);
+               qp->s_flags &= ~RVT_S_ACK_PENDING;
+               qp->s_cur_sge = NULL;
+               /* a pending NAK code takes the place of the computed AETH */
+               if (qp->s_nak_state)
+                       ohdr->u.aeth =
+                               cpu_to_be32((qp->r_msn & HFI1_MSN_MASK) |
+                                           (qp->s_nak_state <<
+                                            HFI1_AETH_CREDIT_SHIFT));
+               else
+                       ohdr->u.aeth = hfi1_compute_aeth(qp);
+               hwords++;
+               len = 0;
+               bth0 = OP(ACKNOWLEDGE) << 24;
+               bth2 = mask_psn(qp->s_ack_psn);
+       }
+       qp->s_rdma_ack_cnt++;
+       qp->s_hdrwords = hwords;
+       ps->s_txreq->sde = priv->s_sde;
+       qp->s_cur_size = len;
+       hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle, ps);
+       /* pbc */
+       ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
+       return 1;
+
+bail:
+       qp->s_ack_state = OP(ACKNOWLEDGE);
+       /*
+        * Ensure s_rdma_ack_cnt changes are committed prior to resetting
+        * RVT_S_RESP_PENDING
+        */
+       smp_wmb();
+       qp->s_flags &= ~(RVT_S_RESP_PENDING
+                               | RVT_S_ACK_PENDING
+                               | RVT_S_AHG_VALID);
+       return 0;
+}
+
+/**
+ * hfi1_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
+ * @qp: a pointer to the QP
+ * @ps: the packet state; ps->s_txreq receives the tx request obtained from
+ *      get_txreq() and is reset to NULL on the done_free_tx, bail and
+ *      bail_no_tx exits
+ *
+ * Assumes s_lock is held.
+ *
+ * If RVT_S_RESP_PENDING is set, make_rc_ack() is tried first; otherwise the
+ * next request packet is built from the SWQE at qp->s_cur according to
+ * qp->s_state.
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ * 1 is also returned, with ps->s_txreq set to NULL, after one work request
+ * has been flushed in the error state (the function "will get called
+ * again", see done_free_tx).  When 0 is returned RVT_S_BUSY is cleared and
+ * qp->s_hdrwords is set to 0.
+ */
+int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+       struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+       struct hfi1_other_headers *ohdr;
+       struct rvt_sge_state *ss;
+       struct rvt_swqe *wqe;
+       /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+       u32 hwords = 5;
+       u32 len;
+       u32 bth0 = 0;
+       u32 bth2;
+       u32 pmtu = qp->pmtu;
+       char newreq;
+       int middle = 0;
+       int delta;
+
+       ps->s_txreq = get_txreq(ps->dev, qp);
+       if (IS_ERR(ps->s_txreq))
+               goto bail_no_tx;
+
+       ohdr = &ps->s_txreq->phdr.hdr.u.oth;
+       if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+               ohdr = &ps->s_txreq->phdr.hdr.u.l.oth;
+
+       /* Sending responses has higher priority over sending requests. */
+       if ((qp->s_flags & RVT_S_RESP_PENDING) &&
+           make_rc_ack(dev, qp, ohdr, ps))
+               return 1;
+
+       if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
+               if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
+                       goto bail;
+               /* We are in the error state, flush the work request. */
+               smp_read_barrier_depends(); /* see post_one_send() */
+               if (qp->s_last == ACCESS_ONCE(qp->s_head))
+                       goto bail;
+               /* If DMAs are in progress, we can't flush immediately. */
+               if (iowait_sdma_pending(&priv->s_iowait)) {
+                       qp->s_flags |= RVT_S_WAIT_DMA;
+                       goto bail;
+               }
+               clear_ahg(qp);
+               wqe = rvt_get_swqe_ptr(qp, qp->s_last);
+               hfi1_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
+                       IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
+               /* will get called again */
+               goto done_free_tx;
+       }
+
+       if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK))
+               goto bail;
+
+       if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
+               if (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
+                       qp->s_flags |= RVT_S_WAIT_PSN;
+                       goto bail;
+               }
+               qp->s_sending_psn = qp->s_psn;
+               qp->s_sending_hpsn = qp->s_psn - 1;
+       }
+
+       /* Send a request. */
+       wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
+       switch (qp->s_state) {
+       default:
+               if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK))
+                       goto bail;
+               /*
+                * Resend an old request or start a new one.
+                *
+                * We keep track of the current SWQE so that
+                * we don't reset the "furthest progress" state
+                * if we need to back up.
+                */
+               newreq = 0;
+               if (qp->s_cur == qp->s_tail) {
+                       /* Check if send work queue is empty. */
+                       if (qp->s_tail == qp->s_head) {
+                               clear_ahg(qp);
+                               goto bail;
+                       }
+                       /*
+                        * If a fence is requested, wait for previous
+                        * RDMA read and atomic operations to finish.
+                        */
+                       if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
+                           qp->s_num_rd_atomic) {
+                               qp->s_flags |= RVT_S_WAIT_FENCE;
+                               goto bail;
+                       }
+                       newreq = 1;
+                       qp->s_psn = wqe->psn;
+               }
+               /*
+                * Note that we have to be careful not to modify the
+                * original work request since we may need to resend
+                * it.
+                */
+               len = wqe->length;
+               ss = &qp->s_sge;
+               bth2 = mask_psn(qp->s_psn);
+               switch (wqe->wr.opcode) {
+               case IB_WR_SEND:
+               case IB_WR_SEND_WITH_IMM:
+                       /* If no credit, return. */
+                       if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
+                           cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
+                               qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
+                               goto bail;
+                       }
+                       if (len > pmtu) {
+                               qp->s_state = OP(SEND_FIRST);
+                               len = pmtu;
+                               break;
+                       }
+                       if (wqe->wr.opcode == IB_WR_SEND) {
+                               qp->s_state = OP(SEND_ONLY);
+                       } else {
+                               qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
+                               /* Immediate data comes after the BTH */
+                               ohdr->u.imm_data = wqe->wr.ex.imm_data;
+                               hwords += 1;
+                       }
+                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                               bth0 |= IB_BTH_SOLICITED;
+                       bth2 |= IB_BTH_REQ_ACK;
+                       if (++qp->s_cur == qp->s_size)
+                               qp->s_cur = 0;
+                       break;
+
+               case IB_WR_RDMA_WRITE:
+                       if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+                               qp->s_lsn++;
+                       /* FALLTHROUGH */
+               case IB_WR_RDMA_WRITE_WITH_IMM:
+                       /* If no credit, return. */
+                       if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
+                           cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
+                               qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
+                               goto bail;
+                       }
+                       ohdr->u.rc.reth.vaddr =
+                               cpu_to_be64(wqe->rdma_wr.remote_addr);
+                       ohdr->u.rc.reth.rkey =
+                               cpu_to_be32(wqe->rdma_wr.rkey);
+                       ohdr->u.rc.reth.length = cpu_to_be32(len);
+                       hwords += sizeof(struct ib_reth) / sizeof(u32);
+                       if (len > pmtu) {
+                               qp->s_state = OP(RDMA_WRITE_FIRST);
+                               len = pmtu;
+                               break;
+                       }
+                       if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
+                               qp->s_state = OP(RDMA_WRITE_ONLY);
+                       } else {
+                               qp->s_state =
+                                       OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
+                               /* Immediate data comes after RETH */
+                               ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
+                               hwords += 1;
+                               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                                       bth0 |= IB_BTH_SOLICITED;
+                       }
+                       bth2 |= IB_BTH_REQ_ACK;
+                       if (++qp->s_cur == qp->s_size)
+                               qp->s_cur = 0;
+                       break;
+
+               case IB_WR_RDMA_READ:
+                       /*
+                        * Don't allow more operations to be started
+                        * than the QP limits allow.
+                        */
+                       if (newreq) {
+                               if (qp->s_num_rd_atomic >=
+                                   qp->s_max_rd_atomic) {
+                                       qp->s_flags |= RVT_S_WAIT_RDMAR;
+                                       goto bail;
+                               }
+                               qp->s_num_rd_atomic++;
+                               if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+                                       qp->s_lsn++;
+                       }
+                       ohdr->u.rc.reth.vaddr =
+                               cpu_to_be64(wqe->rdma_wr.remote_addr);
+                       ohdr->u.rc.reth.rkey =
+                               cpu_to_be32(wqe->rdma_wr.rkey);
+                       ohdr->u.rc.reth.length = cpu_to_be32(len);
+                       qp->s_state = OP(RDMA_READ_REQUEST);
+                       hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
+                       ss = NULL;
+                       len = 0;
+                       bth2 |= IB_BTH_REQ_ACK;
+                       if (++qp->s_cur == qp->s_size)
+                               qp->s_cur = 0;
+                       break;
+
+               case IB_WR_ATOMIC_CMP_AND_SWP:
+               case IB_WR_ATOMIC_FETCH_AND_ADD:
+                       /*
+                        * Don't allow more operations to be started
+                        * than the QP limits allow.
+                        */
+                       if (newreq) {
+                               if (qp->s_num_rd_atomic >=
+                                   qp->s_max_rd_atomic) {
+                                       qp->s_flags |= RVT_S_WAIT_RDMAR;
+                                       goto bail;
+                               }
+                               qp->s_num_rd_atomic++;
+                               if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+                                       qp->s_lsn++;
+                       }
+                       if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+                               qp->s_state = OP(COMPARE_SWAP);
+                               ohdr->u.atomic_eth.swap_data = cpu_to_be64(
+                                       wqe->atomic_wr.swap);
+                               ohdr->u.atomic_eth.compare_data = cpu_to_be64(
+                                       wqe->atomic_wr.compare_add);
+                       } else {
+                               qp->s_state = OP(FETCH_ADD);
+                               ohdr->u.atomic_eth.swap_data = cpu_to_be64(
+                                       wqe->atomic_wr.compare_add);
+                               ohdr->u.atomic_eth.compare_data = 0;
+                       }
+                       ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
+                               wqe->atomic_wr.remote_addr >> 32);
+                       ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
+                               wqe->atomic_wr.remote_addr);
+                       ohdr->u.atomic_eth.rkey = cpu_to_be32(
+                               wqe->atomic_wr.rkey);
+                       hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
+                       ss = NULL;
+                       len = 0;
+                       bth2 |= IB_BTH_REQ_ACK;
+                       if (++qp->s_cur == qp->s_size)
+                               qp->s_cur = 0;
+                       break;
+
+               default:
+                       goto bail; /* opcode not handled by this function */
+               }
+               qp->s_sge.sge = wqe->sg_list[0];
+               qp->s_sge.sg_list = wqe->sg_list + 1;
+               qp->s_sge.num_sge = wqe->wr.num_sge;
+               qp->s_sge.total_len = wqe->length;
+               qp->s_len = wqe->length;
+               if (newreq) {
+                       qp->s_tail++;
+                       if (qp->s_tail >= qp->s_size)
+                               qp->s_tail = 0;
+               }
+               if (wqe->wr.opcode == IB_WR_RDMA_READ)
+                       qp->s_psn = wqe->lpsn + 1;
+               else
+                       qp->s_psn++;
+               break;
+
+       case OP(RDMA_READ_RESPONSE_FIRST):
+               /*
+                * qp->s_state is normally set to the opcode of the
+                * last packet constructed for new requests and therefore
+                * is never set to RDMA read response.
+                * RDMA_READ_RESPONSE_FIRST is used by the ACK processing
+                * thread to indicate a SEND needs to be restarted from an
+                * earlier PSN without interfering with the sending thread.
+                * See restart_rc().
+                */
+               qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
+               /* FALLTHROUGH */
+       case OP(SEND_FIRST):
+               qp->s_state = OP(SEND_MIDDLE);
+               /* FALLTHROUGH */
+       case OP(SEND_MIDDLE):
+               bth2 = mask_psn(qp->s_psn++);
+               ss = &qp->s_sge;
+               len = qp->s_len;
+               if (len > pmtu) {
+                       len = pmtu;
+                       middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+                       break;
+               }
+               if (wqe->wr.opcode == IB_WR_SEND) {
+                       qp->s_state = OP(SEND_LAST);
+               } else {
+                       qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
+                       /* Immediate data comes after the BTH */
+                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
+                       hwords += 1;
+               }
+               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                       bth0 |= IB_BTH_SOLICITED;
+               bth2 |= IB_BTH_REQ_ACK;
+               qp->s_cur++;
+               if (qp->s_cur >= qp->s_size)
+                       qp->s_cur = 0;
+               break;
+
+       case OP(RDMA_READ_RESPONSE_LAST):
+               /*
+                * qp->s_state is normally set to the opcode of the
+                * last packet constructed for new requests and therefore
+                * is never set to RDMA read response.
+                * RDMA_READ_RESPONSE_LAST is used by the ACK processing
+                * thread to indicate a RDMA write needs to be restarted from
+                * an earlier PSN without interfering with the sending thread.
+                * See restart_rc().
+                */
+               qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
+               /* FALLTHROUGH */
+       case OP(RDMA_WRITE_FIRST):
+               qp->s_state = OP(RDMA_WRITE_MIDDLE);
+               /* FALLTHROUGH */
+       case OP(RDMA_WRITE_MIDDLE):
+               bth2 = mask_psn(qp->s_psn++);
+               ss = &qp->s_sge;
+               len = qp->s_len;
+               if (len > pmtu) {
+                       len = pmtu;
+                       middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+                       break;
+               }
+               if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
+                       qp->s_state = OP(RDMA_WRITE_LAST);
+               } else {
+                       qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
+                       /* Immediate data comes after the BTH */
+                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
+                       hwords += 1;
+                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                               bth0 |= IB_BTH_SOLICITED;
+               }
+               bth2 |= IB_BTH_REQ_ACK;
+               qp->s_cur++;
+               if (qp->s_cur >= qp->s_size)
+                       qp->s_cur = 0;
+               break;
+
+       case OP(RDMA_READ_RESPONSE_MIDDLE):
+               /*
+                * qp->s_state is normally set to the opcode of the
+                * last packet constructed for new requests and therefore
+                * is never set to RDMA read response.
+                * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing
+                * thread to indicate a RDMA read needs to be restarted from
+                * an earlier PSN without interfering with the sending thread.
+                * See restart_rc().
+                */
+               len = (delta_psn(qp->s_psn, wqe->psn)) * pmtu;
+               ohdr->u.rc.reth.vaddr =
+                       cpu_to_be64(wqe->rdma_wr.remote_addr + len);
+               ohdr->u.rc.reth.rkey =
+                       cpu_to_be32(wqe->rdma_wr.rkey);
+               ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);
+               qp->s_state = OP(RDMA_READ_REQUEST);
+               hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
+               bth2 = mask_psn(qp->s_psn) | IB_BTH_REQ_ACK;
+               qp->s_psn = wqe->lpsn + 1;
+               ss = NULL;
+               len = 0;
+               qp->s_cur++;
+               if (qp->s_cur == qp->s_size)
+                       qp->s_cur = 0;
+               break;
+       }
+       qp->s_sending_hpsn = bth2;
+       delta = delta_psn(bth2, wqe->psn);
+       if (delta && delta % HFI1_PSN_CREDIT == 0)
+               bth2 |= IB_BTH_REQ_ACK;
+       if (qp->s_flags & RVT_S_SEND_ONE) {
+               qp->s_flags &= ~RVT_S_SEND_ONE;
+               qp->s_flags |= RVT_S_WAIT_ACK;
+               bth2 |= IB_BTH_REQ_ACK;
+       }
+       qp->s_len -= len;
+       qp->s_hdrwords = hwords;
+       ps->s_txreq->sde = priv->s_sde;
+       qp->s_cur_sge = ss;
+       qp->s_cur_size = len;
+       hfi1_make_ruc_header(
+               qp,
+               ohdr,
+               bth0 | (qp->s_state << 24),
+               bth2,
+               middle,
+               ps);
+       /* pbc: hdr_dwords is the header word count plus two */
+       ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
+       return 1;
+
+done_free_tx:
+       hfi1_put_txreq(ps->s_txreq);
+       ps->s_txreq = NULL;
+       return 1;
+
+bail:
+       hfi1_put_txreq(ps->s_txreq);
+
+bail_no_tx:
+       ps->s_txreq = NULL;
+       qp->s_flags &= ~RVT_S_BUSY;
+       qp->s_hdrwords = 0;
+       return 0;
+}
+
+/**
+ * hfi1_send_rc_ack - Construct an ACK packet and send it
+ * @rcd: the receive context; its send context (rcd->sc) supplies the PIO
+ *       buffer used for the ACK
+ * @qp: a pointer to the QP
+ * @is_fecn: when non-zero, the BECN bit is set in BTH[1] of the ACK (or
+ *           RVT_S_ECN is set on the QP if the ACK has to be queued)
+ *
+ * This is called from hfi1_rc_rcv() and handle_receive_interrupt().
+ * Note that RDMA reads and atomics are handled in the
+ * send side QP state and tasklet.
+ *
+ * The ACK is not sent directly but handed to the send tasklet (queue_ack)
+ * when RVT_S_RESP_PENDING is set, when s_rdma_ack_cnt is non-zero, or when
+ * no PIO buffer can be allocated.  If the link is not ACTIVE the function
+ * returns without sending or queueing anything.
+ */
+void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp,
+                     int is_fecn)
+{
+       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       u64 pbc, pbc_flags = 0;
+       u16 lrh0;
+       u16 sc5;
+       u32 bth0;
+       u32 hwords;
+       u32 vl, plen;
+       struct send_context *sc;
+       struct pio_buf *pbuf;
+       struct hfi1_ib_header hdr;
+       struct hfi1_other_headers *ohdr;
+       unsigned long flags;
+
+       /* Don't send ACK or NAK if a RDMA read or atomic is pending. */
+       if (qp->s_flags & RVT_S_RESP_PENDING)
+               goto queue_ack;
+
+       /* Ensure s_rdma_ack_cnt changes are committed */
+       smp_read_barrier_depends();
+       if (qp->s_rdma_ack_cnt)
+               goto queue_ack;
+
+       /* Construct the header */
+       /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4 */
+       hwords = 6;
+       if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+               hwords += hfi1_make_grh(ibp, &hdr.u.l.grh,
+                                      &qp->remote_ah_attr.grh, hwords, 0);
+               ohdr = &hdr.u.l.oth;
+               lrh0 = HFI1_LRH_GRH;
+       } else {
+               ohdr = &hdr.u.oth;
+               lrh0 = HFI1_LRH_BTH;
+       }
+       /* read pkey_index w/o lock (it's atomic) */
+       bth0 = hfi1_get_pkey(ibp, qp->s_pkey_index) | (OP(ACKNOWLEDGE) << 24);
+       if (qp->s_mig_state == IB_MIG_MIGRATED)
+               bth0 |= IB_BTH_MIG_REQ;
+       if (qp->r_nak_state)
+               ohdr->u.aeth = cpu_to_be32((qp->r_msn & HFI1_MSN_MASK) |
+                                           (qp->r_nak_state <<
+                                            HFI1_AETH_CREDIT_SHIFT));
+       else
+               ohdr->u.aeth = hfi1_compute_aeth(qp);
+       sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+       /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
+       pbc_flags |= ((!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT);
+       lrh0 |= (sc5 & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4;
+       hdr.lrh[0] = cpu_to_be16(lrh0);
+       hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+       hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
+       hdr.lrh[3] = cpu_to_be16(ppd->lid | qp->remote_ah_attr.src_path_bits);
+       ohdr->bth[0] = cpu_to_be32(bth0);
+       ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
+       ohdr->bth[1] |= cpu_to_be32((!!is_fecn) << HFI1_BECN_SHIFT);
+       ohdr->bth[2] = cpu_to_be32(mask_psn(qp->r_ack_psn));
+
+       /* Don't try to send ACKs if the link isn't ACTIVE */
+       if (driver_lstate(ppd) != IB_PORT_ACTIVE)
+               return;
+
+       sc = rcd->sc;
+       plen = 2 /* PBC */ + hwords;
+       vl = sc_to_vlt(ppd->dd, sc5);
+       pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
+
+       pbuf = sc_buffer_alloc(sc, plen, NULL, NULL);
+       if (!pbuf) {
+               /*
+                * We have no room to send at the moment.  Pass
+                * responsibility for sending the ACK to the send tasklet
+                * so that when enough buffer space becomes available,
+                * the ACK is sent ahead of other outgoing packets.
+                */
+               goto queue_ack;
+       }
+
+       trace_ack_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &hdr);
+
+       /* write the pbc and data */
+       ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, &hdr, hwords);
+
+       return;
+
+queue_ack:
+       this_cpu_inc(*ibp->rvp.rc_qacks);
+       spin_lock_irqsave(&qp->s_lock, flags);
+       qp->s_flags |= RVT_S_ACK_PENDING | RVT_S_RESP_PENDING;
+       qp->s_nak_state = qp->r_nak_state;
+       qp->s_ack_psn = qp->r_ack_psn;
+       if (is_fecn)
+               qp->s_flags |= RVT_S_ECN;
+
+       /* Schedule the send tasklet. */
+       hfi1_schedule_send(qp);
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+}
+
+/**
+ * reset_psn - rewind the QP send state so transmission resumes at a PSN
+ * @qp: the QP
+ * @psn: the packet sequence number to restart at
+ *
+ * Used by hfi1_rc_rcv() while processing an incoming RC ACK for @qp.
+ * Runs at interrupt level; the caller holds the QP s_lock.
+ */
+static void reset_psn(struct rvt_qp *qp, u32 psn)
+{
+       u32 idx = qp->s_acked;
+       struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, idx);
+       u32 opcode;
+
+       qp->s_cur = idx;
+
+       /*
+        * Restarting at or before the first PSN of the oldest un-ACKed
+        * request: the normal send code handles the initialization.
+        */
+       if (cmp_psn(psn, wqe->psn) <= 0) {
+               qp->s_state = OP(SEND_LAST);
+               goto done;
+       }
+
+       /* Walk forward to the work request that contains the given PSN. */
+       opcode = wqe->wr.opcode;
+       while (1) {
+               int order;
+
+               idx++;
+               if (idx == qp->s_size)
+                       idx = 0;
+               if (idx == qp->s_tail)
+                       break;
+               wqe = rvt_get_swqe_ptr(qp, idx);
+               order = cmp_psn(psn, wqe->psn);
+               if (order < 0)
+                       break;
+               qp->s_cur = idx;
+               /*
+                * The PSN is exactly the start of this request, so the
+                * normal send code handles the initialization here too.
+                */
+               if (!order) {
+                       qp->s_state = OP(SEND_LAST);
+                       goto done;
+               }
+               opcode = wqe->wr.opcode;
+       }
+
+       /*
+        * Restart in the middle of a request.  s_sge, s_cur_sge and
+        * s_cur_size are deliberately left untouched.
+        * See hfi1_make_rc_req().
+        */
+       if (opcode == IB_WR_SEND || opcode == IB_WR_SEND_WITH_IMM) {
+               qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
+       } else if (opcode == IB_WR_RDMA_WRITE ||
+                  opcode == IB_WR_RDMA_WRITE_WITH_IMM) {
+               qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
+       } else if (opcode == IB_WR_RDMA_READ) {
+               qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
+       } else {
+               /*
+                * Not expected: the remaining request types use only
+                * one PSN per request.
+                */
+               qp->s_state = OP(SEND_LAST);
+       }
+
+done:
+       qp->s_psn = psn;
+       /*
+        * RVT_S_WAIT_PSN has to be set here because rc_complete() may
+        * start the timer asynchronously before the send tasklet gets
+        * scheduled; setting it in hfi1_make_rc_req() would be too late.
+        */
+       if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0 &&
+           cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
+               qp->s_flags |= RVT_S_WAIT_PSN;
+       qp->s_flags &= ~RVT_S_AHG_VALID;
+}
+
+/*
+ * Rewind the requester so that the last un-ACKed request is sent again,
+ * starting at @psn.  When @wait is non-zero RVT_S_SEND_ONE is set before
+ * the PSN is reset.
+ * The caller holds the QP r_lock and s_lock with interrupts disabled.
+ */
+static void restart_rc(struct rvt_qp *qp, u32 psn, int wait)
+{
+       struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+       struct hfi1_ibport *ibp;
+
+       if (qp->s_retry) {
+               qp->s_retry--;
+       } else if (qp->s_mig_state == IB_MIG_ARMED) {
+               /* Retries exhausted: migrate and start a new retry budget. */
+               hfi1_migrate_qp(qp);
+               qp->s_retry = qp->s_retry_cnt;
+       } else {
+               /*
+                * Retries exhausted and no migration armed.  Fail the
+                * request now unless a delayed completion still has to be
+                * handled (s_last has not caught up with s_acked).
+                */
+               if (qp->s_last == qp->s_acked) {
+                       hfi1_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
+                       rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+               }
+               return;
+       }
+
+       ibp = to_iport(qp->ibqp.device, qp->port_num);
+       if (wqe->wr.opcode != IB_WR_RDMA_READ)
+               ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
+       else
+               ibp->rvp.n_rc_resends++;
+
+       qp->s_flags &= ~(RVT_S_WAIT_FENCE |
+                        RVT_S_WAIT_RDMAR |
+                        RVT_S_WAIT_SSN_CREDIT |
+                        RVT_S_WAIT_PSN |
+                        RVT_S_WAIT_ACK);
+       if (wait)
+               qp->s_flags |= RVT_S_SEND_ONE;
+       reset_psn(qp, psn);
+}
+
+/*
+ * s_timer handler for a missing response: if the timer is still marked
+ * active, count the timeout, restart from the PSN after s_last_psn and
+ * schedule the send engine.
+ */
+void hfi1_rc_timeout(unsigned long arg)
+{
+       struct rvt_qp *qp = (struct rvt_qp *)arg;
+       unsigned long irq_flags;
+
+       spin_lock_irqsave(&qp->r_lock, irq_flags);
+       spin_lock(&qp->s_lock);
+       if (!(qp->s_flags & RVT_S_TIMER))
+               goto unlock;
+
+       to_iport(qp->ibqp.device, qp->port_num)->rvp.n_rc_timeouts++;
+       qp->s_flags &= ~RVT_S_TIMER;
+       del_timer(&qp->s_timer);
+       trace_hfi1_rc_timeout(qp, qp->s_last_psn + 1);
+       restart_rc(qp, qp->s_last_psn + 1, 1);
+       hfi1_schedule_send(qp);
+
+unlock:
+       spin_unlock(&qp->s_lock);
+       spin_unlock_irqrestore(&qp->r_lock, irq_flags);
+}
+
+/*
+ * s_timer handler for RNR timeouts: stop the RNR timer and schedule the
+ * send engine, both under the QP s_lock.
+ */
+void hfi1_rc_rnr_retry(unsigned long arg)
+{
+       unsigned long irq_flags;
+       struct rvt_qp *qp;
+
+       qp = (struct rvt_qp *)arg;
+
+       spin_lock_irqsave(&qp->s_lock, irq_flags);
+       hfi1_stop_rnr_timer(qp);
+       hfi1_schedule_send(qp);
+       spin_unlock_irqrestore(&qp->s_lock, irq_flags);
+}
+
+/*
+ * Advance qp->s_sending_psn to the PSN that follows @psn.
+ * Normally that is psn + 1; for an RDMA read it is the PSN after the
+ * request's last PSN.  s_sending_psn is left unchanged when no work
+ * request between s_last and s_tail covers @psn.
+ */
+static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
+{
+       u32 idx = qp->s_last;
+
+       /* Scan from s_last for the work request that covers the PSN. */
+       do {
+               struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, idx);
+
+               if (cmp_psn(psn, wqe->lpsn) <= 0) {
+                       qp->s_sending_psn =
+                               (wqe->wr.opcode == IB_WR_RDMA_READ) ?
+                               wqe->lpsn + 1 : psn + 1;
+                       return;
+               }
+               idx++;
+               if (idx == qp->s_size)
+                       idx = 0;
+       } while (idx != qp->s_tail);
+}
+
+/*
+ * Post-send bookkeeping for the RC packet whose header is @hdr on @qp.
+ * NOTE(review): presumably invoked once the packet has been handed to or
+ * completed by the send engine -- confirm against the callers.
+ *
+ * Nothing is done unless the QP state allows RVT_PROCESS_OR_FLUSH_SEND.
+ * Packets whose opcode lies in the range RDMA_READ_RESPONSE_FIRST ..
+ * ATOMIC_ACKNOWLEDGE only decrement s_rdma_ack_cnt.  For every other
+ * packet s_sending_psn is moved past the packet's PSN, the retry timer is
+ * started if the packet requested an ACK and un-ACKed requests remain,
+ * and success completions are generated for the SWQEs between s_last and
+ * s_acked that are no longer being sent.  Finally, a pending
+ * RVT_S_WAIT_PSN is cleared and the send engine rescheduled once
+ * s_sending_psn has passed s_sending_hpsn.
+ *
+ * This should be called with the QP s_lock held and interrupts disabled.
+ */
+void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_ib_header *hdr)
+{
+       struct hfi1_other_headers *ohdr;
+       struct rvt_swqe *wqe;
+       struct ib_wc wc;
+       unsigned i;
+       u32 opcode;
+       u32 psn;
+
+       if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND))
+               return;
+
+       /* Find out where the BTH is */
+       if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH)
+               ohdr = &hdr->u.oth;
+       else
+               ohdr = &hdr->u.l.oth;
+
+       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+       if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+           opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
+               WARN_ON(!qp->s_rdma_ack_cnt);
+               qp->s_rdma_ack_cnt--;
+               return;
+       }
+
+       psn = be32_to_cpu(ohdr->bth[2]);
+       reset_sending_psn(qp, psn);
+
+       /*
+        * Start timer after a packet requesting an ACK has been sent and
+        * there are still requests that haven't been acked.
+        */
+       if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
+           !(qp->s_flags &
+               (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
+               (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
+               hfi1_add_retry_timer(qp);
+
+       while (qp->s_last != qp->s_acked) {
+               u32 s_last;
+
+               wqe = rvt_get_swqe_ptr(qp, qp->s_last);
+               if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 &&
+                   cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
+                       break;
+               s_last = qp->s_last;
+               if (++s_last >= qp->s_size)
+                       s_last = 0;
+               qp->s_last = s_last;
+               /* see post_send() */
+               barrier();
+               for (i = 0; i < wqe->wr.num_sge; i++) {
+                       struct rvt_sge *sge = &wqe->sg_list[i];
+
+                       rvt_put_mr(sge->mr);
+               }
+               /* Post a send completion queue entry if requested. */
+               if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
+                   (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
+                       memset(&wc, 0, sizeof(wc));
+                       wc.wr_id = wqe->wr.wr_id;
+                       wc.status = IB_WC_SUCCESS;
+                       wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
+                       wc.byte_len = wqe->length;
+                       wc.qp = &qp->ibqp;
+                       rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, 0);
+               }
+       }
+       /*
+        * If we were waiting for sends to complete before re-sending,
+        * and they are now complete, restart sending.
+        */
+       trace_hfi1_rc_sendcomplete(qp, psn);
+       if (qp->s_flags & RVT_S_WAIT_PSN &&
+           cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
+               qp->s_flags &= ~RVT_S_WAIT_PSN;
+               qp->s_sending_psn = qp->s_psn;
+               qp->s_sending_hpsn = qp->s_psn - 1;
+               hfi1_schedule_send(qp);
+       }
+}
+
+/* Record @psn as the QP's s_last_psn. */
+static inline void update_last_psn(struct rvt_qp *qp, u32 psn)
+{
+       qp->s_last_psn = psn;
+}
+
+/*
+ * Generate a SWQE completion.
+ * This is similar to hfi1_send_complete but has to check to be sure
+ * that the SGEs are not being referenced if the SWQE is being resent.
+ *
+ * @qp: the QP whose send queue entry is being completed
+ * @wqe: the send work queue entry to complete
+ * @ibp: the port; supplies the rc_delayed_comp counter and the SL to SC
+ *       mapping used to pick an SDMA engine
+ *
+ * When the WQE's last PSN is before s_sending_psn, or s_sending_psn is
+ * past s_sending_hpsn, the MR references held by the WQE's SGEs are
+ * dropped, s_last is advanced (with wrap-around) and, if a completion was
+ * requested, a successful completion is entered on the send CQ.
+ * Otherwise the completion is left for later: rc_delayed_comp is bumped
+ * and, if the device has send DMA, the SDMA engine is asked to progress.
+ *
+ * In both cases s_retry is reloaded from s_retry_cnt, s_last_psn is set to
+ * the WQE's last PSN, and s_acked is advanced by one entry (together with
+ * s_cur when the two were equal).
+ *
+ * Returns the WQE at the new s_acked position.
+ */
+static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
+                                        struct rvt_swqe *wqe,
+                                        struct hfi1_ibport *ibp)
+{
+       struct ib_wc wc;
+       unsigned i;
+
+       /*
+        * Don't decrement refcount and don't generate a
+        * completion if the SWQE is being resent until the send
+        * is finished.
+        */
+       if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 ||
+           cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
+               u32 s_last;
+
+               for (i = 0; i < wqe->wr.num_sge; i++) {
+                       struct rvt_sge *sge = &wqe->sg_list[i];
+
+                       rvt_put_mr(sge->mr);
+               }
+               s_last = qp->s_last;
+               if (++s_last >= qp->s_size)
+                       s_last = 0;     /* wrap around the send queue */
+               qp->s_last = s_last;
+               /* see post_send() */
+               barrier();
+               /* Post a send completion queue entry if requested. */
+               if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
+                   (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
+                       memset(&wc, 0, sizeof(wc));
+                       wc.wr_id = wqe->wr.wr_id;
+                       wc.status = IB_WC_SUCCESS;
+                       wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
+                       wc.byte_len = wqe->length;
+                       wc.qp = &qp->ibqp;
+                       rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, 0);
+               }
+       } else {
+               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+               this_cpu_inc(*ibp->rvp.rc_delayed_comp);
+               /*
+                * If send progress not running attempt to progress
+                * SDMA queue.
+                */
+               if (ppd->dd->flags & HFI1_HAS_SEND_DMA) {
+                       struct sdma_engine *engine;
+                       u8 sc5;
+
+                       /* For now use sc to find engine */
+                       sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+                       engine = qp_to_sdma_engine(qp, sc5);
+                       sdma_engine_progress_schedule(engine);
+               }
+       }
+
+       qp->s_retry = qp->s_retry_cnt;  /* reload the retry counter */
+       update_last_psn(qp, wqe->lpsn);
+
+       /*
+        * If we are completing a request which is in the process of
+        * being resent, we can stop re-sending it since we know the
+        * responder has already seen it.
+        */
+       if (qp->s_acked == qp->s_cur) {
+               if (++qp->s_cur >= qp->s_size)
+                       qp->s_cur = 0;
+               qp->s_acked = qp->s_cur;
+               wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
+               if (qp->s_acked != qp->s_tail) {
+                       qp->s_state = OP(SEND_LAST);
+                       qp->s_psn = wqe->psn;   /* resume at the next WQE */
+               }
+       } else {
+               if (++qp->s_acked >= qp->s_size)
+                       qp->s_acked = 0;
+               if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)
+                       qp->s_draining = 0;
+               wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+       }
+       return wqe;
+}
+
+/**
+ * do_rc_ack - process an incoming RC ACK
+ * @qp: the QP the ACK came in on
+ * @aeth: the ACK extended transport header word, in CPU byte order; its
+ *        top three bits select ACK (0), RNR NAK (1) or NAK (3)
+ * @psn: the packet sequence number of the ACK
+ * @opcode: the opcode of the response packet that carried the ACK
+ * @val: the value stored through the first SGE of an atomic request that
+ *       this ACK completes
+ * @rcd: the receive context whose qp_wait_list the QP is added to when a
+ *       read or atomic request has to be restarted
+ *
+ * This is called from rc_rcv_resp() to process an incoming RC ACK
+ * for the given QP.
+ * May be called at interrupt level, with the QP s_lock held.
+ * Returns 1 if OK, 0 if current operation should be aborted (NAK).
+ */
+static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
+                    u64 val, struct hfi1_ctxtdata *rcd)
+{
+       struct hfi1_ibport *ibp;
+       enum ib_wc_status status;
+       struct rvt_swqe *wqe;
+       int ret = 0;
+       u32 ack_psn;
+       int diff;
+       unsigned long to;
+
+       /*
+        * Note that NAKs implicitly ACK outstanding SEND and RDMA write
+        * requests and implicitly NAK RDMA read and atomic requests issued
+        * before the NAK'ed request.  The MSN won't include the NAK'ed
+        * request but will include an ACK'ed request(s).
+        * Hence ack_psn is psn - 1 whenever the AETH is not a plain ACK.
+        */
+       ack_psn = psn;
+       if (aeth >> 29)
+               ack_psn--;
+       wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+       ibp = to_iport(qp->ibqp.device, qp->port_num);
+
+       /*
+        * The MSN might be for a later WQE than the PSN indicates so
+        * only complete WQEs that the PSN finishes.
+        */
+       while ((diff = delta_psn(ack_psn, wqe->lpsn)) >= 0) {
+               /*
+                * RDMA_READ_RESPONSE_ONLY is a special case since
+                * we want to generate completion events for everything
+                * before the RDMA read, copy the data, then generate
+                * the completion for the read.
+                */
+               if (wqe->wr.opcode == IB_WR_RDMA_READ &&
+                   opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
+                   diff == 0) {
+                       ret = 1;
+                       goto bail_stop;
+               }
+               /*
+                * If this request is a RDMA read or atomic, and the ACK is
+                * for a later operation, this ACK NAKs the RDMA read or
+                * atomic.  In other words, only a RDMA_READ_LAST or ONLY
+                * can ACK a RDMA read and likewise for atomic ops.  Note
+                * that the NAK case can only happen if relaxed ordering is
+                * used and requests are sent after an RDMA read or atomic
+                * is sent but before the response is received.
+                */
+               if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
+                    (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
+                   ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+                     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
+                    (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
+                       /* Retry this request. */
+                       if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
+                               qp->r_flags |= RVT_R_RDMAR_SEQ;
+                               restart_rc(qp, qp->s_last_psn + 1, 0);
+                               if (list_empty(&qp->rspwait)) {
+                                       qp->r_flags |= RVT_R_RSP_SEND;
+                                       atomic_inc(&qp->refcount);
+                                       list_add_tail(&qp->rspwait,
+                                                     &rcd->qp_wait_list);
+                               }
+                       }
+                       /*
+                        * No need to process the ACK/NAK since we are
+                        * restarting an earlier request.
+                        */
+                       goto bail_stop;
+               }
+               if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+                   wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
+                       u64 *vaddr = wqe->sg_list[0].vaddr;
+                       *vaddr = val;   /* hand the atomic result to the user */
+               }
+               if (qp->s_num_rd_atomic &&
+                   (wqe->wr.opcode == IB_WR_RDMA_READ ||
+                    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+                    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
+                       qp->s_num_rd_atomic--;
+                       /* Restart sending task if fence is complete */
+                       if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
+                           !qp->s_num_rd_atomic) {
+                               qp->s_flags &= ~(RVT_S_WAIT_FENCE |
+                                                RVT_S_WAIT_ACK);
+                               hfi1_schedule_send(qp);
+                       } else if (qp->s_flags & RVT_S_WAIT_RDMAR) {
+                               qp->s_flags &= ~(RVT_S_WAIT_RDMAR |
+                                                RVT_S_WAIT_ACK);
+                               hfi1_schedule_send(qp);
+                       }
+               }
+               wqe = do_rc_completion(qp, wqe, ibp);
+               if (qp->s_acked == qp->s_tail)
+                       break;
+       }
+
+       switch (aeth >> 29) {   /* top 3 AETH bits: 0 ACK, 1 RNR NAK, 3 NAK */
+       case 0:         /* ACK */
+               this_cpu_inc(*ibp->rvp.rc_acks);
+               if (qp->s_acked != qp->s_tail) {
+                       /*
+                        * We are expecting more ACKs so
+                        * mod the retry timer.
+                        */
+                       hfi1_mod_retry_timer(qp);
+                       /*
+                        * We can stop re-sending the earlier packets and
+                        * continue with the next packet the receiver wants.
+                        */
+                       if (cmp_psn(qp->s_psn, psn) <= 0)
+                               reset_psn(qp, psn + 1);
+               } else {
+                       /* No more acks - kill all timers */
+                       hfi1_stop_rc_timers(qp);
+                       if (cmp_psn(qp->s_psn, psn) <= 0) {
+                               qp->s_state = OP(SEND_LAST);
+                               qp->s_psn = psn + 1;
+                       }
+               }
+               if (qp->s_flags & RVT_S_WAIT_ACK) {
+                       qp->s_flags &= ~RVT_S_WAIT_ACK;
+                       hfi1_schedule_send(qp);
+               }
+               hfi1_get_credit(qp, aeth);
+               qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+               qp->s_retry = qp->s_retry_cnt;
+               update_last_psn(qp, psn);
+               return 1;
+
+       case 1:         /* RNR NAK */
+               ibp->rvp.n_rnr_naks++;
+               if (qp->s_acked == qp->s_tail)
+                       goto bail_stop;
+               if (qp->s_flags & RVT_S_WAIT_RNR)
+                       goto bail_stop;
+               if (qp->s_rnr_retry == 0) {
+                       status = IB_WC_RNR_RETRY_EXC_ERR;
+                       goto class_b;   /* shared with the fatal NAK codes */
+               }
+               if (qp->s_rnr_retry_cnt < 7)
+                       qp->s_rnr_retry--; /* 7 presumably = unlimited; confirm */
+
+               /* The last valid PSN is the previous PSN. */
+               update_last_psn(qp, psn - 1);
+
+               ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
+
+               reset_psn(qp, psn);
+
+               qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK);
+               hfi1_stop_rc_timers(qp);
+               to =
+                       ib_hfi1_rnr_table[(aeth >> HFI1_AETH_CREDIT_SHIFT) &
+                                          HFI1_AETH_CREDIT_MASK];
+               hfi1_add_rnr_timer(qp, to);     /* delay from the RNR table */
+               return 0;
+
+       case 3:         /* NAK */
+               if (qp->s_acked == qp->s_tail)
+                       goto bail_stop;
+               /* The last valid PSN is the previous PSN. */
+               update_last_psn(qp, psn - 1);
+               switch ((aeth >> HFI1_AETH_CREDIT_SHIFT) &
+                       HFI1_AETH_CREDIT_MASK) {
+               case 0: /* PSN sequence error */
+                       ibp->rvp.n_seq_naks++;
+                       /*
+                        * Back up to the responder's expected PSN.
+                        * Note that we might get a NAK in the middle of an
+                        * RDMA READ response which terminates the RDMA
+                        * READ.
+                        */
+                       restart_rc(qp, psn, 0);
+                       hfi1_schedule_send(qp);
+                       break;
+
+               case 1: /* Invalid Request */
+                       status = IB_WC_REM_INV_REQ_ERR;
+                       ibp->rvp.n_other_naks++;
+                       goto class_b;
+
+               case 2: /* Remote Access Error */
+                       status = IB_WC_REM_ACCESS_ERR;
+                       ibp->rvp.n_other_naks++;
+                       goto class_b;
+
+               case 3: /* Remote Operation Error */
+                       status = IB_WC_REM_OP_ERR;
+                       ibp->rvp.n_other_naks++;
+class_b:
+                       if (qp->s_last == qp->s_acked) {
+                               hfi1_send_complete(qp, wqe, status);
+                               rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+                       }
+                       break;
+
+               default:
+                       /* Ignore other reserved NAK error codes */
+                       goto reserved;
+               }
+               qp->s_retry = qp->s_retry_cnt;
+               qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+               goto bail_stop;
+
+       default:                /* 2: reserved */
+reserved:
+               /* Ignore reserved NAK codes. */
+               goto bail_stop;
+       }
+       /* cannot be reached  */
+bail_stop:
+       hfi1_stop_rc_timers(qp);        /* all exits but ACK and RNR retry */
+       return ret;
+}
+
+/*
+ * We have seen an out of sequence RDMA read middle or last packet.
+ * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE.
+ *
+ * @qp: the QP the response arrived on
+ * @ibp: the port; passed to do_rc_completion() and owner of n_rdma_seq
+ * @psn: the PSN of the out of sequence response
+ * @rcd: the receive context whose qp_wait_list the QP is added to
+ *
+ * After the leading WQEs have been completed, RVT_R_RDMAR_SEQ is set,
+ * restart_rc() is called for s_last_psn + 1 and, unless the QP is already
+ * on a response wait list, it is queued with RVT_R_RSP_SEND and an extra
+ * QP reference.
+ */
+static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn,
+                        struct hfi1_ctxtdata *rcd)
+{
+       struct rvt_swqe *wqe;
+
+       /* Remove QP from retry timer */
+       hfi1_stop_rc_timers(qp);
+
+       wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+
+       while (cmp_psn(psn, wqe->lpsn) > 0) {
+               if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+                   wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+                   wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
+                       break;
+               wqe = do_rc_completion(qp, wqe, ibp);
+       }
+
+       ibp->rvp.n_rdma_seq++;
+       qp->r_flags |= RVT_R_RDMAR_SEQ;
+       restart_rc(qp, qp->s_last_psn + 1, 0);
+       if (list_empty(&qp->rspwait)) {
+               qp->r_flags |= RVT_R_RSP_SEND;
+               atomic_inc(&qp->refcount);
+               list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+       }
+}
+
+/**
+ * rc_rcv_resp - process an incoming RC response packet
+ * @ibp: the port this packet came in on
+ * @ohdr: the other headers for this packet
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP for this packet
+ * @opcode: the opcode for this packet
+ * @psn: the packet sequence number for this packet
+ * @hdrsize: the header length
+ * @pmtu: the path MTU
+ * @rcd: the receive context, passed on to do_rc_ack() and rdma_seq_err()
+ *
+ * This is called from hfi1_rc_rcv() to process an incoming RC response
+ * packet for the given QP.
+ * Called at interrupt level.
+ *
+ * The QP s_lock is taken on entry.  Only the read_middle path drops it
+ * before copying the payload; the read_last path copies the payload and
+ * calls do_rc_ack() with the lock still held.
+ *
+ * Responses at or beyond s_next_psn, responses whose PSN is not after
+ * s_last_psn, and responses arriving while s_acked == s_tail are dropped.
+ * Opcode and length errors are reported through hfi1_send_complete() and
+ * rvt_error_qp(), but only when s_last == s_acked.
+ */
+static void rc_rcv_resp(struct hfi1_ibport *ibp,
+                       struct hfi1_other_headers *ohdr,
+                       void *data, u32 tlen, struct rvt_qp *qp,
+                       u32 opcode, u32 psn, u32 hdrsize, u32 pmtu,
+                       struct hfi1_ctxtdata *rcd)
+{
+       struct rvt_swqe *wqe;
+       enum ib_wc_status status;
+       unsigned long flags;
+       int diff;
+       u32 pad;
+       u32 aeth;
+       u64 val;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+
+       trace_hfi1_rc_ack(qp, psn);
+
+       /* Ignore invalid responses. */
+       smp_read_barrier_depends(); /* see post_one_send */
+       if (cmp_psn(psn, ACCESS_ONCE(qp->s_next_psn)) >= 0)
+               goto ack_done;
+
+       /* Ignore duplicate responses. */
+       diff = cmp_psn(psn, qp->s_last_psn);
+       if (unlikely(diff <= 0)) {
+               /* Update credits for "ghost" ACKs */
+               if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
+                       aeth = be32_to_cpu(ohdr->u.aeth);
+                       if ((aeth >> 29) == 0)
+                               hfi1_get_credit(qp, aeth);
+               }
+               goto ack_done;
+       }
+
+       /*
+        * Skip everything other than the PSN we expect, if we are waiting
+        * for a reply to a restarted RDMA read or atomic op.
+        */
+       if (qp->r_flags & RVT_R_RDMAR_SEQ) {
+               if (cmp_psn(psn, qp->s_last_psn + 1) != 0)
+                       goto ack_done;
+               qp->r_flags &= ~RVT_R_RDMAR_SEQ;
+       }
+
+       if (unlikely(qp->s_acked == qp->s_tail))
+               goto ack_done;  /* nothing is waiting for a response */
+       wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+       status = IB_WC_SUCCESS;
+
+       switch (opcode) {
+       case OP(ACKNOWLEDGE):
+       case OP(ATOMIC_ACKNOWLEDGE):
+       case OP(RDMA_READ_RESPONSE_FIRST):
+               aeth = be32_to_cpu(ohdr->u.aeth);
+               if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
+                       __be32 *p = ohdr->u.at.atomic_ack_eth;
+
+                       val = ((u64)be32_to_cpu(p[0]) << 32) |
+                               be32_to_cpu(p[1]);
+               } else {
+                       val = 0;
+               }
+               if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) ||
+                   opcode != OP(RDMA_READ_RESPONSE_FIRST))
+                       goto ack_done;
+               wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+               if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+                       goto ack_op_err;
+               /*
+                * If this is a response to a resent RDMA read, we
+                * have to be careful to copy the data to the right
+                * location.
+                */
+               qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
+                                                 wqe, psn, pmtu);
+               goto read_middle;
+
+       case OP(RDMA_READ_RESPONSE_MIDDLE):
+               /* no AETH, no ACK */
+               if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
+                       goto ack_seq_err;
+               if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+                       goto ack_op_err;
+read_middle:
+               if (unlikely(tlen != (hdrsize + pmtu + 4)))
+                       goto ack_len_err;
+               if (unlikely(pmtu >= qp->s_rdma_read_len))
+                       goto ack_len_err;
+
+               /*
+                * We got a response so update the timeout.
+                * 4.096 usec. * (1 << qp->timeout)
+                */
+               qp->s_flags |= RVT_S_TIMER;
+               mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies);
+               if (qp->s_flags & RVT_S_WAIT_ACK) {
+                       qp->s_flags &= ~RVT_S_WAIT_ACK;
+                       hfi1_schedule_send(qp);
+               }
+
+               if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
+                       qp->s_retry = qp->s_retry_cnt;
+
+               /*
+                * Update the RDMA receive state but do the copy w/o
+                * holding the locks and blocking interrupts.
+                */
+               qp->s_rdma_read_len -= pmtu;
+               update_last_psn(qp, psn);
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+               hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0, 0);
+               goto bail;      /* s_lock was already released above */
+
+       case OP(RDMA_READ_RESPONSE_ONLY):
+               aeth = be32_to_cpu(ohdr->u.aeth);
+               if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
+                       goto ack_done;
+               /* Get the number of bytes the message was padded by. */
+               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+               /*
+                * Check that the data size is >= 0 && <= pmtu.
+                * Remember to account for ICRC (4).
+                */
+               if (unlikely(tlen < (hdrsize + pad + 4)))
+                       goto ack_len_err;
+               /*
+                * If this is a response to a resent RDMA read, we
+                * have to be careful to copy the data to the right
+                * location.
+                */
+               wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+               qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
+                                                 wqe, psn, pmtu);
+               goto read_last;
+
+       case OP(RDMA_READ_RESPONSE_LAST):
+               /* ACKs READ req. */
+               if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
+                       goto ack_seq_err;
+               if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+                       goto ack_op_err;
+               /* Get the number of bytes the message was padded by. */
+               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+               /*
+                * Check that the data size is >= 1 && <= pmtu.
+                * Remember to account for ICRC (4).
+                */
+               if (unlikely(tlen <= (hdrsize + pad + 4)))
+                       goto ack_len_err;
+read_last:
+               tlen -= hdrsize + pad + 4;      /* tlen is now payload bytes */
+               if (unlikely(tlen != qp->s_rdma_read_len))
+                       goto ack_len_err;
+               aeth = be32_to_cpu(ohdr->u.aeth);
+               hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0, 0);
+               WARN_ON(qp->s_rdma_read_sge.num_sge);
+               (void)do_rc_ack(qp, aeth, psn,
+                                OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
+               goto ack_done;
+       }
+
+ack_op_err:    /* also reached by opcodes the switch does not handle */
+       status = IB_WC_LOC_QP_OP_ERR;
+       goto ack_err;
+
+ack_seq_err:
+       rdma_seq_err(qp, ibp, psn, rcd);
+       goto ack_done;
+
+ack_len_err:
+       status = IB_WC_LOC_LEN_ERR;
+ack_err:
+       if (qp->s_last == qp->s_acked) {
+               hfi1_send_complete(qp, wqe, status);
+               rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+       }
+ack_done:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+bail:
+       return;
+}
+
+/*
+ * Queue the QP on the receive context's qp_wait_list with RVT_R_RSP_NAK
+ * set, taking a QP reference for the time it is on the list.
+ * Does nothing when the QP is already linked on a response wait list.
+ */
+static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd,
+                                 struct rvt_qp *qp)
+{
+       /* Already queued: keep the existing entry and its flags. */
+       if (!list_empty(&qp->rspwait))
+               return;
+
+       qp->r_flags |= RVT_R_RSP_NAK;
+       atomic_inc(&qp->refcount);
+       list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+}
+
+/*
+ * Reset the private r_adefered field and, if the QP is linked on a
+ * response wait list, unlink it, clear RVT_R_RSP_NAK and drop one QP
+ * reference (the counterpart of the one rc_defered_ack() takes), waking
+ * qp->wait when that was the last reference.
+ */
+static inline void rc_cancel_ack(struct rvt_qp *qp)
+{
+       struct hfi1_qp_priv *qpriv = qp->priv;
+
+       qpriv->r_adefered = 0;
+
+       if (!list_empty(&qp->rspwait)) {
+               list_del_init(&qp->rspwait);
+               qp->r_flags &= ~RVT_R_RSP_NAK;
+               if (atomic_dec_and_test(&qp->refcount))
+                       wake_up(&qp->wait);
+       }
+}
+
+/**
+ * rc_rcv_error - process an incoming duplicate or error RC packet
+ * @ohdr: the other headers for this packet
+ * @data: the packet data (not used by this function)
+ * @qp: the QP for this packet
+ * @opcode: the opcode for this packet
+ * @psn: the packet sequence number for this packet
+ * @diff: the difference between the PSN and the expected PSN
+ * @rcd: the receive context, used to queue a deferred sequence NAK
+ *
+ * This is called from hfi1_rc_rcv() to process an unexpected
+ * incoming RC packet for the given QP.
+ * Called at interrupt level.
+ *
+ * A positive @diff is handled as a sequence error without taking s_lock.
+ * Anything else is handled as a duplicate request: the ack queue is
+ * searched and updated under s_lock.
+ *
+ * Return 1 if no more processing is needed; otherwise return 0 to
+ * schedule a response to be sent.
+ */
+static noinline int rc_rcv_error(struct hfi1_other_headers *ohdr, void *data,
+                                struct rvt_qp *qp, u32 opcode, u32 psn,
+                                int diff, struct hfi1_ctxtdata *rcd)
+{
+       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+       struct rvt_ack_entry *e;
+       unsigned long flags;
+       u8 i, prev;
+       int old_req;
+
+       trace_hfi1_rc_rcv_error(qp, psn);
+       if (diff > 0) {
+               /*
+                * Packet sequence error.
+                * A NAK will ACK earlier sends and RDMA writes.
+                * Don't queue the NAK if we already sent one.
+                */
+               if (!qp->r_nak_state) {
+                       ibp->rvp.n_rc_seqnak++;
+                       qp->r_nak_state = IB_NAK_PSN_ERROR;
+                       /* Use the expected PSN. */
+                       qp->r_ack_psn = qp->r_psn;
+                       /*
+                        * Wait to send the sequence NAK until all packets
+                        * in the receive queue have been processed.
+                        * Otherwise, we end up propagating congestion.
+                        */
+                       rc_defered_ack(rcd, qp);
+               }
+               goto done;
+       }
+
+       /*
+        * Handle a duplicate request.  Don't re-execute SEND, RDMA
+        * write or atomic op.  Don't NAK errors, just silently drop
+        * the duplicate request.  Note that r_sge, r_len, and
+        * r_rcv_len may be in use so don't modify them.
+        *
+        * We are supposed to ACK the earliest duplicate PSN but we
+        * can coalesce an outstanding duplicate ACK.  We have to
+        * send the earliest so that RDMA reads can be restarted at
+        * the requester's expected PSN.
+        *
+        * First, find where this duplicate PSN falls within the
+        * ACKs previously sent.
+        * old_req is true if there is an older response that is scheduled
+        * to be sent before sending this one.
+        *
+        * The scan below walks the ack queue backwards from
+        * r_head_ack_queue, wrapping from slot 0 to slot
+        * HFI1_MAX_RDMA_ATOMIC.  It ends with e == NULL when it gets back
+        * to the head or meets an entry whose opcode is 0, and otherwise
+        * with e pointing at the first entry whose starting PSN is not
+        * after the duplicate PSN.
+        */
+       e = NULL;
+       old_req = 1;
+       ibp->rvp.n_rc_dupreq++;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+
+       for (i = qp->r_head_ack_queue; ; i = prev) {
+               if (i == qp->s_tail_ack_queue)
+                       old_req = 0;
+               if (i)
+                       prev = i - 1;
+               else
+                       prev = HFI1_MAX_RDMA_ATOMIC;
+               if (prev == qp->r_head_ack_queue) {
+                       e = NULL;
+                       break;
+               }
+               e = &qp->s_ack_queue[prev];
+               if (!e->opcode) {
+                       e = NULL;
+                       break;
+               }
+               if (cmp_psn(psn, e->psn) >= 0) {
+                       if (prev == qp->s_tail_ack_queue &&
+                           cmp_psn(psn, e->lpsn) <= 0)
+                               old_req = 0;
+                       break;
+               }
+       }
+       switch (opcode) {
+       case OP(RDMA_READ_REQUEST): {
+               struct ib_reth *reth;
+               u32 offset;
+               u32 len;
+
+               /*
+                * If we didn't find the RDMA read request in the ack queue,
+                * we can ignore this request.
+                */
+               if (!e || e->opcode != OP(RDMA_READ_REQUEST))
+                       goto unlock_done;
+               /* RETH comes after BTH */
+               reth = &ohdr->u.rc.reth;
+               /*
+                * Address range must be a subset of the original
+                * request and start on pmtu boundaries.
+                * We reuse the old ack_queue slot since the requester
+                * should not back up and request an earlier PSN for the
+                * same request.
+                */
+               offset = delta_psn(psn, e->psn) * qp->pmtu;
+               len = be32_to_cpu(reth->length);
+               if (unlikely(offset + len != e->rdma_sge.sge_length))
+                       goto unlock_done;
+               if (e->rdma_sge.mr) {
+                       rvt_put_mr(e->rdma_sge.mr);
+                       e->rdma_sge.mr = NULL;
+               }
+               if (len != 0) {
+                       u32 rkey = be32_to_cpu(reth->rkey);
+                       u64 vaddr = be64_to_cpu(reth->vaddr);
+                       int ok;
+
+                       ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
+                                        IB_ACCESS_REMOTE_READ);
+                       if (unlikely(!ok))
+                               goto unlock_done;
+               } else {
+                       e->rdma_sge.vaddr = NULL;
+                       e->rdma_sge.length = 0;
+                       e->rdma_sge.sge_length = 0;
+               }
+               e->psn = psn;
+               if (old_req)
+                       goto unlock_done;
+               qp->s_tail_ack_queue = prev;
+               break;
+       }
+
+       case OP(COMPARE_SWAP):
+       case OP(FETCH_ADD): {
+               /*
+                * If we didn't find the atomic request in the ack queue
+                * or the send tasklet is already backed up to send an
+                * earlier entry, we can ignore this request.
+                */
+               if (!e || e->opcode != (u8)opcode || old_req)
+                       goto unlock_done;
+               qp->s_tail_ack_queue = prev;
+               break;
+       }
+
+       default:
+               /*
+                * Ignore this operation if it doesn't request an ACK
+                * or an earlier RDMA read or atomic is going to be resent.
+                * NOTE(review): the test looks for IB_BTH_REQ_ACK in psn, so
+                * psn presumably still carries the BTH flag bits - confirm
+                * against the caller.
+                */
+               if (!(psn & IB_BTH_REQ_ACK) || old_req)
+                       goto unlock_done;
+               /*
+                * Resend the most recent ACK if this request is
+                * after all the previous RDMA reads and atomics.
+                */
+               if (i == qp->r_head_ack_queue) {
+                       spin_unlock_irqrestore(&qp->s_lock, flags);
+                       qp->r_nak_state = 0;
+                       qp->r_ack_psn = qp->r_psn - 1;
+                       goto send_ack;  /* s_lock was released above */
+               }
+
+               /*
+                * Resend the RDMA read or atomic op which
+                * ACKs this duplicate request.
+                */
+               qp->s_tail_ack_queue = i;
+               break;
+       }
+       qp->s_ack_state = OP(ACKNOWLEDGE);
+       qp->s_flags |= RVT_S_RESP_PENDING;
+       qp->r_nak_state = 0;
+       hfi1_schedule_send(qp);
+
+unlock_done:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+       return 1;
+
+send_ack:
+       return 0;       /* caller should send the ACK set up above */
+}
+
+/*
+ * Call rvt_error_qp() on @qp with status @err while holding s_lock.
+ * When it returns non-zero, an IB_EVENT_QP_LAST_WQE_REACHED event is
+ * passed to the QP's event handler after the lock has been released.
+ */
+void hfi1_rc_error(struct rvt_qp *qp, enum ib_wc_status err)
+{
+       struct ib_event ev;
+       unsigned long flags;
+       int lastwqe;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+       lastwqe = rvt_error_qp(qp, err);
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+
+       if (!lastwqe)
+               return;
+
+       /* Notify the consumer outside of s_lock. */
+       ev.device = qp->ibqp.device;
+       ev.element.qp = &qp->ibqp;
+       ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+       qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+}
+
+/*
+ * Move s_tail_ack_queue to the slot after @n, wrapping to slot 0 once
+ * the index would exceed HFI1_MAX_RDMA_ATOMIC, and set the ACK state
+ * back to OP(ACKNOWLEDGE).
+ */
+static inline void update_ack_queue(struct rvt_qp *qp, unsigned n)
+{
+       unsigned following = n + 1;
+
+       qp->s_tail_ack_queue =
+               (following > HFI1_MAX_RDMA_ATOMIC) ? 0 : following;
+       qp->s_ack_state = OP(ACKNOWLEDGE);
+}
+
+/*
+ * Record a congestion event for service level @sl in the port's
+ * cc_events log, which is used as a ring of OPA_CONG_LOG_ELEMS entries.
+ * The SL's bit in threshold_cong_event_map is set and
+ * threshold_event_counter is incremented as well.  All updates are made
+ * under cc_log_lock.  Calls with @sl >= OPA_MAX_SLS are ignored.
+ */
+static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid,
+                         u32 lqpn, u32 rqpn, u8 svc_type)
+{
+       struct opa_hfi1_cong_log_event_internal *entry;
+       unsigned long flags;
+
+       if (sl >= OPA_MAX_SLS)
+               return;
+
+       spin_lock_irqsave(&ppd->cc_log_lock, flags);
+
+       ppd->threshold_cong_event_map[sl / 8] |= 1 << (sl % 8);
+       ppd->threshold_event_counter++;
+
+       /* Claim the current slot, then advance the index with wrap. */
+       entry = &ppd->cc_events[ppd->cc_log_idx];
+       if (++ppd->cc_log_idx == OPA_CONG_LOG_ELEMS)
+               ppd->cc_log_idx = 0;
+
+       entry->lqpn = lqpn & RVT_QPN_MASK;
+       entry->rqpn = rqpn & RVT_QPN_MASK;
+       entry->sl = sl;
+       entry->svc_type = svc_type;
+       entry->rlid = rlid;
+       /* keep timestamp in units of 1.024 usec */
+       entry->timestamp = ktime_to_ns(ktime_get()) / 1024;
+
+       spin_unlock_irqrestore(&ppd->cc_log_lock, flags);
+}
+
+/*
+ * React to a BECN for service level @sl on port @ppd:
+ *   1) raise the SL's CCTI by the configured increase, saturating at the
+ *      table's ccti_limit,
+ *   2) re-select the IPG via set_link_ipg() when the CCTI was raised,
+ *   3) start the SL's CCA hrtimer unless it is already active.
+ * Afterwards, if a non-zero trigger threshold is configured and the CCTI
+ * has reached it, the event is logged with log_cca_event().
+ *
+ * Nothing is done when @sl is out of range or no congestion control state
+ * is available.  @rlid, @lqpn, @rqpn and @svc_type are only used for the
+ * log entry.
+ */
+void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
+                 u32 rqpn, u8 svc_type)
+{
+       struct cc_state *cc_state;
+       struct cca_timer *timer;
+       unsigned long flags;
+       u16 limit, incr, period, cur;
+       u8 threshold;
+
+       if (sl >= OPA_MAX_SLS)
+               return;
+
+       cc_state = get_cc_state(ppd);
+       if (!cc_state)
+               return;
+
+       /* Read the per-SL settings before taking the timer lock. */
+       limit = cc_state->cct.ccti_limit;
+       incr = cc_state->cong_setting.entries[sl].ccti_increase;
+       period = cc_state->cong_setting.entries[sl].ccti_timer;
+       threshold = cc_state->cong_setting.entries[sl].trigger_threshold;
+
+       spin_lock_irqsave(&ppd->cca_timer_lock, flags);
+
+       timer = &ppd->cca_timer[sl];
+       if (timer->ccti < limit) {
+               /* Saturate at the limit instead of stepping past it. */
+               if (timer->ccti + incr > limit)
+                       timer->ccti = limit;
+               else
+                       timer->ccti += incr;
+               set_link_ipg(ppd);
+       }
+
+       cur = timer->ccti;
+
+       if (!hrtimer_active(&timer->hrtimer)) {
+               /* ccti_timer is in units of 1.024 usec */
+               unsigned long nsec = 1024 * period;
+
+               hrtimer_start(&timer->hrtimer, ns_to_ktime(nsec),
+                             HRTIMER_MODE_REL);
+       }
+
+       spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
+
+       /* Log outside of the timer lock; log_cca_event() has its own lock. */
+       if (threshold != 0 && cur >= threshold)
+               log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type);
+}
+
+/**
+ * hfi1_rc_rcv - process an incoming RC packet
+ * @packet: the received packet; supplies the receive context (rcd), the
+ *          headers (hdr, ohdr), the receive flags (rcv_flags), the payload
+ *          (ebuf), the total and header lengths (tlen, hlen) and the
+ *          destination QP (qp)
+ *
+ * This is called from qp_rcv() to process an incoming RC packet
+ * for the given QP.
+ * May be called at interrupt level.
+ */
+void hfi1_rc_rcv(struct hfi1_packet *packet)
+{
+       struct hfi1_ctxtdata *rcd = packet->rcd;
+       struct hfi1_ib_header *hdr = packet->hdr;
+       u32 rcv_flags = packet->rcv_flags;
+       void *data = packet->ebuf;
+       u32 tlen = packet->tlen;
+       struct rvt_qp *qp = packet->qp;
+       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       struct hfi1_other_headers *ohdr = packet->ohdr;
+       u32 bth0, opcode;
+       u32 hdrsize = packet->hlen;
+       u32 psn;
+       u32 pad;
+       struct ib_wc wc;
+       u32 pmtu = qp->pmtu;
+       int diff;
+       struct ib_reth *reth;
+       unsigned long flags;
+       u32 bth1;
+       int ret, is_fecn = 0;
+       int copy_last = 0;
+
+       bth0 = be32_to_cpu(ohdr->bth[0]);
+       if (hfi1_ruc_check_hdr(ibp, hdr, rcv_flags & HFI1_HAS_GRH, qp, bth0))
+               return;
+
+       /*
+        * Congestion notification bits: act on a BECN right away and
+        * remember a FECN so it can be passed to hfi1_send_rc_ack() below.
+        */
+       bth1 = be32_to_cpu(ohdr->bth[1]);
+       if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) {
+               if (bth1 & HFI1_BECN_SMASK) {
+                       u16 rlid = qp->remote_ah_attr.dlid;
+                       u32 lqpn, rqpn;
+
+                       lqpn = qp->ibqp.qp_num;
+                       rqpn = qp->remote_qpn;
+                       process_becn(
+                               ppd,
+                               qp->remote_ah_attr.sl,
+                               rlid, lqpn, rqpn,
+                               IB_CC_SVCTYPE_RC);
+               }
+               is_fecn = bth1 & HFI1_FECN_SMASK;
+       }
+
+       psn = be32_to_cpu(ohdr->bth[2]);
+       opcode = (bth0 >> 24) & 0xff;
+
+       /*
+        * Process responses (ACKs) before anything else.  Note that the
+        * packet sequence number will be for something in the send work
+        * queue rather than the expected receive packet sequence number.
+        * In other words, this QP is the requester.
+        */
+       if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+           opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
+               rc_rcv_resp(ibp, ohdr, data, tlen, qp, opcode, psn,
+                           hdrsize, pmtu, rcd);
+               /* A FECN-marked response still gets an ACK sent back. */
+               if (is_fecn)
+                       goto send_ack;
+               return;
+       }
+
+       /* Compute 24 bits worth of difference. */
+       diff = delta_psn(psn, qp->r_psn);
+       if (unlikely(diff)) {
+               if (rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd))
+                       return;
+               goto send_ack;
+       }
+
+       /* Check for opcode sequence errors. */
+       switch (qp->r_state) {
+       case OP(SEND_FIRST):
+       case OP(SEND_MIDDLE):
+               if (opcode == OP(SEND_MIDDLE) ||
+                   opcode == OP(SEND_LAST) ||
+                   opcode == OP(SEND_LAST_WITH_IMMEDIATE))
+                       break;
+               goto nack_inv;
+
+       case OP(RDMA_WRITE_FIRST):
+       case OP(RDMA_WRITE_MIDDLE):
+               if (opcode == OP(RDMA_WRITE_MIDDLE) ||
+                   opcode == OP(RDMA_WRITE_LAST) ||
+                   opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+                       break;
+               goto nack_inv;
+
+       default:
+               if (opcode == OP(SEND_MIDDLE) ||
+                   opcode == OP(SEND_LAST) ||
+                   opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
+                   opcode == OP(RDMA_WRITE_MIDDLE) ||
+                   opcode == OP(RDMA_WRITE_LAST) ||
+                   opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+                       goto nack_inv;
+               /*
+                * Note that it is up to the requester to not send a new
+                * RDMA read or atomic operation before receiving an ACK
+                * for the previous operation.
+                */
+               break;
+       }
+
+       if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
+               qp_comm_est(qp);
+
+       /* OK, process the packet. */
+       switch (opcode) {
+       case OP(SEND_FIRST):
+               ret = hfi1_rvt_get_rwqe(qp, 0);
+               if (ret < 0)
+                       goto nack_op_err;
+               if (!ret)
+                       goto rnr_nak;
+               qp->r_rcv_len = 0;
+               /* FALLTHROUGH */
+       case OP(SEND_MIDDLE):
+       case OP(RDMA_WRITE_MIDDLE):
+send_middle:
+               /* Check for invalid length PMTU or posted rwqe len. */
+               if (unlikely(tlen != (hdrsize + pmtu + 4)))
+                       goto nack_inv;
+               qp->r_rcv_len += pmtu;
+               if (unlikely(qp->r_rcv_len > qp->r_len))
+                       goto nack_inv;
+               hfi1_copy_sge(&qp->r_sge, data, pmtu, 1, 0);
+               break;
+
+       case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
+               /* consume RWQE */
+               ret = hfi1_rvt_get_rwqe(qp, 1);
+               if (ret < 0)
+                       goto nack_op_err;
+               if (!ret)
+                       goto rnr_nak;
+               goto send_last_imm;
+
+       case OP(SEND_ONLY):
+       case OP(SEND_ONLY_WITH_IMMEDIATE):
+               ret = hfi1_rvt_get_rwqe(qp, 0);
+               if (ret < 0)
+                       goto nack_op_err;
+               if (!ret)
+                       goto rnr_nak;
+               qp->r_rcv_len = 0;
+               if (opcode == OP(SEND_ONLY))
+                       goto no_immediate_data;
+               /* FALLTHROUGH for SEND_ONLY_WITH_IMMEDIATE */
+       case OP(SEND_LAST_WITH_IMMEDIATE):
+send_last_imm:
+               wc.ex.imm_data = ohdr->u.imm_data;
+               wc.wc_flags = IB_WC_WITH_IMM;
+               goto send_last;
+       case OP(RDMA_WRITE_LAST):
+               copy_last = ibpd_to_rvtpd(qp->ibqp.pd)->user;
+               /* fall through */
+       case OP(SEND_LAST):
+no_immediate_data:
+               wc.wc_flags = 0;
+               wc.ex.imm_data = 0;
+send_last:
+               /* Get the number of bytes the message was padded by. */
+               pad = (bth0 >> 20) & 3;
+               /* Check for invalid length. */
+               /* LAST len should be >= 1 */
+               if (unlikely(tlen < (hdrsize + pad + 4)))
+                       goto nack_inv;
+               /* Don't count the CRC. */
+               tlen -= (hdrsize + pad + 4);
+               wc.byte_len = tlen + qp->r_rcv_len;
+               if (unlikely(wc.byte_len > qp->r_len))
+                       goto nack_inv;
+               hfi1_copy_sge(&qp->r_sge, data, tlen, 1, copy_last);
+               rvt_put_ss(&qp->r_sge);
+               qp->r_msn++;
+               /* No completion is generated unless a RWQE id was latched. */
+               if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
+                       break;
+               wc.wr_id = qp->r_wr_id;
+               wc.status = IB_WC_SUCCESS;
+               if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
+                   opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+                       wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+               else
+                       wc.opcode = IB_WC_RECV;
+               wc.qp = &qp->ibqp;
+               wc.src_qp = qp->remote_qpn;
+               wc.slid = qp->remote_ah_attr.dlid;
+               /*
+                * It seems that IB mandates the presence of an SL in a
+                * work completion only for the UD transport (see section
+                * 11.4.2 of IBTA Vol. 1).
+                *
+                * However, the way the SL is chosen below is consistent
+                * with the way that IB/qib works and is trying avoid
+                * introducing incompatibilities.
+                *
+                * See also OPA Vol. 1, section 9.7.6, and table 9-17.
+                */
+               wc.sl = qp->remote_ah_attr.sl;
+               /* zero fields that are N/A */
+               wc.vendor_err = 0;
+               wc.pkey_index = 0;
+               wc.dlid_path_bits = 0;
+               wc.port_num = 0;
+               /* Signal completion event if the solicited bit is set. */
+               rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
+                            (bth0 & IB_BTH_SOLICITED) != 0);
+               break;
+
+       case OP(RDMA_WRITE_ONLY):
+               copy_last = 1;
+               /* fall through */
+       case OP(RDMA_WRITE_FIRST):
+       case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+                       goto nack_inv;
+               /* consume RWQE */
+               reth = &ohdr->u.rc.reth;
+               qp->r_len = be32_to_cpu(reth->length);
+               qp->r_rcv_len = 0;
+               qp->r_sge.sg_list = NULL;
+               if (qp->r_len != 0) {
+                       u32 rkey = be32_to_cpu(reth->rkey);
+                       u64 vaddr = be64_to_cpu(reth->vaddr);
+                       int ok;
+
+                       /* Check rkey & NAK */
+                       ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr,
+                                        rkey, IB_ACCESS_REMOTE_WRITE);
+                       if (unlikely(!ok))
+                               goto nack_acc;
+                       qp->r_sge.num_sge = 1;
+               } else {
+                       qp->r_sge.num_sge = 0;
+                       qp->r_sge.sge.mr = NULL;
+                       qp->r_sge.sge.vaddr = NULL;
+                       qp->r_sge.sge.length = 0;
+                       qp->r_sge.sge.sge_length = 0;
+               }
+               if (opcode == OP(RDMA_WRITE_FIRST))
+                       goto send_middle;
+               else if (opcode == OP(RDMA_WRITE_ONLY))
+                       goto no_immediate_data;
+               /* Only RDMA_WRITE_ONLY_WITH_IMMEDIATE reaches this point. */
+               ret = hfi1_rvt_get_rwqe(qp, 1);
+               if (ret < 0)
+                       goto nack_op_err;
+               if (!ret)
+                       goto rnr_nak;
+               wc.ex.imm_data = ohdr->u.rc.imm_data;
+               wc.wc_flags = IB_WC_WITH_IMM;
+               goto send_last;
+
+       case OP(RDMA_READ_REQUEST): {
+               struct rvt_ack_entry *e;
+               u32 len;
+               u8 next;
+
+               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+                       goto nack_inv;
+               next = qp->r_head_ack_queue + 1;
+               /* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */
+               if (next > HFI1_MAX_RDMA_ATOMIC)
+                       next = 0;
+               spin_lock_irqsave(&qp->s_lock, flags);
+               /*
+                * Head would catch up with the tail: only proceed if the
+                * tail entry has already been sent, in which case the tail
+                * is advanced past it.
+                */
+               if (unlikely(next == qp->s_tail_ack_queue)) {
+                       if (!qp->s_ack_queue[next].sent)
+                               goto nack_inv_unlck;
+                       update_ack_queue(qp, next);
+               }
+               e = &qp->s_ack_queue[qp->r_head_ack_queue];
+               if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
+                       rvt_put_mr(e->rdma_sge.mr);
+                       e->rdma_sge.mr = NULL;
+               }
+               reth = &ohdr->u.rc.reth;
+               len = be32_to_cpu(reth->length);
+               if (len) {
+                       u32 rkey = be32_to_cpu(reth->rkey);
+                       u64 vaddr = be64_to_cpu(reth->vaddr);
+                       int ok;
+
+                       /* Check rkey & NAK */
+                       ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr,
+                                        rkey, IB_ACCESS_REMOTE_READ);
+                       if (unlikely(!ok))
+                               goto nack_acc_unlck;
+                       /*
+                        * Update the next expected PSN.  We add 1 later
+                        * below, so only add the remainder here.
+                        */
+                       if (len > pmtu)
+                               qp->r_psn += (len - 1) / pmtu;
+               } else {
+                       e->rdma_sge.mr = NULL;
+                       e->rdma_sge.vaddr = NULL;
+                       e->rdma_sge.length = 0;
+                       e->rdma_sge.sge_length = 0;
+               }
+               e->opcode = opcode;
+               e->sent = 0;
+               e->psn = psn;
+               e->lpsn = qp->r_psn;
+               /*
+                * We need to increment the MSN here instead of when we
+                * finish sending the result since a duplicate request would
+                * increment it more than once.
+                */
+               qp->r_msn++;
+               qp->r_psn++;
+               qp->r_state = opcode;
+               qp->r_nak_state = 0;
+               qp->r_head_ack_queue = next;
+
+               /* Schedule the send tasklet. */
+               qp->s_flags |= RVT_S_RESP_PENDING;
+               hfi1_schedule_send(qp);
+
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+               if (is_fecn)
+                       goto send_ack;
+               return;
+       }
+
+       case OP(COMPARE_SWAP):
+       case OP(FETCH_ADD): {
+               struct ib_atomic_eth *ateth;
+               struct rvt_ack_entry *e;
+               u64 vaddr;
+               atomic64_t *maddr;
+               u64 sdata;
+               u32 rkey;
+               u8 next;
+
+               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+                       goto nack_inv;
+               next = qp->r_head_ack_queue + 1;
+               if (next > HFI1_MAX_RDMA_ATOMIC)
+                       next = 0;
+               spin_lock_irqsave(&qp->s_lock, flags);
+               if (unlikely(next == qp->s_tail_ack_queue)) {
+                       if (!qp->s_ack_queue[next].sent)
+                               goto nack_inv_unlck;
+                       update_ack_queue(qp, next);
+               }
+               e = &qp->s_ack_queue[qp->r_head_ack_queue];
+               if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
+                       rvt_put_mr(e->rdma_sge.mr);
+                       e->rdma_sge.mr = NULL;
+               }
+               ateth = &ohdr->u.atomic_eth;
+               vaddr = ((u64)be32_to_cpu(ateth->vaddr[0]) << 32) |
+                       be32_to_cpu(ateth->vaddr[1]);
+               /* The target address must be 8-byte aligned. */
+               if (unlikely(vaddr & (sizeof(u64) - 1)))
+                       goto nack_inv_unlck;
+               rkey = be32_to_cpu(ateth->rkey);
+               /* Check rkey & NAK */
+               if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
+                                         vaddr, rkey,
+                                         IB_ACCESS_REMOTE_ATOMIC)))
+                       goto nack_acc_unlck;
+               /* Perform atomic OP and save result. */
+               maddr = (atomic64_t *)qp->r_sge.sge.vaddr;
+               sdata = be64_to_cpu(ateth->swap_data);
+               e->atomic_data = (opcode == OP(FETCH_ADD)) ?
+                       (u64)atomic64_add_return(sdata, maddr) - sdata :
+                       (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr,
+                                     be64_to_cpu(ateth->compare_data),
+                                     sdata);
+               rvt_put_mr(qp->r_sge.sge.mr);
+               qp->r_sge.num_sge = 0;
+               e->opcode = opcode;
+               e->sent = 0;
+               e->psn = psn;
+               e->lpsn = psn;
+               qp->r_msn++;
+               qp->r_psn++;
+               qp->r_state = opcode;
+               qp->r_nak_state = 0;
+               qp->r_head_ack_queue = next;
+
+               /* Schedule the send tasklet. */
+               qp->s_flags |= RVT_S_RESP_PENDING;
+               hfi1_schedule_send(qp);
+
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+               if (is_fecn)
+                       goto send_ack;
+               return;
+       }
+
+       default:
+               /* NAK unknown opcodes. */
+               goto nack_inv;
+       }
+       qp->r_psn++;
+       qp->r_state = opcode;
+       qp->r_ack_psn = psn;
+       qp->r_nak_state = 0;
+       /* Send an ACK if requested or required. */
+       if (psn & IB_BTH_REQ_ACK) {
+               struct hfi1_qp_priv *priv = qp->priv;
+
+               /*
+                * Three cases ACK immediately (cancelling any deferred ACK
+                * first): packet->numpkt is zero, the deferred-ACK count has
+                * reached HFI1_PSN_CREDIT, or the packet carried a FECN.
+                * Otherwise the ACK is deferred and counted.
+                */
+               if (packet->numpkt == 0) {
+                       rc_cancel_ack(qp);
+                       goto send_ack;
+               }
+               if (priv->r_adefered >= HFI1_PSN_CREDIT) {
+                       rc_cancel_ack(qp);
+                       goto send_ack;
+               }
+               if (unlikely(is_fecn)) {
+                       rc_cancel_ack(qp);
+                       goto send_ack;
+               }
+               priv->r_adefered++;
+               rc_defered_ack(rcd, qp);
+       }
+       return;
+
+rnr_nak:
+       qp->r_nak_state = qp->r_min_rnr_timer | IB_RNR_NAK;
+       qp->r_ack_psn = qp->r_psn;
+       /* Queue RNR NAK for later */
+       rc_defered_ack(rcd, qp);
+       return;
+
+nack_op_err:
+       hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+       qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
+       qp->r_ack_psn = qp->r_psn;
+       /* Queue NAK for later */
+       rc_defered_ack(rcd, qp);
+       return;
+
+nack_inv_unlck:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_inv:
+       hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+       qp->r_nak_state = IB_NAK_INVALID_REQUEST;
+       qp->r_ack_psn = qp->r_psn;
+       /* Queue NAK for later */
+       rc_defered_ack(rcd, qp);
+       return;
+
+nack_acc_unlck:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_acc:
+       hfi1_rc_error(qp, IB_WC_LOC_PROT_ERR);
+       qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
+       qp->r_ack_psn = qp->r_psn;
+       /*
+        * Unlike the other NAK paths, the access-error NAK is not deferred:
+        * it falls through and is sent immediately.
+        */
+send_ack:
+       hfi1_send_rc_ack(rcd, qp, is_fecn);
+}
+
+/*
+ * Handle an RC packet that was received with a header error.
+ *
+ * After the header passes hfi1_ruc_check_hdr(), a request packet (opcode
+ * below RDMA_READ_RESPONSE_FIRST) whose PSN is at or ahead of the expected
+ * PSN arms a PSN-sequence-error NAK, provided no NAK is already pending on
+ * the QP.  The NAK is queued through rc_defered_ack() rather than being
+ * sent right away.
+ */
+void hfi1_rc_hdrerr(
+       struct hfi1_ctxtdata *rcd,
+       struct hfi1_ib_header *hdr,
+       u32 rcv_flags,
+       struct rvt_qp *qp)
+{
+       int has_grh = rcv_flags & HFI1_HAS_GRH;
+       struct hfi1_other_headers *ohdr;
+       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+       int diff;
+       u32 opcode;
+       u32 psn, bth0;
+
+       /* The other-headers location depends on whether a GRH is present. */
+       ohdr = has_grh ? &hdr->u.l.oth : &hdr->u.oth;
+
+       bth0 = be32_to_cpu(ohdr->bth[0]);
+       if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, bth0))
+               return;
+
+       psn = be32_to_cpu(ohdr->bth[2]);
+       opcode = (bth0 >> 24) & 0xff;
+
+       /* Only request opcodes are handled here; responses are ignored. */
+       if (opcode >= IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST)
+               return;
+
+       diff = delta_psn(psn, qp->r_psn);
+       /* Nothing to do if a NAK is already pending or the PSN is old. */
+       if (qp->r_nak_state || diff < 0)
+               return;
+
+       ibp->rvp.n_rc_seqnak++;
+       qp->r_nak_state = IB_NAK_PSN_ERROR;
+       /* Use the expected PSN. */
+       qp->r_ack_psn = qp->r_psn;
+       /*
+        * Wait to send the sequence NAK until all packets in the receive
+        * queue have been processed.  Otherwise, we end up propagating
+        * congestion.
+        */
+       rc_defered_ack(rcd, qp);
+}
diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c
new file mode 100644 (file)
index 0000000..a659aec
--- /dev/null
@@ -0,0 +1,979 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/spinlock.h>
+
+#include "hfi.h"
+#include "mad.h"
+#include "qp.h"
+#include "verbs_txreq.h"
+#include "trace.h"
+
+/*
+ * Convert the AETH RNR timeout code into the number of microseconds.
+ *
+ * The table is indexed by the 5-bit timeout code (0x00 - 0x1F).  Each
+ * entry's trailing comment gives the code in hex followed by the same
+ * delay expressed in milliseconds (e.g. 655360 usec == 655.36 msec).
+ */
+const u32 ib_hfi1_rnr_table[32] = {
+       655360, /* 00: 655.36 */
+       10,     /* 01:    .01 */
+       20,     /* 02:    .02 */
+       30,     /* 03:    .03 */
+       40,     /* 04:    .04 */
+       60,     /* 05:    .06 */
+       80,     /* 06:    .08 */
+       120,    /* 07:    .12 */
+       160,    /* 08:    .16 */
+       240,    /* 09:    .24 */
+       320,    /* 0A:    .32 */
+       480,    /* 0B:    .48 */
+       640,    /* 0C:    .64 */
+       960,    /* 0D:    .96 */
+       1280,   /* 0E:   1.28 */
+       1920,   /* 0F:   1.92 */
+       2560,   /* 10:   2.56 */
+       3840,   /* 11:   3.84 */
+       5120,   /* 12:   5.12 */
+       7680,   /* 13:   7.68 */
+       10240,  /* 14:  10.24 */
+       15360,  /* 15:  15.36 */
+       20480,  /* 16:  20.48 */
+       30720,  /* 17:  30.72 */
+       40960,  /* 18:  40.96 */
+       61440,  /* 19:  61.44 */
+       81920,  /* 1A:  81.92 */
+       122880, /* 1B: 122.88 */
+       163840, /* 1C: 163.84 */
+       245760, /* 1D: 245.76 */
+       327680, /* 1E: 327.68 */
+       491520  /* 1F: 491.52 */
+};
+
+/*
+ * Check every non-empty SGE of a receive WQE and load the QP's receive
+ * SGE state (qp->r_sge, qp->r_len) from it.
+ *
+ * Returns 1 on success.  If an lkey check fails, rvt_put_mr() is called on
+ * each SGE accepted so far, a solicited IB_WC_LOC_PROT_ERR completion is
+ * queued on the receive CQ, and 0 is returned.
+ */
+static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe)
+{
+       struct rvt_lkey_table *rkt = &to_idev(qp->ibqp.device)->rdi.lkey_table;
+       struct rvt_sge_state *ss = &qp->r_sge;
+       struct rvt_pd *pd;
+       struct ib_wc wc;
+       int i, j;
+
+       /* With an SRQ attached, keys are checked against the SRQ's PD. */
+       pd = ibpd_to_rvtpd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd);
+       ss->sg_list = qp->r_sg_list;
+       qp->r_len = 0;
+
+       /*
+        * i walks the posted SGEs, j counts the ones kept.  The first kept
+        * SGE lives in ss->sge, the following ones in ss->sg_list[].
+        */
+       for (i = j = 0; i < wqe->num_sge; i++) {
+               struct rvt_sge *dst;
+
+               if (wqe->sg_list[i].length == 0)
+                       continue;
+               dst = j ? &ss->sg_list[j - 1] : &ss->sge;
+               /* Check LKEY */
+               if (!rvt_lkey_ok(rkt, pd, dst, &wqe->sg_list[i],
+                                IB_ACCESS_LOCAL_WRITE))
+                       goto bad_lkey;
+               qp->r_len += wqe->sg_list[i].length;
+               j++;
+       }
+       ss->num_sge = j;
+       ss->total_len = qp->r_len;
+       return 1;
+
+bad_lkey:
+       /* Unwind the SGEs accepted before the failure, newest first. */
+       while (j--) {
+               struct rvt_sge *sge = j ? &ss->sg_list[j - 1] : &ss->sge;
+
+               rvt_put_mr(sge->mr);
+       }
+       ss->num_sge = 0;
+       memset(&wc, 0, sizeof(wc));
+       wc.wr_id = wqe->wr_id;
+       wc.status = IB_WC_LOC_PROT_ERR;
+       wc.opcode = IB_WC_RECV;
+       wc.qp = &qp->ibqp;
+       /* Signal solicited completion event. */
+       rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
+       return 0;
+}
+
+/**
+ * hfi1_rvt_get_rwqe - copy the next RWQE into the QP's RWQE
+ * @qp: the QP
+ * @wr_id_only: update qp->r_wr_id only, not qp->r_sge
+ *
+ * The RWQE is taken from the SRQ's receive queue when the QP is attached
+ * to an SRQ, otherwise from the QP's own receive queue.  On success
+ * RVT_R_WRID_VALID is set in qp->r_aflags.  When the queue is an SRQ and
+ * the number of remaining WQEs drops below the SRQ limit, the SRQ's event
+ * handler is called with IB_EVENT_SRQ_LIMIT_REACHED after the queue lock
+ * has been released.
+ *
+ * Return -1 if there is a local error, 0 if no RWQE is available,
+ * otherwise return 1.
+ *
+ * Can be called from interrupt level.
+ */
+int hfi1_rvt_get_rwqe(struct rvt_qp *qp, int wr_id_only)
+{
+       unsigned long flags;
+       struct rvt_rq *rq;
+       struct rvt_rwq *wq;
+       struct rvt_srq *srq;
+       struct rvt_rwqe *wqe;
+       void (*handler)(struct ib_event *, void *);
+       u32 tail;
+       int ret;
+
+       /* Pick the receive queue; only an SRQ has a limit event handler. */
+       if (qp->ibqp.srq) {
+               srq = ibsrq_to_rvtsrq(qp->ibqp.srq);
+               handler = srq->ibsrq.event_handler;
+               rq = &srq->rq;
+       } else {
+               srq = NULL;
+               handler = NULL;
+               rq = &qp->r_rq;
+       }
+
+       spin_lock_irqsave(&rq->lock, flags);
+       /* Report "no RWQE" when the QP state does not allow receives. */
+       if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+               ret = 0;
+               goto unlock;
+       }
+
+       wq = rq->wq;
+       tail = wq->tail;
+       /* Validate tail before using it since it is user writable. */
+       if (tail >= rq->size)
+               tail = 0;
+       /* Queue is empty. */
+       if (unlikely(tail == wq->head)) {
+               ret = 0;
+               goto unlock;
+       }
+       /* Make sure entry is read after head index is read. */
+       smp_rmb();
+       wqe = rvt_get_rwqe_ptr(rq, tail);
+       /*
+        * Even though we update the tail index in memory, the verbs
+        * consumer is not supposed to post more entries until a
+        * completion is generated.
+        */
+       if (++tail >= rq->size)
+               tail = 0;
+       wq->tail = tail;
+       /*
+        * The tail has already been advanced, so a WQE that fails
+        * init_sge() is still consumed.
+        */
+       if (!wr_id_only && !init_sge(qp, wqe)) {
+               ret = -1;
+               goto unlock;
+       }
+       qp->r_wr_id = wqe->wr_id;
+
+       ret = 1;
+       set_bit(RVT_R_WRID_VALID, &qp->r_aflags);
+       if (handler) {
+               u32 n;
+
+               /*
+                * Validate head pointer value and compute
+                * the number of remaining WQEs.
+                */
+               n = wq->head;
+               if (n >= rq->size)
+                       n = 0;
+               if (n < tail)
+                       n += rq->size - tail;
+               else
+                       n -= tail;
+               if (n < srq->limit) {
+                       struct ib_event ev;
+
+                       /*
+                        * Clear the limit (presumably so the event is not
+                        * raised again until the limit is re-armed) and
+                        * drop the lock before calling the handler.
+                        */
+                       srq->limit = 0;
+                       spin_unlock_irqrestore(&rq->lock, flags);
+                       ev.device = qp->ibqp.device;
+                       ev.element.srq = qp->ibqp.srq;
+                       ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+                       handler(&ev, srq->ibsrq.srq_context);
+                       /* Lock already released above; skip the unlock. */
+                       goto bail;
+               }
+       }
+unlock:
+       spin_unlock_irqrestore(&rq->lock, flags);
+bail:
+       return ret;
+}
+
+/*
+ * Return the big-endian source GUID for GID table index @index: index 0 is
+ * the port GUID taken from the port data, any other index N selects
+ * ibp->guids[N - 1].
+ */
+static __be64 get_sguid(struct hfi1_ibport *ibp, unsigned index)
+{
+       if (index)
+               return ibp->guids[index - 1];
+
+       return cpu_to_be64(ppd_from_ibp(ibp)->guid);
+}
+
+/*
+ * Return non-zero when @gid carries interface id @id and its subnet prefix
+ * is either @gid_prefix or the default GID prefix.
+ */
+static int gid_ok(union ib_gid *gid, __be64 gid_prefix, __be64 id)
+{
+       if (gid->global.interface_id != id)
+               return 0;
+
+       return gid->global.subnet_prefix == gid_prefix ||
+              gid->global.subnet_prefix == IB_DEFAULT_GID_PREFIX;
+}
+
+/*
+ *
+ * This should be called with the QP r_lock held.
+ *
+ * The s_lock will be acquired around the hfi1_migrate_qp() call.
+ */
+int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr,
+                      int has_grh, struct rvt_qp *qp, u32 bth0)
+{
+       unsigned long flags;
+       __be64 guid;
+       u16 slid;
+       u8 sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+       /* Non-zero when an armed QP receives a packet with MIG_REQ set. */
+       int migrate = qp->s_mig_state == IB_MIG_ARMED &&
+                     (bth0 & IB_BTH_MIG_REQ);
+
+       /*
+        * GRH presence and both GIDs must agree with the path being
+        * checked: the alternate path for a migration request, the
+        * primary (remote) path otherwise.
+        */
+       if (migrate) {
+               if (!has_grh) {
+                       if (qp->alt_ah_attr.ah_flags & IB_AH_GRH)
+                               return 1;
+               } else {
+                       if (!(qp->alt_ah_attr.ah_flags & IB_AH_GRH))
+                               return 1;
+                       guid = get_sguid(ibp, qp->alt_ah_attr.grh.sgid_index);
+                       if (!gid_ok(&hdr->u.l.grh.dgid, ibp->rvp.gid_prefix,
+                                   guid))
+                               return 1;
+                       if (!gid_ok(
+                               &hdr->u.l.grh.sgid,
+                               qp->alt_ah_attr.grh.dgid.global.subnet_prefix,
+                               qp->alt_ah_attr.grh.dgid.global.interface_id))
+                               return 1;
+               }
+       } else {
+               if (!has_grh) {
+                       if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+                               return 1;
+               } else {
+                       if (!(qp->remote_ah_attr.ah_flags & IB_AH_GRH))
+                               return 1;
+                       guid = get_sguid(ibp,
+                                        qp->remote_ah_attr.grh.sgid_index);
+                       if (!gid_ok(&hdr->u.l.grh.dgid, ibp->rvp.gid_prefix,
+                                   guid))
+                               return 1;
+                       if (!gid_ok(
+                            &hdr->u.l.grh.sgid,
+                            qp->remote_ah_attr.grh.dgid.global.subnet_prefix,
+                            qp->remote_ah_attr.grh.dgid.global.interface_id))
+                               return 1;
+               }
+       }
+
+       /* The P_Key check is the same for both paths. */
+       slid = be16_to_cpu(hdr->lrh[3]);
+       if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0,
+                                   sc5, slid))) {
+               hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY,
+                              (u16)bth0,
+                              (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
+                              0, qp->ibqp.qp_num,
+                              slid,
+                              be16_to_cpu(hdr->lrh[1]));
+               return 1;
+       }
+
+       if (migrate) {
+               /* Validate the SLID. See Ch. 9.6.1.5 and 17.2.8 */
+               if (slid != qp->alt_ah_attr.dlid ||
+                   ppd_from_ibp(ibp)->port != qp->alt_ah_attr.port_num)
+                       return 1;
+               spin_lock_irqsave(&qp->s_lock, flags);
+               hfi1_migrate_qp(qp);
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+               return 0;
+       }
+
+       /* Validate the SLID. See Ch. 9.6.1.5 */
+       if (slid != qp->remote_ah_attr.dlid ||
+           ppd_from_ibp(ibp)->port != qp->port_num)
+               return 1;
+       /* A packet without MIG_REQ moves a re-arming QP to ARMED. */
+       if (qp->s_mig_state == IB_MIG_REARM &&
+           !(bth0 & IB_BTH_MIG_REQ))
+               qp->s_mig_state = IB_MIG_ARMED;
+
+       return 0;
+}
+
+/**
+ * ruc_loopback - handle UC and RC loopback requests
+ * @sqp: the sending QP
+ *
+ * This is called from hfi1_do_send() to
+ * forward a WQE addressed to the same HFI.
+ * Note that although we are single threaded due to the tasklet, we still
+ * have to protect against post_send().  We don't have to worry about
+ * receive interrupts since this is a connected protocol and all packets
+ * will pass through here.
+ *
+ * The whole function runs under rcu_read_lock().  sqp->s_lock is taken
+ * here to pick the next WQE, dropped while the data is copied, and
+ * re-taken to complete the WQE.  RVT_S_BUSY is set while a WQE is being
+ * processed and cleared on every exit path that set it.
+ */
+static void ruc_loopback(struct rvt_qp *sqp)
+{
+       struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num);
+       struct rvt_qp *qp;
+       struct rvt_swqe *wqe;
+       struct rvt_sge *sge;
+       unsigned long flags;
+       struct ib_wc wc;
+       u64 sdata;
+       atomic64_t *maddr;
+       enum ib_wc_status send_status;
+       int release;
+       int ret;
+       int copy_last = 0;
+       u32 to;
+
+       rcu_read_lock();
+
+       /*
+        * Note that we check the responder QP state after
+        * checking the requester's state.
+        */
+       qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp,
+                           sqp->remote_qpn);
+
+       spin_lock_irqsave(&sqp->s_lock, flags);
+
+       /* Return if we are already busy processing a work request. */
+       if ((sqp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT)) ||
+           !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND))
+               goto unlock;
+
+       sqp->s_flags |= RVT_S_BUSY;
+
+       /* Every jump to "again" arrives with s_lock held. */
+again:
+       smp_read_barrier_depends(); /* see post_one_send() */
+       if (sqp->s_last == ACCESS_ONCE(sqp->s_head))
+               goto clr_busy;
+       wqe = rvt_get_swqe_ptr(sqp, sqp->s_last);
+
+       /* Return if it is not OK to start a new work request. */
+       if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) {
+               if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND))
+                       goto clr_busy;
+               /* We are in the error state, flush the work request. */
+               send_status = IB_WC_WR_FLUSH_ERR;
+               goto flush_send;
+       }
+
+       /*
+        * We can rely on the entry not changing without the s_lock
+        * being held until we update s_last.
+        * We increment s_cur to indicate s_last is in progress.
+        */
+       if (sqp->s_last == sqp->s_cur) {
+               if (++sqp->s_cur >= sqp->s_size)
+                       sqp->s_cur = 0;
+       }
+       /*
+        * From here until send_comp/serr the work is done without s_lock;
+        * RVT_S_BUSY (set above) makes other callers return early.
+        */
+       spin_unlock_irqrestore(&sqp->s_lock, flags);
+
+       /* Drop the request if there is no usable responder QP of this type. */
+       if (!qp || !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) ||
+           qp->ibqp.qp_type != sqp->ibqp.qp_type) {
+               ibp->rvp.n_pkt_drops++;
+               /*
+                * For RC, the requester would timeout and retry so
+                * shortcut the timeouts and just signal too many retries.
+                */
+               if (sqp->ibqp.qp_type == IB_QPT_RC)
+                       send_status = IB_WC_RETRY_EXC_ERR;
+               else
+                       send_status = IB_WC_SUCCESS;
+               goto serr;
+       }
+
+       memset(&wc, 0, sizeof(wc));
+       send_status = IB_WC_SUCCESS;
+
+       /*
+        * release is passed to hfi1_copy_sge().  It is cleared only for
+        * RDMA READ, in which case the copy loop below drops the source
+        * MR reference itself as each SGE is used up.
+        */
+       release = 1;
+       sqp->s_sge.sge = wqe->sg_list[0];
+       sqp->s_sge.sg_list = wqe->sg_list + 1;
+       sqp->s_sge.num_sge = wqe->wr.num_sge;
+       sqp->s_len = wqe->length;
+       switch (wqe->wr.opcode) {
+       case IB_WR_SEND_WITH_IMM:
+               wc.wc_flags = IB_WC_WITH_IMM;
+               wc.ex.imm_data = wqe->wr.ex.imm_data;
+               /* FALLTHROUGH */
+       case IB_WR_SEND:
+               /* ret < 0: responder error; ret == 0: no receive WQE (RNR). */
+               ret = hfi1_rvt_get_rwqe(qp, 0);
+               if (ret < 0)
+                       goto op_err;
+               if (!ret)
+                       goto rnr_nak;
+               break;
+
+       case IB_WR_RDMA_WRITE_WITH_IMM:
+               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+                       goto inv_err;
+               wc.wc_flags = IB_WC_WITH_IMM;
+               wc.ex.imm_data = wqe->wr.ex.imm_data;
+               ret = hfi1_rvt_get_rwqe(qp, 1);
+               if (ret < 0)
+                       goto op_err;
+               if (!ret)
+                       goto rnr_nak;
+               /* skip copy_last set and qp_access_flags recheck */
+               goto do_write;
+       case IB_WR_RDMA_WRITE:
+               copy_last = ibpd_to_rvtpd(qp->ibqp.pd)->user;
+               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+                       goto inv_err;
+do_write:
+               if (wqe->length == 0)
+                       break;
+               /* The rkey-validated region becomes the copy destination. */
+               if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length,
+                                         wqe->rdma_wr.remote_addr,
+                                         wqe->rdma_wr.rkey,
+                                         IB_ACCESS_REMOTE_WRITE)))
+                       goto acc_err;
+               qp->r_sge.sg_list = NULL;
+               qp->r_sge.num_sge = 1;
+               qp->r_sge.total_len = wqe->length;
+               break;
+
+       case IB_WR_RDMA_READ:
+               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+                       goto inv_err;
+               /*
+                * For a read the roles are swapped: the rkey-validated
+                * region becomes the copy source (sqp->s_sge) and the
+                * requester's SG list becomes the destination (qp->r_sge).
+                */
+               if (unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length,
+                                         wqe->rdma_wr.remote_addr,
+                                         wqe->rdma_wr.rkey,
+                                         IB_ACCESS_REMOTE_READ)))
+                       goto acc_err;
+               release = 0;
+               sqp->s_sge.sg_list = NULL;
+               sqp->s_sge.num_sge = 1;
+               qp->r_sge.sge = wqe->sg_list[0];
+               qp->r_sge.sg_list = wqe->sg_list + 1;
+               qp->r_sge.num_sge = wqe->wr.num_sge;
+               qp->r_sge.total_len = wqe->length;
+               break;
+
+       case IB_WR_ATOMIC_CMP_AND_SWP:
+       case IB_WR_ATOMIC_FETCH_AND_ADD:
+               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+                       goto inv_err;
+               if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
+                                         wqe->atomic_wr.remote_addr,
+                                         wqe->atomic_wr.rkey,
+                                         IB_ACCESS_REMOTE_ATOMIC)))
+                       goto acc_err;
+               /* Perform atomic OP and save result. */
+               maddr = (atomic64_t *)qp->r_sge.sge.vaddr;
+               sdata = wqe->atomic_wr.compare_add;
+               /*
+                * The requester's buffer receives the target's prior
+                * contents: atomic64_add_return() yields the post-add value,
+                * so sdata is subtracted; cmpxchg() returns the prior value.
+                */
+               *(u64 *)sqp->s_sge.sge.vaddr =
+                       (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
+                       (u64)atomic64_add_return(sdata, maddr) - sdata :
+                       (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr,
+                                     sdata, wqe->atomic_wr.swap);
+               rvt_put_mr(qp->r_sge.sge.mr);
+               qp->r_sge.num_sge = 0;
+               /* Atomics move no payload, so the copy loop is skipped. */
+               goto send_comp;
+
+       default:
+               send_status = IB_WC_LOC_QP_OP_ERR;
+               goto serr;
+       }
+
+       /* Copy sqp->s_len bytes from the source SGE state into qp->r_sge. */
+       sge = &sqp->s_sge.sge;
+       while (sqp->s_len) {
+               u32 len = sqp->s_len;
+
+               /* Limit the chunk to the current segment and current SGE. */
+               if (len > sge->length)
+                       len = sge->length;
+               if (len > sge->sge_length)
+                       len = sge->sge_length;
+               WARN_ON_ONCE(len == 0);
+               hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, release, copy_last);
+               sge->vaddr += len;
+               sge->length -= len;
+               sge->sge_length -= len;
+               if (sge->sge_length == 0) {
+                       /* SGE used up: move on to the next one, if any. */
+                       if (!release)
+                               rvt_put_mr(sge->mr);
+                       if (--sqp->s_sge.num_sge)
+                               *sge = *sqp->s_sge.sg_list++;
+               } else if (sge->length == 0 && sge->mr->lkey) {
+                       /* Segment used up: step to the MR's next segment. */
+                       if (++sge->n >= RVT_SEGSZ) {
+                               if (++sge->m >= sge->mr->mapsz)
+                                       break;
+                               sge->n = 0;
+                       }
+                       sge->vaddr =
+                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
+                       sge->length =
+                               sge->mr->map[sge->m]->segs[sge->n].length;
+               }
+               sqp->s_len -= len;
+       }
+       if (release)
+               rvt_put_ss(&qp->r_sge);
+
+       /*
+        * A receive completion is generated only when RVT_R_WRID_VALID was
+        * set (presumably by hfi1_rvt_get_rwqe() - TODO confirm).
+        */
+       if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
+               goto send_comp;
+
+       if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+               wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+       else
+               wc.opcode = IB_WC_RECV;
+       wc.wr_id = qp->r_wr_id;
+       wc.status = IB_WC_SUCCESS;
+       wc.byte_len = wqe->length;
+       wc.qp = &qp->ibqp;
+       wc.src_qp = qp->remote_qpn;
+       wc.slid = qp->remote_ah_attr.dlid;
+       wc.sl = qp->remote_ah_attr.sl;
+       /* NOTE(review): port number is hard-coded to 1 - confirm intent. */
+       wc.port_num = 1;
+       /* Signal completion event if the solicited bit is set. */
+       rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
+                    wqe->wr.send_flags & IB_SEND_SOLICITED);
+
+       /* Re-take s_lock: completion and the jump to "again" need it held. */
+send_comp:
+       spin_lock_irqsave(&sqp->s_lock, flags);
+       ibp->rvp.n_loop_pkts++;
+       /* Reached with s_lock held, from above or from the flush path. */
+flush_send:
+       sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
+       hfi1_send_complete(sqp, wqe, send_status);
+       goto again;
+
+rnr_nak:
+       /* Handle RNR NAK */
+       if (qp->ibqp.qp_type == IB_QPT_UC)
+               goto send_comp;
+       ibp->rvp.n_rnr_naks++;
+       /*
+        * Note: we don't need the s_lock held since the BUSY flag
+        * makes this single threaded.
+        */
+       if (sqp->s_rnr_retry == 0) {
+               send_status = IB_WC_RNR_RETRY_EXC_ERR;
+               goto serr;
+       }
+       /* The retry budget is only decremented when s_rnr_retry_cnt < 7. */
+       if (sqp->s_rnr_retry_cnt < 7)
+               sqp->s_rnr_retry--;
+       spin_lock_irqsave(&sqp->s_lock, flags);
+       if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK))
+               goto clr_busy;
+       /* Arm the RNR timer using the responder's r_min_rnr_timer index. */
+       to = ib_hfi1_rnr_table[qp->r_min_rnr_timer];
+       hfi1_add_rnr_timer(sqp, to);
+       goto clr_busy;
+
+       /*
+        * Error labels: send_status is what the requester's WQE completes
+        * with, wc.status is what is passed to hfi1_rc_error() for the
+        * responder QP.
+        */
+op_err:
+       send_status = IB_WC_REM_OP_ERR;
+       wc.status = IB_WC_LOC_QP_OP_ERR;
+       goto err;
+
+inv_err:
+       send_status = IB_WC_REM_INV_REQ_ERR;
+       wc.status = IB_WC_LOC_QP_OP_ERR;
+       goto err;
+
+acc_err:
+       send_status = IB_WC_REM_ACCESS_ERR;
+       wc.status = IB_WC_LOC_PROT_ERR;
+err:
+       /* responder goes to error state */
+       hfi1_rc_error(qp, wc.status);
+
+       /* Reached without s_lock held; complete the WQE with send_status. */
+serr:
+       spin_lock_irqsave(&sqp->s_lock, flags);
+       hfi1_send_complete(sqp, wqe, send_status);
+       if (sqp->ibqp.qp_type == IB_QPT_RC) {
+               /* For RC, rvt_error_qp() is also called on the requester. */
+               int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
+
+               sqp->s_flags &= ~RVT_S_BUSY;
+               spin_unlock_irqrestore(&sqp->s_lock, flags);
+               /* The event handler is invoked after s_lock is dropped. */
+               if (lastwqe) {
+                       struct ib_event ev;
+
+                       ev.device = sqp->ibqp.device;
+                       ev.element.qp = &sqp->ibqp;
+                       ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+                       sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
+               }
+               goto done;
+       }
+clr_busy:
+       sqp->s_flags &= ~RVT_S_BUSY;
+unlock:
+       spin_unlock_irqrestore(&sqp->s_lock, flags);
+done:
+       rcu_read_unlock();
+}
+
+/**
+ * hfi1_make_grh - fill in a GRH header
+ * @ibp: the IB port the packet is sent from
+ * @hdr: the GRH header to populate
+ * @grh: the global route address to send to
+ * @hwords: the number of 32 bit words of header being sent
+ * @nwords: the number of 32 bit words of data being sent
+ *
+ * The source GID uses the port's GID prefix; its interface ID comes from
+ * ibp->guids[] when @grh->sgid_index is non-zero and below the array
+ * size, and from the port GUID otherwise.
+ *
+ * Return the size of the header in 32 bit words.
+ */
+u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr,
+                 struct ib_global_route *grh, u32 hwords, u32 nwords)
+{
+       u32 vtf = (IB_GRH_VERSION << IB_GRH_VERSION_SHIFT) |
+                 (grh->traffic_class << IB_GRH_TCLASS_SHIFT) |
+                 (grh->flow_label << IB_GRH_FLOW_SHIFT);
+       u32 paylen_words = hwords - 2 + nwords + SIZE_OF_CRC;
+
+       hdr->version_tclass_flow = cpu_to_be32(vtf);
+       /* paylen is stored in bytes, hence the conversion from words. */
+       hdr->paylen = cpu_to_be16(paylen_words << 2);
+       /* next_hdr is defined by C8-7 in ch. 8.4.1 */
+       hdr->next_hdr = IB_GRH_NEXT_HDR;
+       hdr->hop_limit = grh->hop_limit;
+
+       /* The SGID is 32-bit aligned. */
+       hdr->sgid.global.subnet_prefix = ibp->rvp.gid_prefix;
+       if (grh->sgid_index && grh->sgid_index < ARRAY_SIZE(ibp->guids))
+               hdr->sgid.global.interface_id =
+                       ibp->guids[grh->sgid_index - 1];
+       else
+               hdr->sgid.global.interface_id =
+                       cpu_to_be64(ppd_from_ibp(ibp)->guid);
+
+       hdr->dgid = grh->dgid;
+
+       /* GRH header size in 32-bit words. */
+       return sizeof(struct ib_grh) / sizeof(u32);
+}
+
+#define BTH2_OFFSET (offsetof(struct hfi1_pio_header, hdr.u.oth.bth[2]) / 4)
+
+/**
+ * build_ahg - create ahg in s_hdr
+ * @qp: a pointer to QP
+ * @npsn: the next PSN for the request/response
+ *
+ * The first middle packet allocates an ahg entry (if the QP does not
+ * already hold one) and flags the header to be copied.
+ *
+ * Later middles reuse that entry and only describe the PSN change:
+ * one edit for the low 16 bits, plus a second edit when the upper
+ * 16 bits differ from the PSN recorded at copy time.
+ */
+static inline void build_ahg(struct rvt_qp *qp, u32 npsn)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       if (unlikely(qp->s_flags & RVT_S_AHG_CLEAR))
+               clear_ahg(qp);
+
+       if (!(qp->s_flags & RVT_S_AHG_VALID)) {
+               /* first middle that needs copy  */
+               if (qp->s_ahgidx < 0)
+                       qp->s_ahgidx = sdma_ahg_alloc(priv->s_sde);
+               /* No entry available: leave the header at its defaults. */
+               if (qp->s_ahgidx < 0)
+                       return;
+
+               qp->s_ahgpsn = npsn;
+               priv->s_hdr->tx_flags |= SDMA_TXREQ_F_AHG_COPY;
+               /* save to protect a change in another thread */
+               priv->s_hdr->sde = priv->s_sde;
+               priv->s_hdr->ahgidx = qp->s_ahgidx;
+               qp->s_flags |= RVT_S_AHG_VALID;
+               return;
+       }
+
+       /* subsequent middle after valid */
+       if (qp->s_ahgidx < 0)
+               return;
+
+       priv->s_hdr->tx_flags |= SDMA_TXREQ_F_USE_AHG;
+       priv->s_hdr->ahgidx = qp->s_ahgidx;
+
+       /* Edit 0: low 16 bits of the PSN. */
+       priv->s_hdr->ahgcount++;
+       priv->s_hdr->ahgdesc[0] =
+               sdma_build_ahg_descriptor(
+                       (__force u16)cpu_to_be16((u16)npsn),
+                       BTH2_OFFSET,
+                       16,
+                       16);
+
+       if ((npsn & 0xffff0000) == (qp->s_ahgpsn & 0xffff0000))
+               return;
+
+       /* Edit 1: upper 16 bits, needed only when they changed. */
+       priv->s_hdr->ahgcount++;
+       priv->s_hdr->ahgdesc[1] =
+               sdma_build_ahg_descriptor(
+                       (__force u16)cpu_to_be16((u16)(npsn >> 16)),
+                       BTH2_OFFSET,
+                       0,
+                       16);
+}
+
+/**
+ * hfi1_make_ruc_header - build the LRH, optional GRH and BTH for a packet
+ * @qp: the sending QP; s_hdrwords is increased here when a GRH is added
+ * @ohdr: where the three BTH words are written
+ * @bth0: caller-supplied BTH word 0; the P_Key, the pad count and (when
+ *        s_mig_state is IB_MIG_MIGRATED) IB_BTH_MIG_REQ are ORed in here
+ * @bth2: BTH word 2, also handed to build_ahg() as the PSN
+ * @middle: non-zero to request AHG via build_ahg(); forced to zero when a
+ *          GRH is present or s_mig_state is not IB_MIG_MIGRATED
+ * @ps: packet state supplying the port and the txreq whose header is filled
+ */
+void hfi1_make_ruc_header(struct rvt_qp *qp, struct hfi1_other_headers *ohdr,
+                         u32 bth0, u32 bth2, int middle,
+                         struct hfi1_pkt_state *ps)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+       struct hfi1_ibport *ibp = ps->ibp;
+       u16 lrh0;
+       u32 nwords;
+       u32 extra_bytes;
+       u32 bth1;
+
+       /* Construct the header. */
+       /* Pad bytes needed to round s_cur_size up to a multiple of 4. */
+       extra_bytes = -qp->s_cur_size & 3;
+       nwords = (qp->s_cur_size + extra_bytes) >> 2;
+       lrh0 = HFI1_LRH_BTH;
+       if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+               /* hfi1_make_grh() returns the GRH size in 32-bit words. */
+               qp->s_hdrwords += hfi1_make_grh(ibp,
+                                               &ps->s_txreq->phdr.hdr.u.l.grh,
+                                               &qp->remote_ah_attr.grh,
+                                               qp->s_hdrwords, nwords);
+               lrh0 = HFI1_LRH_GRH;
+               middle = 0;
+       }
+       /* SC goes in bits 15:12 of LRH word 0, SL in bits 7:4. */
+       lrh0 |= (priv->s_sc & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4;
+       /*
+        * reset s_hdr/AHG fields
+        *
+        * This ensures that ahgidx/ahgcount
+        * are at a non-AHG default to protect
+        * build_verbs_tx_desc() from using
+        * an ahgidx that was not set up for this
+        * packet (presumably - the original wording
+        * was unclear).
+        *
+        * build_ahg() will modify as appropriate
+        * to use the AHG feature.
+        */
+       priv->s_hdr->tx_flags = 0;
+       priv->s_hdr->ahgcount = 0;
+       priv->s_hdr->ahgidx = 0;
+       priv->s_hdr->sde = NULL;
+       if (qp->s_mig_state == IB_MIG_MIGRATED)
+               bth0 |= IB_BTH_MIG_REQ;
+       else
+               middle = 0;
+       if (middle)
+               build_ahg(qp, bth2);
+       else
+               qp->s_flags &= ~RVT_S_AHG_VALID;
+       ps->s_txreq->phdr.hdr.lrh[0] = cpu_to_be16(lrh0);
+       ps->s_txreq->phdr.hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+       /* LRH word 2: packet length in 32-bit words, including the CRC. */
+       ps->s_txreq->phdr.hdr.lrh[2] =
+               cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
+       ps->s_txreq->phdr.hdr.lrh[3] = cpu_to_be16(ppd_from_ibp(ibp)->lid |
+                                      qp->remote_ah_attr.src_path_bits);
+       bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index);
+       bth0 |= extra_bytes << 20;
+       ohdr->bth[0] = cpu_to_be32(bth0);
+       bth1 = qp->remote_qpn;
+       if (qp->s_flags & RVT_S_ECN) {
+               qp->s_flags &= ~RVT_S_ECN;
+               /* we recently received a FECN, so return a BECN */
+               bth1 |= (HFI1_BECN_MASK << HFI1_BECN_SHIFT);
+       }
+       ohdr->bth[1] = cpu_to_be32(bth1);
+       ohdr->bth[2] = cpu_to_be32(bth2);
+}
+
+/* when sending, force a reschedule every one of these periods */
+#define SEND_RESCHED_TIMEOUT (5 * HZ)  /* 5s in jiffies */
+
+/*
+ * Work-item wrapper: map @work to the iowait that contains it, convert
+ * that to a QP with iowait_to_qp(), and call hfi1_do_send() on it.
+ */
+void _hfi1_do_send(struct work_struct *work)
+{
+       hfi1_do_send(iowait_to_qp(container_of(work, struct iowait, iowork)));
+}
+
+/**
+ * hfi1_do_send - perform a send on a QP
+ * @qp: the QP to send on
+ *
+ * Process entries in the send work queue until credit or queue is
+ * exhausted.  Only allow one CPU to send a packet per QP (tasklet).
+ * Otherwise, two threads could send packets out of order.
+ *
+ * RC and UC traffic whose destination LID is this port's own LID is
+ * handed to ruc_loopback() instead (when `loopback` is zero).
+ */
+void hfi1_do_send(struct rvt_qp *qp)
+{
+       struct hfi1_pkt_state ps;
+       struct hfi1_qp_priv *priv = qp->priv;
+       int (*make_req)(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
+       unsigned long timeout;
+       unsigned long timeout_int;
+       int cpu;
+
+       ps.dev = to_idev(qp->ibqp.device);
+       ps.ibp = to_iport(qp->ibqp.device, qp->port_num);
+       ps.ppd = ppd_from_ibp(ps.ibp);
+
+       /* Pick the per-QP-type request builder and reschedule interval. */
+       switch (qp->ibqp.qp_type) {
+       case IB_QPT_RC:
+               /* DLID with the LMC path bits masked off == our own LID. */
+               if (!loopback && ((qp->remote_ah_attr.dlid & ~((1 << ps.ppd->lmc
+                                                               ) - 1)) ==
+                                ps.ppd->lid)) {
+                       ruc_loopback(qp);
+                       return;
+               }
+               make_req = hfi1_make_rc_req;
+               timeout_int = (qp->timeout_jiffies);
+               break;
+       case IB_QPT_UC:
+               /* Same self-addressed test as for RC. */
+               if (!loopback && ((qp->remote_ah_attr.dlid & ~((1 << ps.ppd->lmc
+                                                               ) - 1)) ==
+                                ps.ppd->lid)) {
+                       ruc_loopback(qp);
+                       return;
+               }
+               make_req = hfi1_make_uc_req;
+               timeout_int = SEND_RESCHED_TIMEOUT;
+               break;
+       default:
+               make_req = hfi1_make_ud_req;
+               timeout_int = SEND_RESCHED_TIMEOUT;
+       }
+
+       spin_lock_irqsave(&qp->s_lock, ps.flags);
+
+       /* Return if we are already busy processing a work request. */
+       if (!hfi1_send_ok(qp)) {
+               spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+               return;
+       }
+
+       qp->s_flags |= RVT_S_BUSY;
+
+       /* The yield check below fires after 1/8 of the chosen interval. */
+       timeout = jiffies + (timeout_int) / 8;
+       /* CPU used for the workqueue_congested() query below. */
+       cpu = priv->s_sde ? priv->s_sde->cpu :
+                       cpumask_first(cpumask_of_node(ps.ppd->dd->node));
+       /* ensure a pre-built packet is handled  */
+       ps.s_txreq = get_waiting_verbs_txreq(qp);
+       /*
+        * s_lock is held at the top of every iteration and across
+        * make_req(); it is dropped only around the actual send.
+        */
+       do {
+               /* Check for a constructed packet to be sent. */
+               if (qp->s_hdrwords != 0) {
+                       spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+                       /*
+                        * If the packet cannot be sent now, return and
+                        * the send tasklet will be woken up later.
+                        *
+                        * NOTE(review): RVT_S_BUSY is not cleared on this
+                        * path here; presumably the send/wakeup path
+                        * handles it - confirm.
+                        */
+                       if (hfi1_verbs_send(qp, &ps))
+                               return;
+                       /* Record that s_hdr is empty. */
+                       qp->s_hdrwords = 0;
+                       /* allow other tasks to run */
+                       if (unlikely(time_after(jiffies, timeout))) {
+                               /*
+                                * Workqueue is congested: clear BUSY,
+                                * reschedule this QP and return.
+                                */
+                               if (workqueue_congested(cpu,
+                                                       ps.ppd->hfi1_wq)) {
+                                       spin_lock_irqsave(
+                                               &qp->s_lock,
+                                               ps.flags);
+                                       qp->s_flags &= ~RVT_S_BUSY;
+                                       hfi1_schedule_send(qp);
+                                       spin_unlock_irqrestore(
+                                               &qp->s_lock,
+                                               ps.flags);
+                                       this_cpu_inc(
+                                               *ps.ppd->dd->send_schedule);
+                                       return;
+                               }
+                               /* Otherwise yield here if IRQs are enabled. */
+                               if (!irqs_disabled()) {
+                                       cond_resched();
+                                       this_cpu_inc(
+                                          *ps.ppd->dd->send_schedule);
+                               }
+                               timeout = jiffies + (timeout_int) / 8;
+                       }
+                       spin_lock_irqsave(&qp->s_lock, ps.flags);
+               }
+       } while (make_req(qp, &ps));
+
+       spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+}
+
+/*
+ * Retire the send WQE at s_last: advance s_last, drop the references
+ * the WQE holds, post a send completion when one is required, and pull
+ * along any queue index that still pointed at the retired slot.
+ *
+ * This should be called with s_lock held.
+ */
+void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
+                       enum ib_wc_status status)
+{
+       u32 prev, next;
+       unsigned idx;
+       int failed = status != IB_WC_SUCCESS;
+
+       if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND))
+               return;
+
+       /* Step s_last forward by one slot, wrapping at s_size. */
+       prev = qp->s_last;
+       next = prev + 1;
+       if (next >= qp->s_size)
+               next = 0;
+       qp->s_last = next;
+       /* See post_send() */
+       barrier();
+
+       for (idx = 0; idx < wqe->wr.num_sge; idx++)
+               rvt_put_mr(wqe->sg_list[idx].mr);
+
+       /* Datagram-style QPs also hold a reference on the address handle. */
+       switch (qp->ibqp.qp_type) {
+       case IB_QPT_UD:
+       case IB_QPT_SMI:
+       case IB_QPT_GSI:
+               atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount);
+               break;
+       default:
+               break;
+       }
+
+       /* See ch. 11.2.4.1 and 10.7.3.1 */
+       if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
+           (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
+           failed) {
+               struct ib_wc wc;
+
+               memset(&wc, 0, sizeof(wc));
+               wc.wr_id = wqe->wr.wr_id;
+               wc.status = status;
+               wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
+               wc.qp = &qp->ibqp;
+               /* byte_len is reported only for successful completions. */
+               if (!failed)
+                       wc.byte_len = wqe->length;
+               rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc,
+                            failed);
+       }
+
+       if (qp->s_acked == prev)
+               qp->s_acked = next;
+       if (qp->s_cur == prev)
+               qp->s_cur = next;
+       if (qp->s_tail == prev)
+               qp->s_tail = next;
+       if (qp->state == IB_QPS_SQD && next == qp->s_cur)
+               qp->s_draining = 0;
+}
diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c
new file mode 100644 (file)
index 0000000..f9befc0
--- /dev/null
@@ -0,0 +1,3054 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/spinlock.h>
+#include <linux/seqlock.h>
+#include <linux/netdevice.h>
+#include <linux/moduleparam.h>
+#include <linux/bitops.h>
+#include <linux/timer.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+
+#include "hfi.h"
+#include "common.h"
+#include "qp.h"
+#include "sdma.h"
+#include "iowait.h"
+#include "trace.h"
+
+/* SDMA_DESCQ_CNT must be a power of 2, at least 64 and at most 32768 */
+#define SDMA_DESCQ_CNT 2048
+#define SDMA_DESC_INTR 64
+#define INVALID_TAIL 0xffff
+
+static uint sdma_descq_cnt = SDMA_DESCQ_CNT;
+module_param(sdma_descq_cnt, uint, S_IRUGO);
+MODULE_PARM_DESC(sdma_descq_cnt, "Number of SDMA descq entries");
+
+static uint sdma_idle_cnt = 250;
+module_param(sdma_idle_cnt, uint, S_IRUGO);
+MODULE_PARM_DESC(sdma_idle_cnt, "sdma interrupt idle delay (ns,default 250)");
+
+uint mod_num_sdma;
+module_param_named(num_sdma, mod_num_sdma, uint, S_IRUGO);
+MODULE_PARM_DESC(num_sdma, "Set max number SDMA engines to use");
+
+static uint sdma_desct_intr = SDMA_DESC_INTR;
+module_param_named(desct_intr, sdma_desct_intr, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(desct_intr, "Number of SDMA descriptor before interrupt");
+
+#define SDMA_WAIT_BATCH_SIZE 20
+/* max wait time for a SDMA engine to indicate it has halted */
+#define SDMA_ERR_HALT_TIMEOUT 10 /* ms */
+/* all SDMA engine errors that cause a halt */
+
+#define SD(name) SEND_DMA_##name
+#define ALL_SDMA_ENG_HALT_ERRS \
+       (SD(ENG_ERR_STATUS_SDMA_WRONG_DW_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_GEN_MISMATCH_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_TOO_LONG_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_TAIL_OUT_OF_BOUNDS_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_FIRST_DESC_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_MEM_READ_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_LENGTH_MISMATCH_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_PACKET_DESC_OVERFLOW_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_HEADER_SELECT_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_HEADER_ADDRESS_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_HEADER_LENGTH_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_TIMEOUT_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_DESC_TABLE_UNC_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_ASSEMBLY_UNC_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_PACKET_TRACKING_UNC_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_HEADER_STORAGE_UNC_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK))
+
+/* sdma_sendctrl operations */
+#define SDMA_SENDCTRL_OP_ENABLE    BIT(0)
+#define SDMA_SENDCTRL_OP_INTENABLE BIT(1)
+#define SDMA_SENDCTRL_OP_HALT      BIT(2)
+#define SDMA_SENDCTRL_OP_CLEANUP   BIT(3)
+
+/* handle long defines */
+#define SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \
+SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SMASK
+#define SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT \
+SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT
+
+static const char * const sdma_state_names[] = { /* indexed by enum sdma_states */
+       [sdma_state_s00_hw_down]                = "s00_HwDown",
+       [sdma_state_s10_hw_start_up_halt_wait]  = "s10_HwStartUpHaltWait",
+       [sdma_state_s15_hw_start_up_clean_wait] = "s15_HwStartUpCleanWait",
+       [sdma_state_s20_idle]                   = "s20_Idle",
+       [sdma_state_s30_sw_clean_up_wait]       = "s30_SwCleanUpWait",
+       [sdma_state_s40_hw_clean_up_wait]       = "s40_HwCleanUpWait",
+       [sdma_state_s50_hw_halt_wait]           = "s50_HwHaltWait",
+       [sdma_state_s60_idle_halt_wait]         = "s60_IdleHaltWait",
+       [sdma_state_s80_hw_freeze]              = "s80_HwFreeze",
+       [sdma_state_s82_freeze_sw_clean]        = "s82_FreezeSwClean",
+       [sdma_state_s99_running]                = "s99_Running",
+};
+
+#ifdef CONFIG_SDMA_VERBOSITY /* event names are only built for verbose debugging */
+static const char * const sdma_event_names[] = { /* indexed by the sdma_event_* enumerators */
+       [sdma_event_e00_go_hw_down]   = "e00_GoHwDown",
+       [sdma_event_e10_go_hw_start]  = "e10_GoHwStart",
+       [sdma_event_e15_hw_halt_done] = "e15_HwHaltDone",
+       [sdma_event_e25_hw_clean_up_done] = "e25_HwCleanUpDone",
+       [sdma_event_e30_go_running]   = "e30_GoRunning",
+       [sdma_event_e40_sw_cleaned]   = "e40_SwCleaned",
+       [sdma_event_e50_hw_cleaned]   = "e50_HwCleaned",
+       [sdma_event_e60_hw_halted]    = "e60_HwHalted",
+       [sdma_event_e70_go_idle]      = "e70_GoIdle",
+       [sdma_event_e80_hw_freeze]    = "e80_HwFreeze",
+       [sdma_event_e81_hw_frozen]    = "e81_HwFrozen",
+       [sdma_event_e82_hw_unfreeze]  = "e82_HwUnfreeze",
+       [sdma_event_e85_link_down]    = "e85_LinkDown",
+       [sdma_event_e90_sw_halted]    = "e90_SwHalted",
+};
+#endif /* CONFIG_SDMA_VERBOSITY */
+
+/*
+ * Actions applied by sdma_set_state() when a state is entered, indexed by
+ * enum sdma_states.  Members that are not named for a state are
+ * zero-initialized: the matching sendctrl operation is left off and
+ * go_s99_running is not touched.
+ */
+static const struct sdma_set_state_action sdma_action_table[] = {
+       /* all sendctrl ops off, go_s99_running cleared */
+       [sdma_state_s00_hw_down] = {
+               .go_s99_running_tofalse = 1,
+       },
+       /* halt only */
+       [sdma_state_s10_hw_start_up_halt_wait] = {
+               .op_halt = 1,
+       },
+       /* interrupt enable + cleanup */
+       [sdma_state_s15_hw_start_up_clean_wait] = {
+               .op_intenable = 1,
+               .op_cleanup = 1,
+       },
+       /* interrupt enable only */
+       [sdma_state_s20_idle] = {
+               .op_intenable = 1,
+       },
+       /* all sendctrl ops off */
+       [sdma_state_s30_sw_clean_up_wait] = {
+               .op_enable = 0,
+       },
+       /* cleanup only */
+       [sdma_state_s40_hw_clean_up_wait] = {
+               .op_cleanup = 1,
+       },
+       /* all sendctrl ops off */
+       [sdma_state_s50_hw_halt_wait] = {
+               .op_enable = 0,
+       },
+       /* halt, go_s99_running cleared */
+       [sdma_state_s60_idle_halt_wait] = {
+               .go_s99_running_tofalse = 1,
+               .op_halt = 1,
+       },
+       /* all sendctrl ops off */
+       [sdma_state_s80_hw_freeze] = {
+               .op_enable = 0,
+       },
+       /* all sendctrl ops off */
+       [sdma_state_s82_freeze_sw_clean] = {
+               .op_enable = 0,
+       },
+       /* enable + interrupt enable, go_s99_running set */
+       [sdma_state_s99_running] = {
+               .go_s99_running_totrue = 1,
+               .op_enable = 1,
+               .op_intenable = 1,
+       },
+};
+
+#define SDMA_TAIL_UPDATE_THRESH 0x1F
+
+/* declare all statics here rather than keep sorting */
+static void sdma_complete(struct kref *);
+static void sdma_finalput(struct sdma_state *);
+static void sdma_get(struct sdma_state *);
+static void sdma_hw_clean_up_task(unsigned long);
+static void sdma_put(struct sdma_state *);
+static void sdma_set_state(struct sdma_engine *, enum sdma_states);
+static void sdma_start_hw_clean_up(struct sdma_engine *);
+static void sdma_sw_clean_up_task(unsigned long);
+static void sdma_sendctrl(struct sdma_engine *, unsigned);
+static void init_sdma_regs(struct sdma_engine *, u32, uint);
+static void sdma_process_event(
+       struct sdma_engine *sde,
+       enum sdma_events event);
+static void __sdma_process_event(
+       struct sdma_engine *sde,
+       enum sdma_events event);
+static void dump_sdma_state(struct sdma_engine *sde);
+static void sdma_make_progress(struct sdma_engine *sde, u64 status);
+static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail);
+static void sdma_flush_descq(struct sdma_engine *sde);
+
+/**
+ * sdma_state_name() - look up the printable name of an SDMA state
+ * @state: state to translate; used directly as an index into
+ *         sdma_state_names[] without any range check
+ *
+ * Return: the string stored in sdma_state_names[] for @state.
+ */
+static const char *sdma_state_name(enum sdma_states state)
+{
+       const char *name = sdma_state_names[state];
+
+       return name;
+}
+
+/* Take one more reference on the kref embedded in @ss. */
+static void sdma_get(struct sdma_state *ss)
+{
+       struct kref *ref = &ss->kref;
+
+       kref_get(ref);
+}
+
+/*
+ * kref release function used by sdma_put(): signals the completion
+ * embedded in the sdma_state that owns @kref.
+ */
+static void sdma_complete(struct kref *kref)
+{
+       struct sdma_state *ss;
+
+       ss = container_of(kref, struct sdma_state, kref);
+       complete(&ss->comp);
+}
+
+/* Drop one reference on @ss; sdma_complete() is the release function. */
+static void sdma_put(struct sdma_state *ss)
+{
+       struct kref *ref = &ss->kref;
+
+       kref_put(ref, sdma_complete);
+}
+
+/*
+ * Drop the caller's reference on @ss and then block until ss->comp has
+ * been completed (which sdma_complete() does when the kref is released).
+ */
+static void sdma_finalput(struct sdma_state *ss)
+{
+       kref_put(&ss->kref, sdma_complete);
+       wait_for_completion(&ss->comp);
+}
+
+/*
+ * Write @value to CSR @offset0 through write_kctxt_csr(), passing the
+ * engine's device and the engine index.
+ */
+static inline void write_sde_csr(struct sdma_engine *sde, u32 offset0,
+                                u64 value)
+{
+       struct hfi1_devdata *dd = sde->dd;
+
+       write_kctxt_csr(dd, sde->this_idx, offset0, value);
+}
+
+/*
+ * Read CSR @offset0 through read_kctxt_csr(), passing the engine's device
+ * and the engine index, and return the value read.
+ */
+static inline u64 read_sde_csr(struct sdma_engine *sde, u32 offset0)
+{
+       struct hfi1_devdata *dd = sde->dd;
+
+       return read_kctxt_csr(dd, sde->this_idx, offset0);
+}
+
+/*
+ * sdma_wait_for_packet_egress() - wait for the egress packet occupancy of
+ * sdma engine 'sde' to drop to 0.
+ *
+ * The occupancy field of SEND_EGRESS_SEND_DMA_STATUS is polled with a 1us
+ * delay between reads.  If the value stays non-zero and unchanged for more
+ * than 500 consecutive polls, an error is logged and the link bounce work
+ * is queued instead of waiting any longer.
+ *
+ * 'pause' is accepted but not used by this function.
+ */
+static void sdma_wait_for_packet_egress(struct sdma_engine *sde,
+                                       int pause)
+{
+       struct hfi1_devdata *dd = sde->dd;
+       u64 csr_off = 8 * sde->this_idx;
+       u64 occupancy = 0;
+       u64 last_occupancy;
+       int stalled = 0;
+
+       for (;;) {
+               last_occupancy = occupancy;
+               occupancy = read_csr(dd, csr_off + SEND_EGRESS_SEND_DMA_STATUS);
+
+               occupancy &= SDMA_EGRESS_PACKET_OCCUPANCY_SMASK;
+               occupancy >>= SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT;
+               if (!occupancy)
+                       return;
+               /* the stall counter is reset whenever the occupancy changes */
+               if (occupancy != last_occupancy)
+                       stalled = 0;
+               if (stalled++ > 500) {
+                       /* timed out - bounce the link */
+                       dd_dev_err(dd, "%s: engine %u timeout waiting for packets to egress, remaining count %u, bouncing link\n",
+                                  __func__, sde->this_idx, (u32)occupancy);
+                       queue_work(dd->pport->hfi1_wq,
+                                  &dd->pport->link_bounce_work);
+                       return;
+               }
+               udelay(1);
+       }
+}
+
+/*
+ * sdma_wait() - wait for packet egress to complete on every SDMA engine
+ * of 'dd', one engine after the other.
+ *
+ * Each engine is waited on with a 'pause' argument of 0.
+ */
+void sdma_wait(struct hfi1_devdata *dd)
+{
+       int idx = 0;
+
+       while (idx < dd->num_sdma) {
+               sdma_wait_for_packet_egress(&dd->per_sdma[idx], 0);
+               idx++;
+       }
+}
+
+/*
+ * Program the engine's DESC_CNT CSR with @cnt, masked and shifted into the
+ * CNT field.  Nothing is written when the device does not have
+ * HFI1_HAS_SDMA_TIMEOUT set in its flags.
+ */
+static inline void sdma_set_desc_cnt(struct sdma_engine *sde, unsigned cnt)
+{
+       u64 field = cnt;
+
+       if (sde->dd->flags & HFI1_HAS_SDMA_TIMEOUT) {
+               field = (field & SD(DESC_CNT_CNT_MASK)) <<
+                       SD(DESC_CNT_CNT_SHIFT);
+               write_sde_csr(sde, SD(DESC_CNT), field);
+       }
+}
+
+static inline void complete_tx(struct sdma_engine *sde,
+                              struct sdma_txreq *tx,
+                              int res)
+{
+       /* protect against complete modifying */
+       struct iowait *wait = tx->wait;
+       callback_t complete = tx->complete;
+
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+       trace_hfi1_sdma_out_sn(sde, tx->sn);
+       if (WARN_ON_ONCE(sde->head_sn != tx->sn))
+               dd_dev_err(sde->dd, "expected %llu got %llu\n",
+                          sde->head_sn, tx->sn);
+       sde->head_sn++;
+#endif
+       sdma_txclean(sde->dd, tx);
+       if (complete)
+               (*complete)(tx, res);
+       if (wait && iowait_sdma_dec(wait))
+               iowait_drain_wakeup(wait);
+}
+
+/*
+ * Complete all the sdma requests with a SDMA_TXREQ_S_ABORTED status
+ *
+ * Depending on timing there can be txreqs in two places:
+ * - in the descq ring
+ * - in the flush list
+ *
+ * To avoid ordering issues the descq ring needs to be flushed
+ * first followed by the flush list.
+ *
+ * The flush list is detached onto a private list while flushlist_lock is
+ * held, and the requests are completed only after the lock has been
+ * dropped.  The detach is a single list_splice_init(), which keeps the
+ * original order and leaves sde->flushlist empty, so the time spent with
+ * the lock held and interrupts disabled does not grow with the number of
+ * queued requests.
+ *
+ * This routine is called from two places
+ * - From a work queue item
+ * - Directly from the state machine just before setting the
+ *   state to running
+ *
+ * Must be called with head_lock held
+ *
+ */
+static void sdma_flush(struct sdma_engine *sde)
+{
+       struct sdma_txreq *txp, *txp_next;
+       LIST_HEAD(flushlist);
+       unsigned long flags;
+
+       /* flush from head to tail */
+       sdma_flush_descq(sde);
+       spin_lock_irqsave(&sde->flushlist_lock, flags);
+       /* move the whole flush list onto the private list in one step */
+       list_splice_init(&sde->flushlist, &flushlist);
+       spin_unlock_irqrestore(&sde->flushlist_lock, flags);
+       /* flush from flush list */
+       list_for_each_entry_safe(txp, txp_next, &flushlist, list)
+               complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED);
+}
+
+/*
+ * Work handler that flushes the descq ring and the flush list.
+ *
+ * The flush is skipped when the engine is already running by the time the
+ * work item executes; in that case the transition to running is assumed
+ * to have performed the flush itself.
+ *
+ * head_lock is taken (irqsave) around the check and the flush.
+ */
+static void sdma_field_flush(struct work_struct *work)
+{
+       struct sdma_engine *sde;
+       unsigned long irq_flags;
+
+       sde = container_of(work, struct sdma_engine, flush_worker);
+
+       write_seqlock_irqsave(&sde->head_lock, irq_flags);
+       if (!__sdma_running(sde))
+               sdma_flush(sde);
+       write_sequnlock_irqrestore(&sde->head_lock, irq_flags);
+}
+
+/*
+ * Work handler: poll the engine's STATUS CSR until ENG_HALTED is set or
+ * SDMA_ERR_HALT_TIMEOUT milliseconds have passed, sleeping 80-120us
+ * between polls.  In both cases e15_hw_halt_done is then sent to the state
+ * machine.
+ */
+static void sdma_err_halt_wait(struct work_struct *work)
+{
+       struct sdma_engine *sde;
+       unsigned long deadline;
+
+       sde = container_of(work, struct sdma_engine, err_halt_worker);
+       deadline = jiffies + msecs_to_jiffies(SDMA_ERR_HALT_TIMEOUT);
+
+       for (;;) {
+               u64 halted = read_sde_csr(sde, SD(STATUS)) &
+                            SD(STATUS_ENG_HALTED_SMASK);
+
+               if (halted)
+                       break;
+               if (time_after(jiffies, deadline)) {
+                       /*
+                        * Give up waiting but continue anyway.  This could
+                        * happen if there was an uncorrectable error in the
+                        * wrong spot.
+                        */
+                       dd_dev_err(sde->dd,
+                                  "SDMA engine %d - timeout waiting for engine to halt\n",
+                                  sde->this_idx);
+                       break;
+               }
+               usleep_range(80, 120);
+       }
+
+       sdma_process_event(sde, sdma_event_e15_hw_halt_done);
+}
+
+/*
+ * Arm the progress check timer of @sde, 10 jiffies from now.
+ *
+ * Only done when the device is not a bx part and the SDMA_AHG capability
+ * is set.  Before arming, the current descq_head of every other engine is
+ * recorded in its progress_check_head so that the timer handler can tell
+ * whether that engine has advanced.
+ */
+static void sdma_err_progress_check_schedule(struct sdma_engine *sde)
+{
+       struct hfi1_devdata *dd = sde->dd;
+       unsigned i;
+
+       if (is_bx(dd) || !HFI1_CAP_IS_KSET(SDMA_AHG))
+               return;
+
+       for (i = 0; i < dd->num_sdma; i++) {
+               struct sdma_engine *other = &dd->per_sdma[i];
+
+               if (other == sde)
+                       continue;
+               other->progress_check_head = other->descq_head;
+       }
+       dd_dev_err(dd, "SDMA engine %d - check scheduled\n", sde->this_idx);
+       mod_timer(&sde->err_progress_check_timer, jiffies + 10);
+}
+
+/*
+ * Timer handler armed by sdma_err_progress_check_schedule().
+ *
+ * @data is the sdma_engine whose timer fired.  Every other engine that is
+ * in the running state, has descriptors outstanding (head != tail) and
+ * whose head has not moved since progress_check_head was recorded is sent
+ * e90_sw_halted.  Afterwards the halt-wait worker of the triggering engine
+ * is scheduled.
+ */
+static void sdma_err_progress_check(unsigned long data)
+{
+       struct sdma_engine *sde = (struct sdma_engine *)data;
+       struct hfi1_devdata *dd = sde->dd;
+       unsigned i;
+
+       dd_dev_err(dd, "SDE progress check event\n");
+       for (i = 0; i < dd->num_sdma; i++) {
+               struct sdma_engine *other = &dd->per_sdma[i];
+               unsigned long flags;
+
+               /* the engine that triggered the check is not examined */
+               if (other == sde)
+                       continue;
+               /*
+                * Interrupts must be disabled while tail_lock is taken, to
+                * avoid a deadlock if an interrupt triggers and spins on
+                * the same lock on the same CPU.
+                */
+               spin_lock_irqsave(&other->tail_lock, flags);
+               write_seqlock(&other->head_lock);
+
+               /* only running queues that made no progress are halted */
+               if (other->state.current_state == sdma_state_s99_running &&
+                   other->descq_head != other->descq_tail &&
+                   other->descq_head == other->progress_check_head)
+                       __sdma_process_event(other,
+                                            sdma_event_e90_sw_halted);
+
+               write_sequnlock(&other->head_lock);
+               spin_unlock_irqrestore(&other->tail_lock, flags);
+       }
+       schedule_work(&sde->err_halt_worker);
+}
+
+/*
+ * Tasklet body: busy-wait, polling the STATUS CSR every 10us, until the
+ * engine reports ENG_CLEANED_UP, then send e25_hw_clean_up_done to the
+ * state machine.  The loop has no timeout.
+ *
+ * @opaque is the sdma_engine pointer cast to unsigned long.
+ */
+static void sdma_hw_clean_up_task(unsigned long opaque)
+{
+       struct sdma_engine *sde = (struct sdma_engine *)opaque;
+
+       for (;;) {
+#ifdef CONFIG_SDMA_VERBOSITY
+               dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+                          sde->this_idx, slashstrip(__FILE__), __LINE__,
+                       __func__);
+#endif
+               if (read_sde_csr(sde, SD(STATUS)) &
+                   SD(STATUS_ENG_CLEANED_UP_SMASK))
+                       break;
+               udelay(10);
+       }
+
+       sdma_process_event(sde, sdma_event_e25_hw_clean_up_done);
+}
+
+/*
+ * Return the tx_ring entry at tx_head (masked with sdma_mask).  The slot
+ * may hold NULL.
+ */
+static inline struct sdma_txreq *get_txhead(struct sdma_engine *sde)
+{
+       struct sdma_txreq *tx;
+
+       smp_read_barrier_depends(); /* see sdma_update_tail() */
+       tx = sde->tx_ring[sde->tx_head & sde->sdma_mask];
+       return tx;
+}
+
+/*
+ * flush ring for recovery
+ *
+ * Advances descq_head one descriptor at a time until it reaches
+ * descq_tail.  Whenever the head arrives at the next_descq_idx of the tx
+ * at the front of tx_ring, that tx is removed from the ring and completed
+ * with SDMA_TXREQ_S_ABORTED.  If at least one descriptor was retired,
+ * sdma_desc_avail() is called with the resulting free count.
+ */
+static void sdma_flush_descq(struct sdma_engine *sde)
+{
+       u16 head, tail;
+       int progress = 0;
+       struct sdma_txreq *txp = get_txhead(sde);
+
+       /* The reason for some of the complexity of this code is that
+        * not all descriptors have corresponding txps.  So, we have to
+        * be able to skip over descs until we wander into the range of
+        * the next txp on the list.
+        */
+       head = sde->descq_head & sde->sdma_mask;
+       tail = sde->descq_tail & sde->sdma_mask;
+       while (head != tail) {
+               /* advance head, wrap if needed */
+               head = ++sde->descq_head & sde->sdma_mask;
+               /* if now past this txp's descs, do the callback */
+               if (txp && txp->next_descq_idx == head) {
+                       /* remove from list */
+                       sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
+                       complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED);
+                       trace_hfi1_sdma_progress(sde, head, tail, txp);
+                       txp = get_txhead(sde);
+               }
+               progress++;
+       }
+       if (progress)
+               sdma_desc_avail(sde, sdma_descq_freecnt(sde));
+}
+
+/*
+ * Tasklet body for the software clean up step.
+ *
+ * With tail_lock and head_lock held, retires whatever the hardware has
+ * completed, aborts everything else, resets the software ring indices and
+ * then sends e40_sw_cleaned to the state machine.
+ *
+ * @opaque is the sdma_engine pointer cast to unsigned long.
+ */
+static void sdma_sw_clean_up_task(unsigned long opaque)
+{
+       struct sdma_engine *sde = (struct sdma_engine *)opaque;
+       unsigned long irq_flags;
+
+       spin_lock_irqsave(&sde->tail_lock, irq_flags);
+       write_seqlock(&sde->head_lock);
+
+       /*
+        * At this point, the following should always be true:
+        * - We are halted, so no more descriptors are getting retired.
+        * - We are not running, so no one is submitting new work.
+        * - Only we can send the e40_sw_cleaned, so we can't start
+        *   running again until we say so.  So, the active list and
+        *   descq are ours to play with.
+        */
+
+       /*
+        * In the error clean up sequence, software clean must be called
+        * before the hardware clean so we can use the hardware head in
+        * the progress routine.  A hardware clean or SPC unfreeze will
+        * reset the hardware head.
+        *
+        * Process all retired requests first.  The progress routine will
+        * use the latest physical hardware head - we are not running so
+        * speed does not matter.
+        */
+       sdma_make_progress(sde, 0);
+
+       /* abort whatever is still left in the ring and on the flush list */
+       sdma_flush(sde);
+
+       /*
+        * Reset our notion of head and tail.
+        * Note that the HW registers have been reset via an earlier
+        * clean up.
+        */
+       sde->descq_head = 0;
+       sde->descq_tail = 0;
+       sde->desc_avail = sdma_descq_freecnt(sde);
+       *sde->head_dma = 0;
+
+       __sdma_process_event(sde, sdma_event_e40_sw_cleaned);
+
+       write_sequnlock(&sde->head_lock);
+       spin_unlock_irqrestore(&sde->tail_lock, irq_flags);
+}
+
+/*
+ * Drop the state machine's reference on the engine state and release
+ * anyone waiting for unfreeze events: sdma_unfreeze_count is forced to -1
+ * and sdma_unfreeze_wq is woken.
+ */
+static void sdma_sw_tear_down(struct sdma_engine *sde)
+{
+       struct hfi1_devdata *dd = sde->dd;
+
+       /* Releasing this reference means the state machine has stopped. */
+       sdma_put(&sde->state);
+
+       /* stop waiting for all unfreeze events to complete */
+       atomic_set(&dd->sdma_unfreeze_count, -1);
+       wake_up_interruptible(&dd->sdma_unfreeze_wq);
+}
+
+/* Schedule the engine's hardware clean up tasklet at high priority. */
+static void sdma_start_hw_clean_up(struct sdma_engine *sde)
+{
+       struct tasklet_struct *task = &sde->sdma_hw_clean_up_task;
+
+       tasklet_hi_schedule(task);
+}
+
+/*
+ * Move the engine's state machine to @next_state.
+ *
+ * Records the previous state and op for debugging, flushes pending
+ * requests when the engine enters s99_running from any other state,
+ * updates go_s99_running as directed by sdma_action_table[] and finally
+ * hands the sendctrl operation mask for the new state to sdma_sendctrl().
+ */
+static void sdma_set_state(struct sdma_engine *sde,
+                          enum sdma_states next_state)
+{
+       const struct sdma_set_state_action *act =
+               &sdma_action_table[next_state];
+       struct sdma_state *ss = &sde->state;
+       unsigned op;
+
+       trace_hfi1_sdma_state(sde, sdma_state_names[ss->current_state],
+                             sdma_state_names[next_state]);
+
+       /* debugging bookkeeping */
+       ss->previous_state = ss->current_state;
+       ss->previous_op = ss->current_op;
+       ss->current_state = next_state;
+
+       /* flush before the engine is (re)enabled */
+       if (next_state == sdma_state_s99_running &&
+           ss->previous_state != sdma_state_s99_running)
+               sdma_flush(sde);
+
+       /* translate the table entry into a sendctrl operation mask */
+       op = (act->op_enable ? SDMA_SENDCTRL_OP_ENABLE : 0) |
+            (act->op_intenable ? SDMA_SENDCTRL_OP_INTENABLE : 0) |
+            (act->op_halt ? SDMA_SENDCTRL_OP_HALT : 0) |
+            (act->op_cleanup ? SDMA_SENDCTRL_OP_CLEANUP : 0);
+
+       if (act->go_s99_running_tofalse)
+               ss->go_s99_running = 0;
+       if (act->go_s99_running_totrue)
+               ss->go_s99_running = 1;
+
+       ss->current_op = op;
+       sdma_sendctrl(sde, ss->current_op);
+}
+
+/**
+ * sdma_get_descq_cnt() - called when device probed
+ *
+ * Return a validated descq count.
+ *
+ * The sdma_descq_cnt module parameter is accepted only when it is a power
+ * of 2 in the range 64..32768 (both inclusive); any other value, including
+ * 0, yields the default SDMA_DESCQ_CNT.
+ *
+ * The parameter is checked at its full width before it is narrowed to the
+ * u16 return type, so that an out-of-range value whose low 16 bits happen
+ * to look valid (e.g. 65536 + 64) is rejected rather than silently
+ * truncated.
+ *
+ * This is currently only used in the verbs initialization to build the tx
+ * list.
+ *
+ * This will probably be deleted in favor of a more scalable approach to
+ * alloc tx's.
+ *
+ */
+u16 sdma_get_descq_cnt(void)
+{
+       uint count = sdma_descq_cnt;
+
+       if (count < 64 || count > 32768)
+               return SDMA_DESCQ_CNT;
+       if (!is_power_of_2(count))
+               return SDMA_DESCQ_CNT;
+       /* 64..32768 always fits in a u16 */
+       return count;
+}
+
+/**
+ * sdma_select_engine_vl() - select sdma engine
+ * @dd: devdata
+ * @selector: a spreading factor
+ * @vl: this vl
+ *
+ * Looks the engine up in the current vl map using @vl and @selector, each
+ * masked with the corresponding map mask.  The mapping fields are
+ * protected by RCU.
+ *
+ * Engine 0 is returned when @vl is not below num_vls, when no map has been
+ * installed, or when the selected map slot is NULL.
+ *
+ * Return: the selected engine, never NULL.
+ */
+struct sdma_engine *sdma_select_engine_vl(
+       struct hfi1_devdata *dd,
+       u32 selector,
+       u8 vl)
+{
+       struct sdma_engine *rval = NULL;
+       struct sdma_vl_map *m;
+       struct sdma_map_elem *e;
+
+       /* NOTE An out of range vl should only happen if SC->VL changed
+        *      after the initial checks on the QP/AH.
+        *      The lookup is skipped and engine 0 is used below.
+        */
+       if (vl < num_vls) {
+               rcu_read_lock();
+               m = rcu_dereference(dd->sdma_map);
+               if (unlikely(!m)) {
+                       rcu_read_unlock();
+                       return &dd->per_sdma[0];
+               }
+               e = m->map[vl & m->mask];
+               rval = e->sde[selector & e->mask];
+               rcu_read_unlock();
+       }
+
+       if (!rval)
+               rval = &dd->per_sdma[0];
+       trace_hfi1_sdma_engine_select(dd, selector, vl, rval->this_idx);
+       return rval;
+}
+
+/**
+ * sdma_select_engine_sc() - select sdma engine
+ * @dd: devdata
+ * @selector: a spreading factor
+ * @sc5: the 5 bit sc
+ *
+ * Translates @sc5 to a vl with sc_to_vlt() and defers to
+ * sdma_select_engine_vl().
+ *
+ * Return: the engine chosen by sdma_select_engine_vl().
+ */
+struct sdma_engine *sdma_select_engine_sc(
+       struct hfi1_devdata *dd,
+       u32 selector,
+       u8 sc5)
+{
+       return sdma_select_engine_vl(dd, selector, sc_to_vlt(dd, sc5));
+}
+
+/*
+ * Free the indicated map struct.
+ *
+ * Only the first actual_vls per-vl elements are freed individually before
+ * the map itself.  A NULL map is accepted and ignored.
+ */
+static void sdma_map_free(struct sdma_vl_map *m)
+{
+       int vl;
+
+       if (!m)
+               return;
+
+       for (vl = 0; vl < m->actual_vls; vl++)
+               kfree(m->map[vl]);
+       kfree(m);
+}
+
+/*
+ * RCU callback: free the sdma_vl_map that embeds @list.
+ */
+static void sdma_map_rcu_callback(struct rcu_head *list)
+{
+       sdma_map_free(container_of(list, struct sdma_vl_map, list));
+}
+
+/**
+ * sdma_map_init - called when # vls change
+ * @dd: hfi1_devdata
+ * @port: port number (not referenced by this function)
+ * @num_vls: number of vls
+ * @vl_engines: per vl engine mapping (optional)
+ *
+ * This routine changes the mapping based on the number of vls.
+ *
+ * vl_engines is used to specify a non-uniform vl/engine loading. NULL
+ * implies auto computing the loading and giving each VLs a uniform
+ * distribution of engines per VL.
+ *
+ * The auto algorithm computes the sde_per_vl and the number of extra
+ * engines.  Any extra engines are added from the last VL on down.
+ *
+ * rcu locking is used here to control access to the mapping fields.
+ * The new map is published under sde_map_lock and any previous map is
+ * freed through call_rcu().
+ *
+ * If either the num_vls or num_sdma are non-power of 2, the array sizes
+ * in the struct sdma_vl_map and the struct sdma_map_elem are rounded
+ * up to the next highest power of 2 and the first entry is reused
+ * in a round robin fashion.
+ *
+ * If an error occurs the map change is not done and the mapping is
+ * not changed.
+ *
+ * NOTE(review): for the padded slots (i >= num_vls) the loop still reads
+ * vl_engines[i] when updating 'engine'; the result does not appear to be
+ * used for those slots, but the entry is not initialized by the auto
+ * algorithm - confirm this is harmless for caller supplied arrays.
+ *
+ * Return: 0 on success, or when the device does not have
+ * HFI1_HAS_SEND_DMA set (nothing is done in that case); -ENOMEM if an
+ * allocation fails.
+ *
+ */
+int sdma_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_engines)
+{
+       int i, j;
+       int extra, sde_per_vl;
+       int engine = 0;
+       u8 lvl_engines[OPA_MAX_VLS];
+       struct sdma_vl_map *oldmap, *newmap;
+
+       if (!(dd->flags & HFI1_HAS_SEND_DMA))
+               return 0;
+
+       if (!vl_engines) {
+               /* truncate divide */
+               sde_per_vl = dd->num_sdma / num_vls;
+               /* extras */
+               extra = dd->num_sdma % num_vls;
+               vl_engines = lvl_engines;
+               /* add extras from last vl down */
+               for (i = num_vls - 1; i >= 0; i--, extra--)
+                       vl_engines[i] = sde_per_vl + (extra > 0 ? 1 : 0);
+       }
+       /* build new map */
+       newmap = kzalloc(
+               sizeof(struct sdma_vl_map) +
+                       roundup_pow_of_two(num_vls) *
+                       sizeof(struct sdma_map_elem *),
+               GFP_KERNEL);
+       if (!newmap)
+               goto bail;
+       newmap->actual_vls = num_vls;
+       newmap->vls = roundup_pow_of_two(num_vls);
+       newmap->mask = (1 << ilog2(newmap->vls)) - 1;
+       /* initialize back-map */
+       for (i = 0; i < TXE_NUM_SDMA_ENGINES; i++)
+               newmap->engine_to_vl[i] = -1;
+       for (i = 0; i < newmap->vls; i++) {
+               /* save for wrap around */
+               int first_engine = engine;
+
+               if (i < newmap->actual_vls) {
+                       int sz = roundup_pow_of_two(vl_engines[i]);
+
+                       /* only allocate once */
+                       newmap->map[i] = kzalloc(
+                               sizeof(struct sdma_map_elem) +
+                                       sz * sizeof(struct sdma_engine *),
+                               GFP_KERNEL);
+                       if (!newmap->map[i])
+                               goto bail;
+                       newmap->map[i]->mask = (1 << ilog2(sz)) - 1;
+                       /* assign engines */
+                       for (j = 0; j < sz; j++) {
+                               newmap->map[i]->sde[j] =
+                                       &dd->per_sdma[engine];
+                               if (++engine >= first_engine + vl_engines[i])
+                                       /* wrap back to first engine */
+                                       engine = first_engine;
+                       }
+                       /* assign back-map */
+                       for (j = 0; j < vl_engines[i]; j++)
+                               newmap->engine_to_vl[first_engine + j] = i;
+               } else {
+                       /* just re-use entry without allocating */
+                       newmap->map[i] = newmap->map[i % num_vls];
+               }
+               engine = first_engine + vl_engines[i];
+       }
+       /* newmap in hand, save old map */
+       spin_lock_irq(&dd->sde_map_lock);
+       oldmap = rcu_dereference_protected(dd->sdma_map,
+                                          lockdep_is_held(&dd->sde_map_lock));
+
+       /* publish newmap */
+       rcu_assign_pointer(dd->sdma_map, newmap);
+
+       spin_unlock_irq(&dd->sde_map_lock);
+       /* success, free any old map after grace period */
+       if (oldmap)
+               call_rcu(&oldmap->list, sdma_map_rcu_callback);
+       return 0;
+bail:
+       /* free any partial allocation */
+       sdma_map_free(newmap);
+       return -ENOMEM;
+}
+
+/*
+ * sdma_clean() - clean up memory allocated by sdma_init()
+ * @dd: hfi1_devdata
+ * @num_engines: number of entries of dd->per_sdma to clean
+ *
+ * This routine can be called regardless of the success of sdma_init():
+ * resources that were never allocated are skipped and every pointer is
+ * cleared once the memory behind it has been released.
+ */
+static void sdma_clean(struct hfi1_devdata *dd, size_t num_engines)
+{
+       size_t i;
+       struct sdma_engine *sde;
+       struct sdma_vl_map *oldmap;
+
+       if (dd->sdma_pad_dma) {
+               /* size must match the pad allocation done in sdma_init() */
+               dma_free_coherent(&dd->pcidev->dev, sizeof(u32),
+                                 (void *)dd->sdma_pad_dma,
+                                 dd->sdma_pad_phys);
+               dd->sdma_pad_dma = NULL;
+               dd->sdma_pad_phys = 0;
+       }
+       if (dd->sdma_heads_dma) {
+               dma_free_coherent(&dd->pcidev->dev, dd->sdma_heads_size,
+                                 (void *)dd->sdma_heads_dma,
+                                 dd->sdma_heads_phys);
+               dd->sdma_heads_dma = NULL;
+               dd->sdma_heads_phys = 0;
+       }
+       for (i = 0; dd->per_sdma && i < num_engines; ++i) {
+               sde = &dd->per_sdma[i];
+
+               /* head_dma pointed into the sdma_heads_dma area freed above */
+               sde->head_dma = NULL;
+               sde->head_phys = 0;
+
+               if (sde->descq) {
+                       dma_free_coherent(
+                               &dd->pcidev->dev,
+                               sde->descq_cnt * sizeof(u64[2]),
+                               sde->descq,
+                               sde->descq_phys
+                       );
+                       sde->descq = NULL;
+                       sde->descq_phys = 0;
+               }
+               /* tx_ring came from kcalloc() or vzalloc() */
+               kvfree(sde->tx_ring);
+               sde->tx_ring = NULL;
+       }
+       /*
+        * Unpublish the map first and free it only after a grace period,
+        * so that a reader still inside an RCU read-side critical section
+        * never dereferences freed memory.
+        */
+       spin_lock_irq(&dd->sde_map_lock);
+       oldmap = rcu_dereference_protected(dd->sdma_map,
+                                          lockdep_is_held(&dd->sde_map_lock));
+       RCU_INIT_POINTER(dd->sdma_map, NULL);
+       spin_unlock_irq(&dd->sde_map_lock);
+       synchronize_rcu();
+       sdma_map_free(oldmap);
+       kfree(dd->per_sdma);
+       dd->per_sdma = NULL;
+}
+
+/**
+ * sdma_init() - called when device probed
+ * @dd: hfi1_devdata
+ * @port: port number (currently only zero)
+ *
+ * sdma_init initializes the specified number of engines.
+ *
+ * The engine count defaults to dd->chip_sdma_engines; mod_num_sdma
+ * replaces it only when it is non-zero, does not exceed what the chip
+ * supports and is at least num_vls.
+ *
+ * The code initializes each sde, its csrs.  Interrupts
+ * are not required to be enabled.
+ *
+ * If the SDMA capability is not set, SDMA_AHG is cleared as well and
+ * nothing is allocated.  On any failure after the engine array has been
+ * allocated, everything allocated so far is released via sdma_clean().
+ *
+ * Returns:
+ * 0 - success (including the case where SDMA is not enabled),
+ * -ENOMEM on failure
+ */
+int sdma_init(struct hfi1_devdata *dd, u8 port)
+{
+       unsigned this_idx;
+       struct sdma_engine *sde;
+       u16 descq_cnt;
+       void *curr_head;
+       struct hfi1_pportdata *ppd = dd->pport + port;
+       u32 per_sdma_credits;
+       uint idle_cnt = sdma_idle_cnt;
+       size_t num_engines = dd->chip_sdma_engines;
+
+       if (!HFI1_CAP_IS_KSET(SDMA)) {
+               HFI1_CAP_CLEAR(SDMA_AHG);
+               return 0;
+       }
+       if (mod_num_sdma &&
+           /* can't exceed chip support */
+           mod_num_sdma <= dd->chip_sdma_engines &&
+           /* count must be >= vls */
+           mod_num_sdma >= num_vls)
+               num_engines = mod_num_sdma;
+
+       dd_dev_info(dd, "SDMA mod_num_sdma: %u\n", mod_num_sdma);
+       dd_dev_info(dd, "SDMA chip_sdma_engines: %u\n", dd->chip_sdma_engines);
+       dd_dev_info(dd, "SDMA chip_sdma_mem_size: %u\n",
+                   dd->chip_sdma_mem_size);
+
+       per_sdma_credits =
+               dd->chip_sdma_mem_size / (num_engines * SDMA_BLOCK_SIZE);
+
+       /* set up freeze waitqueue */
+       init_waitqueue_head(&dd->sdma_unfreeze_wq);
+       atomic_set(&dd->sdma_unfreeze_count, 0);
+
+       descq_cnt = sdma_get_descq_cnt();
+       dd_dev_info(dd, "SDMA engines %zu descq_cnt %u\n",
+                   num_engines, descq_cnt);
+
+       /* alloc memory for array of send engines */
+       dd->per_sdma = kcalloc(num_engines, sizeof(*dd->per_sdma), GFP_KERNEL);
+       if (!dd->per_sdma)
+               return -ENOMEM;
+
+       idle_cnt = ns_to_cclock(dd, idle_cnt);
+       if (!sdma_desct_intr)
+               sdma_desct_intr = SDMA_DESC_INTR;
+
+       /*
+        * Initialize the software state of every engine and allocate
+        * memory for its SendDMA descriptor FIFO and its tx ring
+        */
+       for (this_idx = 0; this_idx < num_engines; ++this_idx) {
+               sde = &dd->per_sdma[this_idx];
+               sde->dd = dd;
+               sde->ppd = ppd;
+               sde->this_idx = this_idx;
+               sde->descq_cnt = descq_cnt;
+               sde->desc_avail = sdma_descq_freecnt(sde);
+               sde->sdma_shift = ilog2(descq_cnt);
+               sde->sdma_mask = (1 << sde->sdma_shift) - 1;
+
+               /*
+                * Create a mask specifically for each interrupt source;
+                * each source occupies its own group of
+                * TXE_NUM_SDMA_ENGINES bits, indexed by engine
+                */
+               sde->int_mask = (u64)1 << (0 * TXE_NUM_SDMA_ENGINES +
+                                          this_idx);
+               sde->progress_mask = (u64)1 << (1 * TXE_NUM_SDMA_ENGINES +
+                                               this_idx);
+               sde->idle_mask = (u64)1 << (2 * TXE_NUM_SDMA_ENGINES +
+                                           this_idx);
+               /* Create a combined mask to cover all 3 interrupt sources */
+               sde->imask = sde->int_mask | sde->progress_mask |
+                            sde->idle_mask;
+
+               spin_lock_init(&sde->tail_lock);
+               seqlock_init(&sde->head_lock);
+               spin_lock_init(&sde->senddmactrl_lock);
+               spin_lock_init(&sde->flushlist_lock);
+               /* ensure there is always a zero bit */
+               sde->ahg_bits = 0xfffffffe00000000ULL;
+
+               sdma_set_state(sde, sdma_state_s00_hw_down);
+
+               /* set up reference counting */
+               kref_init(&sde->state.kref);
+               init_completion(&sde->state.comp);
+
+               INIT_LIST_HEAD(&sde->flushlist);
+               INIT_LIST_HEAD(&sde->dmawait);
+
+               sde->tail_csr =
+                       get_kctxt_csr_addr(dd, this_idx, SD(TAIL));
+
+               if (idle_cnt) /* same value for every engine */
+                       dd->default_desc1 =
+                               SDMA_DESC1_HEAD_TO_HOST_FLAG;
+               else
+                       dd->default_desc1 =
+                               SDMA_DESC1_INT_REQ_FLAG;
+
+               tasklet_init(&sde->sdma_hw_clean_up_task, sdma_hw_clean_up_task,
+                            (unsigned long)sde);
+
+               tasklet_init(&sde->sdma_sw_clean_up_task, sdma_sw_clean_up_task,
+                            (unsigned long)sde);
+               INIT_WORK(&sde->err_halt_worker, sdma_err_halt_wait);
+               INIT_WORK(&sde->flush_worker, sdma_field_flush);
+
+               sde->progress_check_head = 0;
+
+               setup_timer(&sde->err_progress_check_timer,
+                           sdma_err_progress_check, (unsigned long)sde);
+
+               sde->descq = dma_zalloc_coherent(
+                       &dd->pcidev->dev,
+                       descq_cnt * sizeof(u64[2]),
+                       &sde->descq_phys,
+                       GFP_KERNEL
+               );
+               if (!sde->descq)
+                       goto bail;
+               sde->tx_ring =
+                       kcalloc(descq_cnt, sizeof(struct sdma_txreq *),
+                               GFP_KERNEL);
+               if (!sde->tx_ring) /* kcalloc failed, fall back to vzalloc */
+                       sde->tx_ring =
+                               vzalloc(
+                                       sizeof(struct sdma_txreq *) *
+                                       descq_cnt);
+               if (!sde->tx_ring)
+                       goto bail;
+       }
+
+       dd->sdma_heads_size = L1_CACHE_BYTES * num_engines;
+       /* Allocate memory for DMA of head registers to memory */
+       dd->sdma_heads_dma = dma_zalloc_coherent(
+               &dd->pcidev->dev,
+               dd->sdma_heads_size,
+               &dd->sdma_heads_phys,
+               GFP_KERNEL
+       );
+       if (!dd->sdma_heads_dma) {
+               dd_dev_err(dd, "failed to allocate SendDMA head memory\n");
+               goto bail;
+       }
+
+       /* Allocate memory for pad */
+       dd->sdma_pad_dma = dma_zalloc_coherent(
+               &dd->pcidev->dev,
+               sizeof(u32),
+               &dd->sdma_pad_phys,
+               GFP_KERNEL
+       );
+       if (!dd->sdma_pad_dma) {
+               dd_dev_err(dd, "failed to allocate SendDMA pad memory\n");
+               goto bail;
+       }
+
+       /* assign each engine to different cacheline and init registers */
+       curr_head = (void *)dd->sdma_heads_dma;
+       for (this_idx = 0; this_idx < num_engines; ++this_idx) {
+               unsigned long phys_offset;
+
+               sde = &dd->per_sdma[this_idx];
+
+               sde->head_dma = curr_head;
+               curr_head += L1_CACHE_BYTES;
+               phys_offset = (unsigned long)sde->head_dma -
+                             (unsigned long)dd->sdma_heads_dma;
+               sde->head_phys = dd->sdma_heads_phys + phys_offset;
+               init_sdma_regs(sde, per_sdma_credits, idle_cnt);
+       }
+       dd->flags |= HFI1_HAS_SEND_DMA;
+       dd->flags |= idle_cnt ? HFI1_HAS_SDMA_TIMEOUT : 0;
+       dd->num_sdma = num_engines;
+       if (sdma_map_init(dd, port, ppd->vls_operational, NULL))
+               goto bail;
+       dd_dev_info(dd, "SDMA num_sdma: %u\n", dd->num_sdma);
+       return 0;
+
+bail:
+       sdma_clean(dd, num_engines); /* copes with partial allocation */
+       return -ENOMEM;
+}
+
+/**
+ * sdma_all_running() - called when the link goes up
+ * @dd: hfi1_devdata
+ *
+ * Delivers the go-running event to each of the dd->num_sdma engines.
+ */
+void sdma_all_running(struct hfi1_devdata *dd)
+{
+       unsigned int idx = 0;
+
+       /* hand every engine the "go running" event, in index order */
+       while (idx < dd->num_sdma) {
+               sdma_process_event(&dd->per_sdma[idx],
+                                  sdma_event_e30_go_running);
+               idx++;
+       }
+}
+
+/**
+ * sdma_all_idle() - called when the link goes down
+ * @dd: hfi1_devdata
+ *
+ * Delivers the go-idle event to each of the dd->num_sdma engines.
+ */
+void sdma_all_idle(struct hfi1_devdata *dd)
+{
+       unsigned int idx = 0;
+
+       /* hand every engine the "go idle" event, in index order */
+       while (idx < dd->num_sdma) {
+               sdma_process_event(&dd->per_sdma[idx],
+                                  sdma_event_e70_go_idle);
+               idx++;
+       }
+}
+
+/**
+ * sdma_start() - called to kick off state processing for all engines
+ * @dd: hfi1_devdata
+ *
+ * Delivers the go-hw-start event to each of the dd->num_sdma engines,
+ * which kicks off their state processing.  Interrupts need to be
+ * working at this point.
+ */
+void sdma_start(struct hfi1_devdata *dd)
+{
+       unsigned int idx = 0;
+
+       /* hand every engine the "hw start" event, in index order */
+       while (idx < dd->num_sdma) {
+               sdma_process_event(&dd->per_sdma[idx],
+                                  sdma_event_e10_go_hw_start);
+               idx++;
+       }
+}
+
+/**
+ * sdma_exit() - used when module is removed
+ * @dd: hfi1_devdata
+ *
+ * Takes every engine down and then releases the SDMA memory through
+ * sdma_clean().  The per-engine pass is skipped when dd->per_sdma was
+ * never allocated.
+ */
+void sdma_exit(struct hfi1_devdata *dd)
+{
+       unsigned int n;
+
+       for (n = 0; dd->per_sdma && n < dd->num_sdma; n++) {
+               struct sdma_engine *engine = &dd->per_sdma[n];
+
+               /* waiters still queued at this point are only reported */
+               if (!list_empty(&engine->dmawait))
+                       dd_dev_err(dd, "sde %u: dmawait list not empty!\n",
+                                  engine->this_idx);
+
+               sdma_process_event(engine, sdma_event_e00_go_hw_down);
+               del_timer_sync(&engine->err_progress_check_timer);
+
+               /*
+                * The final put waits for the state machine to exit, so
+                * the sdma_sw_clean_up_task does not have to be killed to
+                * be sure it is no longer running.
+                */
+               sdma_finalput(&engine->state);
+       }
+
+       sdma_clean(dd, dd->num_sdma);
+}
+
+/*
+ * Undo the DMA mapping recorded in the indicated descriptor.
+ *
+ * Only the SDMA_MAP_SINGLE and SDMA_MAP_PAGE mapping types are acted
+ * upon; any other type leaves the descriptor untouched.
+ */
+static inline void sdma_unmap_desc(
+       struct hfi1_devdata *dd,
+       struct sdma_desc *descp)
+{
+       switch (sdma_mapping_type(descp)) {
+       case SDMA_MAP_PAGE: {
+               dma_addr_t addr = sdma_mapping_addr(descp);
+               size_t len = sdma_mapping_len(descp);
+
+               dma_unmap_page(&dd->pcidev->dev, addr, len, DMA_TO_DEVICE);
+               break;
+       }
+       case SDMA_MAP_SINGLE: {
+               dma_addr_t addr = sdma_mapping_addr(descp);
+               size_t len = sdma_mapping_len(descp);
+
+               dma_unmap_single(&dd->pcidev->dev, addr, len, DMA_TO_DEVICE);
+               break;
+       }
+       }
+}
+
+/*
+ * Extract the header mode field from the second quadword of the
+ * first descriptor in the tx.
+ */
+static inline u8 ahg_mode(struct sdma_txreq *tx)
+{
+       const struct sdma_desc *first = &tx->descp[0];
+
+       return (first->qw[1] & SDMA_DESC1_HEADER_MODE_SMASK)
+               >> SDMA_DESC1_HEADER_MODE_SHIFT;
+}
+
+/**
+ * sdma_txclean() - clean tx of mappings, descp *kmalloc's
+ * @dd: hfi1_devdata for unmapping
+ * @tx: tx request to clean
+ *
+ * This is used in the progress routine to clean the tx or
+ * by the ULP to toss an in-process tx build.
+ *
+ * num_desc, coalesce_buf and desc_limit are reset as their resources
+ * are released, so the code can be called multiple times without issue.
+ */
+void sdma_txclean(
+       struct hfi1_devdata *dd,
+       struct sdma_txreq *tx)
+{
+       if (tx->num_desc) {
+               u8 mode = ahg_mode(tx);
+               u8 skip = 0;
+               u16 idx;
+
+               /* the first descriptor is always unmapped */
+               sdma_unmap_desc(dd, &tx->descp[0]);
+
+               /* modes above APPLY_UPDATE1 carry AHG descriptors to skip */
+               if (mode > SDMA_AHG_APPLY_UPDATE1)
+                       skip = mode >> 1;
+
+               idx = 1 + skip;
+               while (idx < tx->num_desc) {
+                       sdma_unmap_desc(dd, &tx->descp[idx]);
+                       idx++;
+               }
+               tx->num_desc = 0;
+       }
+
+       kfree(tx->coalesce_buf);
+       tx->coalesce_buf = NULL;
+
+       /* a limit above the embedded array means descp was kmalloc'ed */
+       if (unlikely(tx->desc_limit > ARRAY_SIZE(tx->descs))) {
+               tx->desc_limit = ARRAY_SIZE(tx->descs);
+               kfree(tx->descp);
+       }
+}
+
+/*
+ * sdma_gethead() - fetch the hardware's descriptor queue head
+ * @sde: sdma engine
+ *
+ * The head is taken from the copy the hardware DMAs to host memory
+ * (*sde->head_dma) when USE_SDMA_HEAD is set, the engine is running and
+ * the device has HFI1_HAS_SDMA_TIMEOUT; otherwise it is read from the
+ * SD(HEAD) CSR.
+ *
+ * With SDMA_HEAD_CHECK set the value is validated against the software
+ * head and tail.  A bad value obtained from host memory is retried once
+ * from the CSR; a bad CSR value is replaced by the software head, i.e.
+ * the caller sees no progress.
+ *
+ * Return: the head index to use
+ */
+static inline u16 sdma_gethead(struct sdma_engine *sde)
+{
+       struct hfi1_devdata *dd = sde->dd;
+       int use_dmahead;
+       u16 hwhead;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+                  sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
+#endif
+
+       /*
+        * Decide the source once, before the retry label: the retry path
+        * below clears use_dmahead to force a CSR read, and recomputing it
+        * after the jump would discard that and could retry forever.
+        */
+       use_dmahead = HFI1_CAP_IS_KSET(USE_SDMA_HEAD) && __sdma_running(sde) &&
+                                       (dd->flags & HFI1_HAS_SDMA_TIMEOUT);
+retry:
+       hwhead = use_dmahead ?
+               (u16)le64_to_cpu(*sde->head_dma) :
+               (u16)read_sde_csr(sde, SD(HEAD));
+
+       if (unlikely(HFI1_CAP_IS_KSET(SDMA_HEAD_CHECK))) {
+               u16 cnt;
+               u16 swtail;
+               u16 swhead;
+               int sane;
+
+               swhead = sde->descq_head & sde->sdma_mask;
+               /* this code is really bad for cache line trading */
+               swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
+               cnt = sde->descq_cnt;
+
+               if (swhead < swtail)
+                       /* not wrapped */
+                       sane = (hwhead >= swhead) && (hwhead <= swtail);
+               else if (swhead > swtail)
+                       /* wrapped around */
+                       sane = ((hwhead >= swhead) && (hwhead < cnt)) ||
+                               (hwhead <= swtail);
+               else
+                       /* empty */
+                       sane = (hwhead == swhead);
+
+               if (unlikely(!sane)) {
+                       dd_dev_err(dd, "SDMA(%u) bad head (%s) hwhd=%hu swhd=%hu swtl=%hu cnt=%hu\n",
+                                  sde->this_idx,
+                                  use_dmahead ? "dma" : "kreg",
+                                  hwhead, swhead, swtail, cnt);
+                       if (use_dmahead) {
+                               /* try one more time, using csr */
+                               use_dmahead = 0;
+                               goto retry;
+                       }
+                       /* proceed as if no progress */
+                       hwhead = swhead;
+               }
+       }
+       return hwhead;
+}
+
+/*
+ * This is called when there are send DMA descriptors that might be
+ * available.
+ *
+ * Waiters are taken off sde->dmawait, in list order, while holding the
+ * write side of the device's iowait_lock.  A waiter is harvested only if
+ * it has a wakeup routine and the descriptor count of the first tx on
+ * its tx_head list (zero if that list is empty) still fits in @avail.
+ * Harvesting stops at the first waiter that does not fit or once
+ * SDMA_WAIT_BATCH_SIZE waiters have been collected.  The wakeup routines
+ * are invoked after iowait_lock has been dropped.
+ *
+ * This is called with head_lock held.
+ */
+static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail)
+{
+       struct iowait *wait, *nw;
+       struct iowait *waits[SDMA_WAIT_BATCH_SIZE];
+       unsigned i, n = 0, seq;
+       struct sdma_txreq *stx;
+       struct hfi1_ibdev *dev = &sde->dd->verbs_dev;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
+                  slashstrip(__FILE__), __LINE__, __func__);
+       dd_dev_err(sde->dd, "avail: %u\n", avail);
+#endif
+
+       do {
+               seq = read_seqbegin(&dev->iowait_lock);
+               if (!list_empty(&sde->dmawait)) {
+                       /* at least one item, switch to the write side */
+                       write_seqlock(&dev->iowait_lock);
+                       /* Harvest waiters wanting DMA descriptors */
+                       list_for_each_entry_safe(
+                                       wait,
+                                       nw,
+                                       &sde->dmawait,
+                                       list) {
+                               u16 num_desc = 0;
+
+                               if (!wait->wakeup) /* nothing to call, skip */
+                                       continue;
+                               if (n == ARRAY_SIZE(waits)) /* batch is full */
+                                       break;
+                               if (!list_empty(&wait->tx_head)) {
+                                       stx = list_first_entry(
+                                               &wait->tx_head,
+                                               struct sdma_txreq,
+                                               list);
+                                       num_desc = stx->num_desc;
+                               }
+                               if (num_desc > avail) /* does not fit, stop */
+                                       break;
+                               avail -= num_desc;
+                               list_del_init(&wait->list);
+                               waits[n++] = wait;
+                       }
+                       write_sequnlock(&dev->iowait_lock);
+                       break;
+               }
+       } while (read_seqretry(&dev->iowait_lock, seq));
+
+       for (i = 0; i < n; i++) /* wake the harvested waiters, lock dropped */
+               waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON);
+}
+
+/*
+ * Advance the software head up to the hardware head, completing every
+ * tx whose descriptors have all been passed, then wake waiters if any
+ * descriptor was retired.  @status is the interrupt status for this
+ * engine; it is saved in sde->last_status.
+ *
+ * head_lock must be held
+ */
+static void sdma_make_progress(struct sdma_engine *sde, u64 status)
+{
+       struct sdma_txreq *txp = NULL;
+       int progress = 0;
+       u16 hwhead, swhead;
+       int idle_check_done = 0;
+
+       hwhead = sdma_gethead(sde);
+
+       /* The reason for some of the complexity of this code is that
+        * not all descriptors have corresponding txps.  So, we have to
+        * be able to skip over descs until we wander into the range of
+        * the next txp on the list.
+        */
+
+retry:
+       txp = get_txhead(sde);
+       swhead = sde->descq_head & sde->sdma_mask;
+       trace_hfi1_sdma_progress(sde, hwhead, swhead, txp);
+       while (swhead != hwhead) {
+               /* advance head, wrap if needed */
+               swhead = ++sde->descq_head & sde->sdma_mask;
+
+               /* if now past this txp's descs, do the callback */
+               if (txp && txp->next_descq_idx == swhead) {
+                       /* remove from list */
+                       sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
+                       complete_tx(sde, txp, SDMA_TXREQ_S_OK);
+                       /* see if there is another txp */
+                       txp = get_txhead(sde);
+               }
+               trace_hfi1_sdma_progress(sde, hwhead, swhead, txp);
+               progress++; /* counts retired descriptors, not txps */
+       }
+
+       /*
+        * The SDMA idle interrupt is not guaranteed to be ordered with respect
+        * to updates to the dma_head location in host memory. The head
+        * value read might not be fully up to date. If there are pending
+        * descriptors and the SDMA idle interrupt fired then read from the
+        * CSR SDMA head instead to get the latest value from the hardware.
+        * The hardware SDMA head should be read at most once in this invocation
+        * of sdma_make_progress(..) which is ensured by idle_check_done flag
+        */
+       if ((status & sde->idle_mask) && !idle_check_done) {
+               u16 swtail;
+
+               swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
+               if (swtail != hwhead) { /* descriptors still pending */
+                       hwhead = (u16)read_sde_csr(sde, SD(HEAD));
+                       idle_check_done = 1;
+                       goto retry;
+               }
+       }
+
+       sde->last_status = status;
+       if (progress)
+               sdma_desc_avail(sde, sdma_descq_freecnt(sde));
+}
+
+/*
+ * sdma_engine_interrupt() - interrupt handler for engine
+ * @sde: sdma engine
+ * @status: sdma interrupt reason
+ *
+ * Status is a mask of the 3 possible interrupts for this engine.  It will
+ * contain bits _only_ for this SDMA engine.  It will contain at least one
+ * bit, it may contain more.
+ *
+ * Only one interrupt counter is bumped per call, chosen in the order
+ * idle, progress, sdma.  The whole handler runs under the head_lock
+ * write seqlock.
+ */
+void sdma_engine_interrupt(struct sdma_engine *sde, u64 status)
+{
+       trace_hfi1_sdma_engine_interrupt(sde, status);
+
+       write_seqlock(&sde->head_lock);
+
+       sdma_set_desc_cnt(sde, sdma_desct_intr);
+
+       if (status & sde->idle_mask) {
+               sde->idle_int_cnt++;
+       } else if (status & sde->progress_mask) {
+               sde->progress_int_cnt++;
+       } else if (status & sde->int_mask) {
+               sde->sdma_int_cnt++;
+       }
+
+       sdma_make_progress(sde, status);
+
+       write_sequnlock(&sde->head_lock);
+}
+
+/**
+ * sdma_engine_error() - error handler for engine
+ * @sde: sdma engine
+ * @status: sdma interrupt reason
+ *
+ * Runs with tail_lock held (interrupts saved and disabled) and, nested
+ * inside it, the head_lock write seqlock.  Any bit in
+ * ALL_SDMA_ENG_HALT_ERRS feeds the hw-halted event to the state machine.
+ * Any bit other than the HALT_ERR status bit causes the error and the
+ * descriptor queue state to be logged.
+ */
+void sdma_engine_error(struct sdma_engine *sde, u64 status)
+{
+       unsigned long flags;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(sde->dd, "CONFIG SDMA(%u) error status 0x%llx state %s\n",
+                  sde->this_idx,
+                  (unsigned long long)status,
+                  sdma_state_names[sde->state.current_state]);
+#endif
+       spin_lock_irqsave(&sde->tail_lock, flags);
+       write_seqlock(&sde->head_lock);
+       if (status & ALL_SDMA_ENG_HALT_ERRS)
+               __sdma_process_event(sde, sdma_event_e60_hw_halted);
+       if (status & ~SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK)) {
+               dd_dev_err(sde->dd,
+                          "SDMA (%u) engine error: 0x%llx state %s\n",
+                          sde->this_idx,
+                          (unsigned long long)status,
+                          sdma_state_names[sde->state.current_state]);
+               dump_sdma_state(sde);
+       }
+       write_sequnlock(&sde->head_lock); /* release in reverse order */
+       spin_unlock_irqrestore(&sde->tail_lock, flags);
+}
+
+/*
+ * Update the engine's SD(CTRL) register from the operation bits in @op.
+ *
+ * The enable, interrupt enable and halt bits are each set when the
+ * matching SDMA_SENDCTRL_OP_* bit is present in @op and cleared when it
+ * is absent; the result is kept in sde->p_senddmactrl.  The cleanup bit
+ * is only OR'ed into the value written to the CSR and is never stored in
+ * the shadow.  The shadow update and the CSR write are done under
+ * senddmactrl_lock.
+ */
+static void sdma_sendctrl(struct sdma_engine *sde, unsigned op)
+{
+       const u64 enable_bit = SD(CTRL_SDMA_ENABLE_SMASK);
+       const u64 int_enable_bit = SD(CTRL_SDMA_INT_ENABLE_SMASK);
+       const u64 halt_bit = SD(CTRL_SDMA_HALT_SMASK);
+       u64 to_set = 0;
+       u64 to_clear = 0;
+       u64 ctrl;
+       unsigned long flags;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(sde->dd, "CONFIG SDMA(%u) senddmactrl E=%d I=%d H=%d C=%d\n",
+                  sde->this_idx,
+                  (op & SDMA_SENDCTRL_OP_ENABLE) ? 1 : 0,
+                  (op & SDMA_SENDCTRL_OP_INTENABLE) ? 1 : 0,
+                  (op & SDMA_SENDCTRL_OP_HALT) ? 1 : 0,
+                  (op & SDMA_SENDCTRL_OP_CLEANUP) ? 1 : 0);
+#endif
+
+       /* sort each control bit into the set or the clear mask */
+       if (op & SDMA_SENDCTRL_OP_ENABLE)
+               to_set |= enable_bit;
+       else
+               to_clear |= enable_bit;
+
+       if (op & SDMA_SENDCTRL_OP_INTENABLE)
+               to_set |= int_enable_bit;
+       else
+               to_clear |= int_enable_bit;
+
+       if (op & SDMA_SENDCTRL_OP_HALT)
+               to_set |= halt_bit;
+       else
+               to_clear |= halt_bit;
+
+       spin_lock_irqsave(&sde->senddmactrl_lock, flags);
+
+       sde->p_senddmactrl = (sde->p_senddmactrl | to_set) & ~to_clear;
+
+       /* cleanup is a one-shot request, keep it out of the shadow copy */
+       ctrl = sde->p_senddmactrl;
+       if (op & SDMA_SENDCTRL_OP_CLEANUP)
+               ctrl |= SD(CTRL_SDMA_CLEANUP_SMASK);
+       write_sde_csr(sde, SD(CTRL), ctrl);
+
+       spin_unlock_irqrestore(&sde->senddmactrl_lock, flags);
+
+#ifdef CONFIG_SDMA_VERBOSITY
+       sdma_dumpstate(sde);
+#endif
+}
+
+/*
+ * Program the engine's SD(LEN_GEN) register: the length field is the
+ * descriptor queue size in units of 64 entries.
+ */
+static void sdma_setlengen(struct sdma_engine *sde)
+{
+       u64 len_field = (sde->descq_cnt / 64) << SD(LEN_GEN_LENGTH_SHIFT);
+
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+                  sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
+#endif
+
+       /*
+        * Two writes are needed: the first sets SendDmaLenGen with the MSB
+        * of the generation count clear, the second sets that MSB, which
+        * enables generation checking and loads the internal generation
+        * counter.
+        */
+       write_sde_csr(sde, SD(LEN_GEN), len_field);
+       write_sde_csr(sde, SD(LEN_GEN),
+                     len_field | (4ULL << SD(LEN_GEN_GENERATION_SHIFT)));
+}
+
+static inline void sdma_update_tail(struct sdma_engine *sde, u16 tail)
+{
+       /*
+        * Commit writes to memory and advance the tail on the chip.
+        * The write barrier is issued before @tail is written to the
+        * engine's tail CSR (sde->tail_csr), so stores made before this
+        * call are ordered ahead of the tail update.
+        */
+       smp_wmb(); /* see get_txhead() */
+       writeq(tail, sde->tail_csr);
+}
+
+/*
+ * This is called when changing to state s10_hw_start_up_halt_wait as
+ * a result of send buffer errors or send DMA descriptor errors.
+ *
+ * It reprograms the length/generation register, resets the hardware
+ * tail and the host memory copy of the head to zero, and then writes
+ * the header request FIFO uncorrectable error field to the engine's
+ * ENG_ERR_CLEAR register.
+ */
+static void sdma_hw_start_up(struct sdma_engine *sde)
+{
+       u64 reg;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+                  sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
+#endif
+
+       sdma_setlengen(sde);
+       sdma_update_tail(sde, 0); /* Set SendDmaTail */
+       *sde->head_dma = 0; /* head copy kept in host memory */
+
+       reg = SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_MASK) <<
+             SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SHIFT);
+       write_sde_csr(sde, SD(ENG_ERR_CLEAR), reg);
+}
+
+#define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
+(r &= ~SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
+
+#define SET_STATIC_RATE_CONTROL_SMASK(r) \
+(r |= SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
+/*
+ * set_sdma_integrity
+ *
+ * Program the SEND_DMA_CHECK_ENABLE register of send DMA engine 'sde'.
+ *
+ * Nothing is written when the NO_INTEGRITY capability is set.  Otherwise
+ * the base integrity value is used with the "disallow PBC static rate
+ * control" bit cleared when STATIC_RATE_CTRL is set, and set when it is
+ * not.
+ */
+static void set_sdma_integrity(struct sdma_engine *sde)
+{
+       u64 check_enable;
+
+       if (unlikely(HFI1_CAP_IS_KSET(NO_INTEGRITY)))
+               return;
+
+       check_enable = hfi1_pkt_base_sdma_integrity(sde->dd);
+
+       if (!HFI1_CAP_IS_KSET(STATIC_RATE_CTRL))
+               SET_STATIC_RATE_CONTROL_SMASK(check_enable);
+       else
+               CLEAR_STATIC_RATE_CONTROL_SMASK(check_enable);
+
+       write_sde_csr(sde, SD(CHECK_ENABLE), check_enable);
+}
+
+/*
+ * Write the initial CSR values for one engine.
+ *
+ * @sde: sdma engine
+ * @credits: value for the engine's memory count field; the memory index
+ *           field is set to credits * sde->this_idx
+ * @idle_cnt: value written to the engine's RELOAD_CNT register
+ */
+static void init_sdma_regs(
+       struct sdma_engine *sde,
+       u32 credits,
+       uint idle_cnt)
+{
+       u8 opmask = OPCODE_CHECK_MASK_DISABLED;
+       u8 opval = OPCODE_CHECK_VAL_DISABLED;
+       u64 mem_cfg;
+#ifdef CONFIG_SDMA_VERBOSITY
+       struct hfi1_devdata *dd = sde->dd;
+
+       dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+                  sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
+#endif
+
+       mem_cfg = ((u64)credits << SD(MEMORY_SDMA_MEMORY_CNT_SHIFT)) |
+                 ((u64)(credits * sde->this_idx) <<
+                  SD(MEMORY_SDMA_MEMORY_INDEX_SHIFT));
+
+       /* descriptor queue location, size and starting tail */
+       write_sde_csr(sde, SD(BASE_ADDR), sde->descq_phys);
+       sdma_setlengen(sde);
+       sdma_update_tail(sde, 0); /* Set SendDmaTail */
+
+       /* counters and the host address for the head copy */
+       write_sde_csr(sde, SD(RELOAD_CNT), idle_cnt);
+       write_sde_csr(sde, SD(DESC_CNT), 0);
+       write_sde_csr(sde, SD(HEAD_ADDR), sde->head_phys);
+
+       write_sde_csr(sde, SD(MEMORY), mem_cfg);
+       write_sde_csr(sde, SD(ENG_ERR_MASK), ~0ull);
+       set_sdma_integrity(sde);
+
+       /* opcode check register gets the "disabled" mask/value pair */
+       write_sde_csr(sde, SD(CHECK_OPCODE),
+                     (opmask << SEND_CTXT_CHECK_OPCODE_MASK_SHIFT) |
+                     (opval << SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT));
+}
+
+#ifdef CONFIG_SDMA_VERBOSITY /* CSR dump, built only for verbose debug */
+
+#define sdma_dumpstate_helper0(reg) do { \
+               csr = read_csr(sde->dd, reg); \
+               dd_dev_err(sde->dd, "%36s     0x%016llx\n", #reg, csr); \
+       } while (0)
+
+#define sdma_dumpstate_helper(reg) do { \
+               csr = read_sde_csr(sde, reg); \
+               dd_dev_err(sde->dd, "%36s[%02u] 0x%016llx\n", \
+                       #reg, sde->this_idx, csr); \
+       } while (0)
+
+#define sdma_dumpstate_helper2(reg) do { \
+               csr = read_csr(sde->dd, reg + (8 * i)); \
+               dd_dev_err(sde->dd, "%33s_%02u     0x%016llx\n", \
+                               #reg, i, csr); \
+       } while (0)
+
+void sdma_dumpstate(struct sdma_engine *sde)
+{
+       u64 csr; /* scratch, assigned inside the helper macros */
+       unsigned i; /* CSR index, used by sdma_dumpstate_helper2() */
+
+       sdma_dumpstate_helper(SD(CTRL)); /* read via read_sde_csr() */
+       sdma_dumpstate_helper(SD(STATUS));
+       sdma_dumpstate_helper0(SD(ERR_STATUS)); /* read via read_csr() */
+       sdma_dumpstate_helper0(SD(ERR_MASK));
+       sdma_dumpstate_helper(SD(ENG_ERR_STATUS));
+       sdma_dumpstate_helper(SD(ENG_ERR_MASK));
+
+       for (i = 0; i < CCE_NUM_INT_CSRS; ++i) { /* 8 bytes between CSRs */
+               sdma_dumpstate_helper2(CCE_INT_STATUS);
+               sdma_dumpstate_helper2(CCE_INT_MASK);
+               sdma_dumpstate_helper2(CCE_INT_BLOCKED);
+       }
+
+       sdma_dumpstate_helper(SD(TAIL));
+       sdma_dumpstate_helper(SD(HEAD));
+       sdma_dumpstate_helper(SD(PRIORITY_THLD));
+       sdma_dumpstate_helper(SD(IDLE_CNT));
+       sdma_dumpstate_helper(SD(RELOAD_CNT));
+       sdma_dumpstate_helper(SD(DESC_CNT));
+       sdma_dumpstate_helper(SD(DESC_FETCHED_CNT));
+       sdma_dumpstate_helper(SD(MEMORY));
+       sdma_dumpstate_helper0(SD(ENGINES));
+       sdma_dumpstate_helper0(SD(MEM_SIZE));
+       /* sdma_dumpstate_helper(SEND_EGRESS_SEND_DMA_STATUS);  */
+       sdma_dumpstate_helper(SD(BASE_ADDR));
+       sdma_dumpstate_helper(SD(LEN_GEN));
+       sdma_dumpstate_helper(SD(HEAD_ADDR));
+       sdma_dumpstate_helper(SD(CHECK_ENABLE));
+       sdma_dumpstate_helper(SD(CHECK_VL));
+       sdma_dumpstate_helper(SD(CHECK_JOB_KEY));
+       sdma_dumpstate_helper(SD(CHECK_PARTITION_KEY));
+       sdma_dumpstate_helper(SD(CHECK_SLID));
+       sdma_dumpstate_helper(SD(CHECK_OPCODE));
+}
+#endif /* CONFIG_SDMA_VERBOSITY */
+
+/*
+ * Log the software view of an engine's descriptor queue.
+ *
+ * A summary line (masked head and tail, free count, whether the
+ * flushlist is non-empty) is followed by one decoded entry for every
+ * descriptor from the software head up to, but not including, the
+ * software tail.  The AHG fields are printed only for descriptors
+ * carrying the first-descriptor flag.
+ */
+static void dump_sdma_state(struct sdma_engine *sde)
+{
+       struct hw_sdma_desc *descqp;
+       u64 desc[2];
+       u64 addr;
+       u8 gen;
+       u16 len;
+       u16 head, tail, cnt;
+
+       head = sde->descq_head & sde->sdma_mask;
+       tail = sde->descq_tail & sde->sdma_mask;
+       cnt = sdma_descq_freecnt(sde);
+
+       dd_dev_err(sde->dd,
+                  "SDMA (%u) descq_head: %u descq_tail: %u freecnt: %u FLE %d\n",
+                  sde->this_idx, head, tail, cnt,
+                  !list_empty(&sde->flushlist));
+
+       /* print info for each entry in the descriptor queue */
+       while (head != tail) {
+               /* I/H/F/L flag letters, '-' when the flag is clear */
+               char flags[6] = { 'x', 'x', 'x', 'x', 0 };
+
+               descqp = &sde->descq[head];
+               desc[0] = le64_to_cpu(descqp->qw[0]);
+               desc[1] = le64_to_cpu(descqp->qw[1]);
+               flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
+               flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ?
+                               'H' : '-';
+               flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
+               flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
+               addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT)
+                       & SDMA_DESC0_PHY_ADDR_MASK;
+               gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT)
+                       & SDMA_DESC1_GENERATION_MASK;
+               len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT)
+                       & SDMA_DESC0_BYTE_COUNT_MASK;
+               dd_dev_err(sde->dd,
+                          "SDMA sdmadesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n",
+                          head, flags, addr, gen, len);
+               dd_dev_err(sde->dd,
+                          "\tdesc0:0x%016llx desc1 0x%016llx\n",
+                          desc[0], desc[1]);
+               if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG)
+                       dd_dev_err(sde->dd,
+                                  "\taidx: %u amode: %u alen: %u\n",
+                                  (u8)((desc[1] &
+                                        SDMA_DESC1_HEADER_INDEX_SMASK) >>
+                                       SDMA_DESC1_HEADER_INDEX_SHIFT),
+                                  (u8)((desc[1] &
+                                        SDMA_DESC1_HEADER_MODE_SMASK) >>
+                                       SDMA_DESC1_HEADER_MODE_SHIFT),
+                                  (u8)((desc[1] &
+                                        SDMA_DESC1_HEADER_DWS_SMASK) >>
+                                       SDMA_DESC1_HEADER_DWS_SHIFT));
+               /* step to the next entry, wrapping at the end of the ring */
+               head++;
+               head &= sde->sdma_mask;
+       }
+}
+
+#define SDE_FMT \
+       "SDE %u CPU %d STE %s C 0x%llx S 0x%016llx E 0x%llx T(HW) 0x%llx T(SW) 0x%x H(HW) 0x%llx H(SW) 0x%x H(D) 0x%llx DM 0x%llx GL 0x%llx R 0x%llx LIS 0x%llx AHGI 0x%llx TXT %u TXH %u DT %u DH %u FLNE %d DQF %u SLC 0x%llx\n"
+/**
+ * sdma_seqfile_dump_sde() - debugfs dump of sde
+ * @s: seq file
+ * @sde: send dma engine to dump
+ *
+ * Writes one summary line for the engine to @s (software state plus a
+ * snapshot of several engine CSRs, formatted with SDE_FMT), then one line
+ * per descriptor found between the masked software head and tail of the
+ * descriptor ring.  A first descriptor gets an extra line with its header
+ * index and mode fields.
+ */
+void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *sde)
+{
+       u16 sw_head = sde->descq_head & sde->sdma_mask;
+       u16 sw_tail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
+       u16 idx;
+
+       seq_printf(s, SDE_FMT, sde->this_idx,
+                  sde->cpu,
+                  sdma_state_name(sde->state.current_state),
+                  (unsigned long long)read_sde_csr(sde, SD(CTRL)),
+                  (unsigned long long)read_sde_csr(sde, SD(STATUS)),
+                  (unsigned long long)read_sde_csr(sde, SD(ENG_ERR_STATUS)),
+                  (unsigned long long)read_sde_csr(sde, SD(TAIL)), sw_tail,
+                  (unsigned long long)read_sde_csr(sde, SD(HEAD)), sw_head,
+                  (unsigned long long)le64_to_cpu(*sde->head_dma),
+                  (unsigned long long)read_sde_csr(sde, SD(MEMORY)),
+                  (unsigned long long)read_sde_csr(sde, SD(LEN_GEN)),
+                  (unsigned long long)read_sde_csr(sde, SD(RELOAD_CNT)),
+                  (unsigned long long)sde->last_status,
+                  (unsigned long long)sde->ahg_bits,
+                  sde->tx_tail,
+                  sde->tx_head,
+                  sde->descq_tail,
+                  sde->descq_head,
+                  !list_empty(&sde->flushlist),
+                  sde->descq_full_count,
+                  (unsigned long long)read_sde_csr(sde, SEND_DMA_CHECK_SLID));
+
+       /* walk the ring from software head to software tail, wrapping */
+       for (idx = sw_head; idx != sw_tail;
+            idx = (idx + 1) & sde->sdma_mask) {
+               struct hw_sdma_desc *hw = &sde->descq[idx];
+               /* ring entries are stored little-endian */
+               u64 qw0 = le64_to_cpu(hw->qw[0]);
+               u64 qw1 = le64_to_cpu(hw->qw[1]);
+               char flags[6] = { 'x', 'x', 'x', 'x', 0 };
+               u64 addr;
+               u16 len;
+               u8 gen;
+               u8 ahg_idx, ahg_mode;
+
+               /* one character per flag, '-' when the flag is clear */
+               flags[0] = (qw1 & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
+               flags[1] = (qw1 & SDMA_DESC1_HEAD_TO_HOST_FLAG) ? 'H' : '-';
+               flags[2] = (qw0 & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
+               flags[3] = (qw0 & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
+
+               addr = (qw0 >> SDMA_DESC0_PHY_ADDR_SHIFT) &
+                       SDMA_DESC0_PHY_ADDR_MASK;
+               gen = (qw1 >> SDMA_DESC1_GENERATION_SHIFT) &
+                       SDMA_DESC1_GENERATION_MASK;
+               len = (qw0 >> SDMA_DESC0_BYTE_COUNT_SHIFT) &
+                       SDMA_DESC0_BYTE_COUNT_MASK;
+
+               seq_printf(s,
+                          "\tdesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n",
+                          idx, flags, addr, gen, len);
+
+               /* header fields are only reported for a first descriptor */
+               if (!(qw0 & SDMA_DESC0_FIRST_DESC_FLAG))
+                       continue;
+
+               ahg_idx = (qw1 & SDMA_DESC1_HEADER_INDEX_SMASK) >>
+                       SDMA_DESC1_HEADER_INDEX_SHIFT;
+               ahg_mode = (qw1 & SDMA_DESC1_HEADER_MODE_SMASK) >>
+                       SDMA_DESC1_HEADER_MODE_SHIFT;
+               seq_printf(s, "\t\tahgidx: %u ahgmode: %u\n",
+                          ahg_idx, ahg_mode);
+       }
+}
+
+/*
+ * Return qw1 with its generation field replaced.
+ *
+ * The generation is the two bits of the software descriptor tail that sit
+ * just above the ring index (descq_tail >> sdma_shift, masked with 3).  The
+ * old generation bits in qw1 are cleared before the new value is merged in.
+ */
+static inline u64 add_gen(struct sdma_engine *sde, u64 qw1)
+{
+       u64 gen = (sde->descq_tail >> sde->sdma_shift) & 3;
+
+       return (qw1 & ~SDMA_DESC1_GENERATION_SMASK) |
+               ((gen & SDMA_DESC1_GENERATION_MASK)
+                       << SDMA_DESC1_GENERATION_SHIFT);
+}
+
+/*
+ * This routine submits the indicated tx
+ *
+ * It copies tx->num_desc descriptors from tx->descp into the descriptor
+ * ring, starting at the current (masked) software tail, converting each
+ * quad-word to little-endian.  It then records tx in tx_ring and reduces
+ * desc_avail by the number of descriptors consumed.
+ *
+ * Space has already been guaranteed and
+ * tail side of ring is locked.
+ *
+ * The hardware tail update is done
+ * in the caller and that is facilitated
+ * by returning the new tail.
+ *
+ * There is special case logic for ahg
+ * to not add the generation number for
+ * up to 2 descriptors that follow the
+ * first descriptor.
+ *
+ * Return: the masked software tail after the last descriptor of tx.
+ */
+static inline u16 submit_tx(struct sdma_engine *sde, struct sdma_txreq *tx)
+{
+       int i;
+       u16 tail;
+       struct sdma_desc *descp = tx->descp;
+       u8 skip = 0, mode = ahg_mode(tx);
+
+       /* the first descriptor always carries the generation */
+       tail = sde->descq_tail & sde->sdma_mask;
+       sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]);
+       sde->descq[tail].qw[1] = cpu_to_le64(add_gen(sde, descp->qw[1]));
+       trace_hfi1_sdma_descriptor(sde, descp->qw[0], descp->qw[1],
+                                  tail, &sde->descq[tail]);
+       /*
+        * add_gen() reads sde->descq_tail, so the increment must come after
+        * the descriptor has been written.
+        */
+       tail = ++sde->descq_tail & sde->sdma_mask;
+       descp++;
+       /*
+        * Number of following descriptors that keep their qw1 untouched.
+        * NOTE(review): relies on the numeric encoding of the ahg modes
+        * above SDMA_AHG_APPLY_UPDATE1 - confirm against their definition.
+        */
+       if (mode > SDMA_AHG_APPLY_UPDATE1)
+               skip = mode >> 1;
+       for (i = 1; i < tx->num_desc; i++, descp++) {
+               u64 qw1;
+
+               sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]);
+               if (skip) {
+                       /* edits don't have generation */
+                       qw1 = descp->qw[1];
+                       skip--;
+               } else {
+                       /* replace generation with real one for non-edits */
+                       qw1 = add_gen(sde, descp->qw[1]);
+               }
+               sde->descq[tail].qw[1] = cpu_to_le64(qw1);
+               trace_hfi1_sdma_descriptor(sde, descp->qw[0], qw1,
+                                          tail, &sde->descq[tail]);
+               tail = ++sde->descq_tail & sde->sdma_mask;
+       }
+       /* remember the ring index that follows this tx's last descriptor */
+       tx->next_descq_idx = tail;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+       tx->sn = sde->tail_sn++;
+       trace_hfi1_sdma_in_sn(sde, tx->sn);
+       /* debug only: warn if the tx_ring slot about to be used is not empty */
+       WARN_ON_ONCE(sde->tx_ring[sde->tx_tail & sde->sdma_mask]);
+#endif
+       /* queue the tx in the tx ring and account for the used descriptors */
+       sde->tx_ring[sde->tx_tail++ & sde->sdma_mask] = tx;
+       sde->desc_avail -= tx->num_desc;
+       return tail;
+}
+
+/*
+ * Check for progress
+ *
+ * Refreshes the cached free-descriptor count.  If tx now fits, -EAGAIN is
+ * returned.  Otherwise, when a wait structure with a sleep callback was
+ * supplied, the callback's result is returned; a callback result of
+ * -EAGAIN refreshes the free count once more.  With no wait structure or
+ * no sleep callback the result is -EBUSY.
+ */
+static int sdma_check_progress(
+       struct sdma_engine *sde,
+       struct iowait *wait,
+       struct sdma_txreq *tx)
+{
+       int rc;
+       unsigned seq;
+
+       sde->desc_avail = sdma_descq_freecnt(sde);
+       if (sde->desc_avail >= tx->num_desc)
+               return -EAGAIN;
+
+       /* nothing to hand the request to: report the ring as busy */
+       if (!wait || !wait->sleep)
+               return -EBUSY;
+
+       /* pulse the head_lock: sample its sequence count for the callback */
+       seq = raw_seqcount_begin(
+               (const seqcount_t *)&sde->head_lock.seqcount);
+       rc = wait->sleep(sde, wait, tx, seq);
+       if (rc == -EAGAIN)
+               sde->desc_avail = sdma_descq_freecnt(sde);
+
+       return rc;
+}
+
+/**
+ * sdma_send_txreq() - submit a tx req to ring
+ * @sde: sdma engine to use
+ * @wait: wait structure to use when full (may be NULL)
+ * @tx: sdma_txreq to submit
+ *
+ * The call submits the tx into the ring.  If an iowait structure is non-NULL
+ * the packet will be queued to the list in wait.
+ *
+ * If the engine is not running, the tx is appended to the engine's flush
+ * list and the flush worker is scheduled instead.
+ *
+ * The tail_lock is held (with interrupts saved/disabled) for everything
+ * after the initial tx->tlen check.
+ *
+ * Return:
+ * 0 - Success, -EINVAL - sdma_txreq incomplete, -EBUSY - no space in
+ * ring (wait == NULL)
+ * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state
+ */
+int sdma_send_txreq(struct sdma_engine *sde,
+                   struct iowait *wait,
+                   struct sdma_txreq *tx)
+{
+       int ret = 0;
+       u16 tail;
+       unsigned long flags;
+
+       /* user should have supplied entire packet */
+       if (unlikely(tx->tlen))
+               return -EINVAL;
+       tx->wait = wait;
+       spin_lock_irqsave(&sde->tail_lock, flags);
+retry:
+       if (unlikely(!__sdma_running(sde)))
+               goto unlock_noconn;
+       if (unlikely(tx->num_desc > sde->desc_avail))
+               goto nodesc;
+       /* fast path: write the descriptors, then publish the new tail */
+       tail = submit_tx(sde, tx);
+       if (wait)
+               iowait_sdma_inc(wait);
+       sdma_update_tail(sde, tail);
+unlock:
+       spin_unlock_irqrestore(&sde->tail_lock, flags);
+       return ret;
+unlock_noconn:
+       /* engine not running: hand the tx to the flush list instead */
+       if (wait)
+               iowait_sdma_inc(wait);
+       tx->next_descq_idx = 0;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+       tx->sn = sde->tail_sn++;
+       trace_hfi1_sdma_in_sn(sde, tx->sn);
+#endif
+       /* flushlist_lock nests inside tail_lock here */
+       spin_lock(&sde->flushlist_lock);
+       list_add_tail(&tx->list, &sde->flushlist);
+       spin_unlock(&sde->flushlist_lock);
+       if (wait) {
+               wait->tx_count++;
+               wait->count += tx->num_desc;
+       }
+       schedule_work(&sde->flush_worker);
+       ret = -ECOMM;
+       goto unlock;
+nodesc:
+       /*
+        * Not enough cached free descriptors.  sdma_check_progress()
+        * refreshes the count and returns -EAGAIN when the tx now fits, in
+        * which case the submission is retried from the running check.
+        * Any other result is returned to the caller after counting the
+        * ring-full event.
+        */
+       ret = sdma_check_progress(sde, wait, tx);
+       if (ret == -EAGAIN) {
+               ret = 0;
+               goto retry;
+       }
+       sde->descq_full_count++;
+       goto unlock;
+}
+
+/**
+ * sdma_send_txlist() - submit a list of tx req to ring
+ * @sde: sdma engine to use
+ * @wait: wait structure to use when full (may be NULL)
+ * @tx_list: list of sdma_txreqs to submit
+ *
+ * The call submits the list into the ring.
+ *
+ * If the iowait structure is non-NULL and not equal to the iowait list
+ * the unprocessed part of the list  will be appended to the list in wait.
+ *
+ * In all cases, the tx_list will be updated so the head of the tx_list is
+ * the list of descriptors that have yet to be transmitted.
+ *
+ * The intent of this call is to provide a more efficient
+ * way of submitting multiple packets to SDMA while holding the tail
+ * side locking.
+ *
+ * If the engine is found not running, every entry still on tx_list is
+ * moved to the engine's flush list and the flush worker is scheduled.
+ *
+ * Return:
+ * >= 0 - Success (value is number of sdma_txreq's submitted; 0 when
+ * tx_list is empty),
+ * -EINVAL - sdma_txreq incomplete, -EBUSY - no space in ring (wait == NULL)
+ * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state
+ *
+ * When an error is returned, requests submitted before the error have
+ * already been removed from tx_list; their number is not reported.
+ */
+int sdma_send_txlist(struct sdma_engine *sde, struct iowait *wait,
+                    struct list_head *tx_list)
+{
+       struct sdma_txreq *tx, *tx_next;
+       int ret = 0;
+       unsigned long flags;
+       /* INVALID_TAIL means there is no tail update pending */
+       u16 tail = INVALID_TAIL;
+       int count = 0;
+
+       spin_lock_irqsave(&sde->tail_lock, flags);
+retry:
+       list_for_each_entry_safe(tx, tx_next, tx_list, list) {
+               tx->wait = wait;
+               if (unlikely(!__sdma_running(sde)))
+                       goto unlock_noconn;
+               if (unlikely(tx->num_desc > sde->desc_avail))
+                       goto nodesc;
+               /* an incomplete request stops the walk and stays on tx_list */
+               if (unlikely(tx->tlen)) {
+                       ret = -EINVAL;
+                       goto update_tail;
+               }
+               list_del_init(&tx->list);
+               tail = submit_tx(sde, tx);
+               count++;
+               /*
+                * Publish the tail only when count has none of the
+                * SDMA_TAIL_UPDATE_THRESH bits set; other updates are
+                * deferred to update_tail below.
+                */
+               if (tail != INVALID_TAIL &&
+                   (count & SDMA_TAIL_UPDATE_THRESH) == 0) {
+                       sdma_update_tail(sde, tail);
+                       tail = INVALID_TAIL;
+               }
+       }
+update_tail:
+       /* account for everything submitted so far, then flush a pending tail */
+       if (wait)
+               iowait_sdma_add(wait, count);
+       if (tail != INVALID_TAIL)
+               sdma_update_tail(sde, tail);
+       spin_unlock_irqrestore(&sde->tail_lock, flags);
+       return ret == 0 ? count : ret;
+unlock_noconn:
+       /*
+        * Engine not running: move every request still on tx_list to the
+        * flush list.  flushlist_lock nests inside tail_lock here.
+        */
+       spin_lock(&sde->flushlist_lock);
+       list_for_each_entry_safe(tx, tx_next, tx_list, list) {
+               tx->wait = wait;
+               list_del_init(&tx->list);
+               if (wait)
+                       iowait_sdma_inc(wait);
+               tx->next_descq_idx = 0;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+               tx->sn = sde->tail_sn++;
+               trace_hfi1_sdma_in_sn(sde, tx->sn);
+#endif
+               list_add_tail(&tx->list, &sde->flushlist);
+               if (wait) {
+                       wait->tx_count++;
+                       wait->count += tx->num_desc;
+               }
+       }
+       spin_unlock(&sde->flushlist_lock);
+       schedule_work(&sde->flush_worker);
+       ret = -ECOMM;
+       goto update_tail;
+nodesc:
+       /*
+        * Not enough cached free descriptors for tx.  -EAGAIN from
+        * sdma_check_progress() means the refreshed count now fits, so the
+        * walk restarts from the head of what remains on tx_list.  Any other
+        * result ends the call after counting the ring-full event.
+        */
+       ret = sdma_check_progress(sde, wait, tx);
+       if (ret == -EAGAIN) {
+               ret = 0;
+               goto retry;
+       }
+       sde->descq_full_count++;
+       goto update_tail;
+}
+
+/*
+ * sdma_process_event() - locked wrapper around __sdma_process_event()
+ * @sde: sdma engine the event applies to
+ * @event: event to feed into the engine's state machine
+ *
+ * Takes tail_lock (irqsave) and then the head_lock write side, runs the
+ * state machine for @event, and, if the engine is in the running state
+ * afterwards, passes the current free descriptor count to
+ * sdma_desc_avail().  The locks are released in reverse order.
+ */
+static void sdma_process_event(struct sdma_engine *sde, enum sdma_events event)
+{
+       struct sdma_state *ss = &sde->state;
+       unsigned long irq_flags;
+
+       spin_lock_irqsave(&sde->tail_lock, irq_flags);
+       write_seqlock(&sde->head_lock);
+
+       __sdma_process_event(sde, event);
+
+       /* state is checked after the event has been processed */
+       if (ss->current_state == sdma_state_s99_running)
+               sdma_desc_avail(sde, sdma_descq_freecnt(sde));
+
+       write_sequnlock(&sde->head_lock);
+       spin_unlock_irqrestore(&sde->tail_lock, irq_flags);
+}
+
+static void __sdma_process_event(struct sdma_engine *sde,
+                                enum sdma_events event)
+{
+       struct sdma_state *ss = &sde->state;
+       int need_progress = 0;
+
+       /* CONFIG SDMA temporary */
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(sde->dd, "CONFIG SDMA(%u) [%s] %s\n", sde->this_idx,
+                  sdma_state_names[ss->current_state],
+                  sdma_event_names[event]);
+#endif
+
+       switch (ss->current_state) {
+       case sdma_state_s00_hw_down:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       break;
+               case sdma_event_e30_go_running:
+                       /*
+                        * If down, but running requested (usually result
+                        * of link up, then we need to start up.
+                        * This can happen when hw down is requested while
+                        * bringing the link up with traffic active on
+                        * 7220, e.g.
+                        */
+                       ss->go_s99_running = 1;
+                       /* fall through and start dma engine */
+               case sdma_event_e10_go_hw_start:
+                       /* This reference means the state machine is started */
+                       sdma_get(&sde->state);
+                       sdma_set_state(sde,
+                                      sdma_state_s10_hw_start_up_halt_wait);
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       sdma_sw_tear_down(sde);
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       break;
+               case sdma_event_e70_go_idle:
+                       break;
+               case sdma_event_e80_hw_freeze:
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               case sdma_event_e85_link_down:
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s10_hw_start_up_halt_wait:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       sdma_sw_tear_down(sde);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       sdma_set_state(sde,
+                                      sdma_state_s15_hw_start_up_clean_wait);
+                       sdma_start_hw_clean_up(sde);
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       break;
+               case sdma_event_e30_go_running:
+                       ss->go_s99_running = 1;
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       schedule_work(&sde->err_halt_worker);
+                       break;
+               case sdma_event_e70_go_idle:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e80_hw_freeze:
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               case sdma_event_e85_link_down:
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s15_hw_start_up_clean_wait:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       sdma_sw_tear_down(sde);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       sdma_hw_start_up(sde);
+                       sdma_set_state(sde, ss->go_s99_running ?
+                                      sdma_state_s99_running :
+                                      sdma_state_s20_idle);
+                       break;
+               case sdma_event_e30_go_running:
+                       ss->go_s99_running = 1;
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       break;
+               case sdma_event_e70_go_idle:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e80_hw_freeze:
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               case sdma_event_e85_link_down:
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s20_idle:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       sdma_sw_tear_down(sde);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       break;
+               case sdma_event_e30_go_running:
+                       sdma_set_state(sde, sdma_state_s99_running);
+                       ss->go_s99_running = 1;
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       sdma_set_state(sde, sdma_state_s50_hw_halt_wait);
+                       schedule_work(&sde->err_halt_worker);
+                       break;
+               case sdma_event_e70_go_idle:
+                       break;
+               case sdma_event_e85_link_down:
+                       /* fall through */
+               case sdma_event_e80_hw_freeze:
+                       sdma_set_state(sde, sdma_state_s80_hw_freeze);
+                       atomic_dec(&sde->dd->sdma_unfreeze_count);
+                       wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s30_sw_clean_up_wait:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       break;
+               case sdma_event_e30_go_running:
+                       ss->go_s99_running = 1;
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       sdma_set_state(sde, sdma_state_s40_hw_clean_up_wait);
+                       sdma_start_hw_clean_up(sde);
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       break;
+               case sdma_event_e70_go_idle:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e80_hw_freeze:
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               case sdma_event_e85_link_down:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s40_hw_clean_up_wait:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       sdma_hw_start_up(sde);
+                       sdma_set_state(sde, ss->go_s99_running ?
+                                      sdma_state_s99_running :
+                                      sdma_state_s20_idle);
+                       break;
+               case sdma_event_e30_go_running:
+                       ss->go_s99_running = 1;
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       break;
+               case sdma_event_e70_go_idle:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e80_hw_freeze:
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               case sdma_event_e85_link_down:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s50_hw_halt_wait:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait);
+                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       break;
+               case sdma_event_e30_go_running:
+                       ss->go_s99_running = 1;
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       schedule_work(&sde->err_halt_worker);
+                       break;
+               case sdma_event_e70_go_idle:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e80_hw_freeze:
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               case sdma_event_e85_link_down:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s60_idle_halt_wait:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait);
+                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       break;
+               case sdma_event_e30_go_running:
+                       ss->go_s99_running = 1;
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       schedule_work(&sde->err_halt_worker);
+                       break;
+               case sdma_event_e70_go_idle:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e80_hw_freeze:
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               case sdma_event_e85_link_down:
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s80_hw_freeze:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       break;
+               case sdma_event_e30_go_running:
+                       ss->go_s99_running = 1;
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       break;
+               case sdma_event_e70_go_idle:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e80_hw_freeze:
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       sdma_set_state(sde, sdma_state_s82_freeze_sw_clean);
+                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               case sdma_event_e85_link_down:
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s82_freeze_sw_clean:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       break;
+               case sdma_event_e30_go_running:
+                       ss->go_s99_running = 1;
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       /* notify caller this engine is done cleaning */
+                       atomic_dec(&sde->dd->sdma_unfreeze_count);
+                       wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       break;
+               case sdma_event_e70_go_idle:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e80_hw_freeze:
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       sdma_hw_start_up(sde);
+                       sdma_set_state(sde, ss->go_s99_running ?
+                                      sdma_state_s99_running :
+                                      sdma_state_s20_idle);
+                       break;
+               case sdma_event_e85_link_down:
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s99_running:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       break;
+               case sdma_event_e30_go_running:
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       need_progress = 1;
+                       sdma_err_progress_check_schedule(sde);
+               case sdma_event_e90_sw_halted:
+                       /*
+                       * SW initiated halt does not perform engines
+                       * progress check
+                       */
+                       sdma_set_state(sde, sdma_state_s50_hw_halt_wait);
+                       schedule_work(&sde->err_halt_worker);
+                       break;
+               case sdma_event_e70_go_idle:
+                       sdma_set_state(sde, sdma_state_s60_idle_halt_wait);
+                       break;
+               case sdma_event_e85_link_down:
+                       ss->go_s99_running = 0;
+                       /* fall through */
+               case sdma_event_e80_hw_freeze:
+                       sdma_set_state(sde, sdma_state_s80_hw_freeze);
+                       atomic_dec(&sde->dd->sdma_unfreeze_count);
+                       wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               }
+               break;
+       }
+
+       ss->last_event = event;
+       if (need_progress)
+               sdma_make_progress(sde, 0);
+}
+
+/*
+ * _extend_sdma_tx_descs() - helper to extend txreq
+ * @dd: device data, handed to sdma_txclean() on failure
+ * @tx: tx request whose descriptor array is full
+ *
+ * This is called once the initial nominal allocation
+ * of descriptors in the sdma_txreq is exhausted.
+ *
+ * The code will bump the allocation up to the max
+ * of MAX_DESC (64) descriptors. There doesn't seem
+ * much point in an interim step. The last descriptor
+ * is reserved for coalesce buffer in order to support
+ * cases where input packet has >MAX_DESC iovecs.
+ *
+ * Return: 0 on success, -ENOMEM when an allocation fails or the
+ * request already uses MAX_DESC descriptors.  On failure @tx has
+ * already been passed to sdma_txclean().
+ */
+static int _extend_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
+{
+       struct sdma_desc *descp;
+       int i;
+
+       /* Handle last descriptor */
+       if (unlikely((tx->num_desc == (MAX_DESC - 1)))) {
+               /* if tlen is 0, it is for padding, release last descriptor */
+               if (!tx->tlen) {
+                       tx->desc_limit = MAX_DESC;
+               } else if (!tx->coalesce_buf) {
+                       /* allocate coalesce buffer with space for padding */
+                       tx->coalesce_buf = kmalloc(tx->tlen + sizeof(u32),
+                                                  GFP_ATOMIC);
+                       if (!tx->coalesce_buf)
+                               goto enomem;
+                       tx->coalesce_idx = 0;
+               }
+               return 0;
+       }
+
+       if (unlikely(tx->num_desc == MAX_DESC))
+               goto enomem;
+
+       /*
+        * Allocate into a local so that a failed allocation does not
+        * overwrite tx->descp with NULL before sdma_txclean() runs.
+        */
+       descp = kmalloc_array(MAX_DESC, sizeof(struct sdma_desc), GFP_ATOMIC);
+       if (!descp)
+               goto enomem;
+       tx->descp = descp;
+
+       /* reserve last descriptor for coalescing */
+       tx->desc_limit = MAX_DESC - 1;
+       /* copy ones already built */
+       for (i = 0; i < tx->num_desc; i++)
+               tx->descp[i] = tx->descs[i];
+       return 0;
+enomem:
+       sdma_txclean(dd, tx);
+       return -ENOMEM;
+}
+
+/*
+ * ext_coal_sdma_tx_descs() - extend or coalesce sdma tx descriptors
+ *
+ * Invoked when the descriptors currently available in the sdma_txreq
+ * have all been used.
+ *
+ * _extend_sdma_tx_descs() is called first to either grow the descriptor
+ * array or allocate the coalesce buffer.  When a coalesce buffer exists,
+ * the input data is copied into it instead of getting a descriptor of
+ * its own.  Once the buffer holds the whole packet (tlen bytes) it is
+ * padded to a dword multiple, DMA mapped, and added as one descriptor.
+ *
+ * Return:
+ * <0 - error
+ * 0 - coalescing, don't populate descriptor
+ * 1 - continue with populating descriptor
+ */
+int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx,
+                          int type, void *kvaddr, struct page *page,
+                          unsigned long offset, u16 len)
+{
+       dma_addr_t addr;
+       int pad_len;
+       int rval = _extend_sdma_tx_descs(dd, tx);
+
+       if (rval) {
+               sdma_txclean(dd, tx);
+               return rval;
+       }
+
+       /* No coalesce buffer: the caller populates a descriptor as usual */
+       if (!tx->coalesce_buf)
+               return 1;
+
+       /* Coalescing needs a CPU-visible source to copy from */
+       if (type == SDMA_MAP_NONE) {
+               sdma_txclean(dd, tx);
+               return -EINVAL;
+       }
+
+       if (type == SDMA_MAP_PAGE) {
+               kvaddr = kmap(page);
+               kvaddr += offset;
+       } else if (WARN_ON(!kvaddr)) {
+               sdma_txclean(dd, tx);
+               return -EINVAL;
+       }
+
+       /* Append this fragment to what has been gathered so far */
+       memcpy(tx->coalesce_buf + tx->coalesce_idx, kvaddr, len);
+       tx->coalesce_idx += len;
+       if (type == SDMA_MAP_PAGE)
+               kunmap(page);
+
+       /* More fragments are still expected */
+       if (tx->tlen != tx->coalesce_idx)
+               return 0;
+
+       /* Whole packet is received; add any padding */
+       pad_len = tx->packet_len & (sizeof(u32) - 1);
+       if (pad_len) {
+               pad_len = sizeof(u32) - pad_len;
+               memset(tx->coalesce_buf + tx->coalesce_idx, 0, pad_len);
+               /* padding is taken care of for coalescing case */
+               tx->packet_len += pad_len;
+               tx->tlen += pad_len;
+       }
+
+       /* dma map the coalesce buffer */
+       addr = dma_map_single(&dd->pcidev->dev, tx->coalesce_buf, tx->tlen,
+                             DMA_TO_DEVICE);
+       if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
+               sdma_txclean(dd, tx);
+               return -ENOSPC;
+       }
+
+       /* Release the reserved last descriptor and use it for the buffer */
+       tx->desc_limit = MAX_DESC;
+       return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, tx, addr, tx->tlen);
+}
+
+/*
+ * Update sdes when the lmc changes.
+ *
+ * Builds one SLID check value from @mask and @lid and writes it to the
+ * CHECK_SLID CSR of every SDMA engine of @dd.
+ */
+void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid)
+{
+       u64 slid_mask = mask & SD(CHECK_SLID_MASK_MASK);
+       u64 slid_value = (lid & mask) & SD(CHECK_SLID_VALUE_MASK);
+       u64 sreg = (slid_mask << SD(CHECK_SLID_MASK_SHIFT)) |
+                  (slid_value << SD(CHECK_SLID_VALUE_SHIFT));
+       int idx;
+
+       for (idx = 0; idx < dd->num_sdma; idx++) {
+               struct sdma_engine *sde = &dd->per_sdma[idx];
+
+               hfi1_cdbg(LINKVERB, "SendDmaEngine[%d].SLID_CHECK = 0x%x",
+                         idx, (u32)sreg);
+               write_sde_csr(sde, SD(CHECK_SLID), sreg);
+       }
+}
+
+/*
+ * tx not dword sized - pad
+ *
+ * Appends one descriptor that points at the device pad buffer
+ * (dd->sdma_pad_phys) and then closes the tx.  Returns 0, or the error
+ * from _extend_sdma_tx_descs() when more descriptor room is needed and
+ * cannot be obtained.
+ */
+int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
+{
+       int err;
+
+       /* count the pad descriptor first, then make sure there is room */
+       tx->num_desc++;
+       if (unlikely(tx->num_desc == tx->desc_limit)) {
+               err = _extend_sdma_tx_descs(dd, tx);
+               if (err) {
+                       sdma_txclean(dd, tx);
+                       return err;
+               }
+       }
+
+       /* finish the one just added */
+       make_tx_sdma_desc(tx, SDMA_MAP_NONE, dd->sdma_pad_phys,
+                         sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1)));
+       _sdma_close_tx(dd, tx);
+       return 0;
+}
+
+/*
+ * Add ahg to the sdma_txreq
+ * @tx: tx request being initialized
+ * @num_ahg: number of entries in @ahg (1 - 9)
+ * @ahg_entry: AHG entry index stored in the first descriptor
+ * @ahg: AHG update words; ahg[0] is stored in the first descriptor
+ * @ahg_hlen: header length in bytes (stored in dwords)
+ *
+ * The logic will consume up to 3
+ * descriptors at the beginning of
+ * sdma_txreq.
+ *
+ * Nothing is added when @num_ahg is 0 or (with a warning) larger than 9.
+ */
+void _sdma_txreq_ahgadd(
+       struct sdma_txreq *tx,
+       u8 num_ahg,
+       u8 ahg_entry,
+       u32 *ahg,
+       u8 ahg_hlen)
+{
+       u32 i, shift = 0, desc = 0;
+       u8 mode;
+
+       WARN_ON_ONCE((ahg_hlen & 3) || ahg_hlen == 4);
+       /*
+        * With no updates there is nothing to add, and "num_ahg - 1" in
+        * the loop below would be compared as a huge unsigned value.
+        * More than 9 updates would spill past the three descriptors
+        * that are zeroed and counted here.
+        */
+       if (!num_ahg || WARN_ON_ONCE(num_ahg > 9))
+               return;
+       /* compute mode */
+       if (num_ahg == 1)
+               mode = SDMA_AHG_APPLY_UPDATE1;
+       else if (num_ahg <= 5)
+               mode = SDMA_AHG_APPLY_UPDATE2;
+       else
+               mode = SDMA_AHG_APPLY_UPDATE3;
+       tx->num_desc++;
+       /* initialize to consumed descriptors to zero */
+       switch (mode) {
+       case SDMA_AHG_APPLY_UPDATE3:
+               tx->num_desc++;
+               tx->descs[2].qw[0] = 0;
+               tx->descs[2].qw[1] = 0;
+               /* FALLTHROUGH */
+       case SDMA_AHG_APPLY_UPDATE2:
+               tx->num_desc++;
+               tx->descs[1].qw[0] = 0;
+               tx->descs[1].qw[1] = 0;
+               break;
+       }
+       /* bytes to dwords */
+       ahg_hlen >>= 2;
+       tx->descs[0].qw[1] |=
+               (((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK)
+                       << SDMA_DESC1_HEADER_INDEX_SHIFT) |
+               (((u64)ahg_hlen & SDMA_DESC1_HEADER_DWS_MASK)
+                       << SDMA_DESC1_HEADER_DWS_SHIFT) |
+               (((u64)mode & SDMA_DESC1_HEADER_MODE_MASK)
+                       << SDMA_DESC1_HEADER_MODE_SHIFT) |
+               (((u64)ahg[0] & SDMA_DESC1_HEADER_UPDATE1_MASK)
+                       << SDMA_DESC1_HEADER_UPDATE1_SHIFT);
+       /*
+        * Remaining updates are packed two per 64-bit word: four go into
+        * qw[0]/qw[1] of descs[1], the next four into descs[2].
+        */
+       for (i = 0; i < (num_ahg - 1); i++) {
+               if (!shift && !(i & 2))
+                       desc++;
+               tx->descs[desc].qw[!!(i & 2)] |=
+                       (((u64)ahg[i + 1])
+                               << shift);
+               shift = (shift + 32) & 63;
+       }
+}
+
+/**
+ * sdma_ahg_alloc - allocate an AHG entry
+ * @sde: engine to allocate from
+ *
+ * Claims the lowest clear bit in the engine's ahg_bits bitmap, retrying
+ * when another context sets the same bit first.
+ *
+ * Return:
+ * 0-31 when successful, -EINVAL if @sde is NULL,
+ * -ENOSPC if an entry is not available
+ */
+int sdma_ahg_alloc(struct sdma_engine *sde)
+{
+       int nr;
+
+       if (!sde) {
+               trace_hfi1_ahg_allocate(sde, -EINVAL);
+               return -EINVAL;
+       }
+
+       for (;;) {
+               nr = ffz(ACCESS_ONCE(sde->ahg_bits));
+               if (nr > 31) {
+                       trace_hfi1_ahg_allocate(sde, -ENOSPC);
+                       return -ENOSPC;
+               }
+               /* bit was clear before we set it: the entry is ours */
+               if (!test_and_set_bit(nr, &sde->ahg_bits))
+                       break;
+               cpu_relax();
+       }
+
+       trace_hfi1_ahg_allocate(sde, nr);
+       return nr;
+}
+
+/**
+ * sdma_ahg_free - free an AHG entry
+ * @sde: engine to return AHG entry
+ * @ahg_index: index to free
+ *
+ * This routine frees the indicated AHG entry.  A NULL @sde is ignored;
+ * an @ahg_index outside 0-31 is traced but otherwise ignored.
+ */
+void sdma_ahg_free(struct sdma_engine *sde, int ahg_index)
+{
+       if (sde) {
+               trace_hfi1_ahg_deallocate(sde, ahg_index);
+               if (ahg_index >= 0 && ahg_index <= 31)
+                       clear_bit(ahg_index, &sde->ahg_bits);
+       }
+}
+
+/*
+ * SPC freeze handling for SDMA engines.  Called when the driver knows
+ * the SPC is going into a freeze but before the freeze is fully
+ * settled.  Generally an error interrupt.
+ *
+ * Every engine is sent the link-down event when @link_down is non-zero
+ * and the hw-freeze event otherwise.  This pulls the engine out of
+ * running so no more entries can be added to the engine's queue.
+ */
+void sdma_freeze_notify(struct hfi1_devdata *dd, int link_down)
+{
+       enum sdma_events event = sdma_event_e80_hw_freeze;
+       int idx;
+
+       if (link_down)
+               event = sdma_event_e85_link_down;
+
+       /* set up the wait but do not wait here */
+       atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma);
+
+       /* tell all engines to stop running and wait */
+       for (idx = 0; idx < dd->num_sdma; idx++)
+               sdma_process_event(&dd->per_sdma[idx], event);
+
+       /* sdma_freeze() will wait for all engines to have stopped */
+}
+
+/*
+ * SPC freeze handling for SDMA engines.  Called when the driver knows
+ * the SPC is fully frozen.
+ *
+ * Waits for the count armed by sdma_freeze_notify() to drain, re-arms
+ * it, sends the hw-frozen event to every engine and waits again.
+ */
+void sdma_freeze(struct hfi1_devdata *dd)
+{
+       int idx;
+       int interrupted;
+
+       /*
+        * Make sure all engines have moved out of the running state before
+        * continuing.
+        */
+       interrupted = wait_event_interruptible(dd->sdma_unfreeze_wq,
+                               atomic_read(&dd->sdma_unfreeze_count) <= 0);
+       /* interrupted - just exit */
+       if (interrupted)
+               return;
+       /* count is negative, then unloading - just exit */
+       if (atomic_read(&dd->sdma_unfreeze_count) < 0)
+               return;
+
+       /* set up the count for the next wait */
+       atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma);
+
+       /* tell all engines that the SPC is frozen, they can start cleaning */
+       for (idx = 0; idx < dd->num_sdma; idx++)
+               sdma_process_event(&dd->per_sdma[idx],
+                                  sdma_event_e81_hw_frozen);
+
+       /*
+        * Wait for everyone to finish software clean before exiting.  The
+        * software clean will read engine CSRs, so must be completed before
+        * the next step, which will clear the engine CSRs.
+        */
+       (void)wait_event_interruptible(dd->sdma_unfreeze_wq,
+                               atomic_read(&dd->sdma_unfreeze_count) <= 0);
+       /* no need to check results - done no matter what */
+}
+
+/*
+ * SPC freeze handling for the SDMA engines.  Called after the SPC is unfrozen.
+ *
+ * The SPC freeze acts like a SDMA halt and a hardware clean combined.  All
+ * that is left is a software clean.  We could do it after the SPC is fully
+ * frozen, but then we'd have to add another state to wait for the unfreeze.
+ * Instead, just defer the software clean until the unfreeze step.
+ */
+void sdma_unfreeze(struct hfi1_devdata *dd)
+{
+       int idx;
+
+       /* send the unfreeze event to every engine */
+       for (idx = 0; idx < dd->num_sdma; idx++) {
+               struct sdma_engine *sde = &dd->per_sdma[idx];
+
+               sdma_process_event(sde, sdma_event_e82_hw_unfreeze);
+       }
+}
+
+/**
+ * _sdma_engine_progress_schedule() - schedule progress on engine
+ * @sde: sdma_engine to schedule progress
+ *
+ * Emits the progress trace event, then writes the engine's
+ * progress_mask to the CCE_INT_FORCE CSR selected by IS_SDMA_START.
+ */
+void _sdma_engine_progress_schedule(struct sdma_engine *sde)
+{
+       struct hfi1_devdata *dd = sde->dd;
+
+       trace_hfi1_sdma_engine_progress(sde, sde->progress_mask);
+       /* assume we have selected a good cpu */
+       write_csr(dd, CCE_INT_FORCE + (8 * (IS_SDMA_START / 64)),
+                 sde->progress_mask);
+}
diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h
new file mode 100644 (file)
index 0000000..8f50c99
--- /dev/null
@@ -0,0 +1,1082 @@
+#ifndef _HFI1_SDMA_H
+#define _HFI1_SDMA_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <asm/byteorder.h>
+#include <linux/workqueue.h>
+#include <linux/rculist.h>
+
+#include "hfi.h"
+#include "verbs.h"
+#include "sdma_txreq.h"
+
+/* Hardware limit: most descriptors a single sdma_txreq is extended to */
+#define MAX_DESC 64
+/* Hardware limit for SDMA packet size */
+#define MAX_SDMA_PKT_SIZE ((16 * 1024) - 1)
+
+#define SDMA_TXREQ_S_OK        0 /* SDMA_TXREQ_S_*: completion status values */
+#define SDMA_TXREQ_S_SENDERROR 1
+#define SDMA_TXREQ_S_ABORTED   2
+#define SDMA_TXREQ_S_SHUTDOWN  3
+
+/* flags bits accepted by sdma_txinit_ahg() */
+#define SDMA_TXREQ_F_URGENT       0x0001
+#define SDMA_TXREQ_F_AHG_COPY     0x0002
+#define SDMA_TXREQ_F_USE_AHG      0x0004
+
+#define SDMA_MAP_NONE          0 /* e.g. the pad descriptor; not valid for coalescing */
+#define SDMA_MAP_SINGLE        1 /* e.g. the dma_map_single()'ed coalesce buffer */
+#define SDMA_MAP_PAGE          2 /* source is a struct page plus offset */
+
+#define SDMA_AHG_VALUE_MASK          0xffff /* NOTE(review): SDMA_AHG_* look like fields of one u32 AHG update word - confirm */
+#define SDMA_AHG_VALUE_SHIFT         0
+#define SDMA_AHG_INDEX_MASK          0xf
+#define SDMA_AHG_INDEX_SHIFT         16
+#define SDMA_AHG_FIELD_LEN_MASK      0xf
+#define SDMA_AHG_FIELD_LEN_SHIFT     20
+#define SDMA_AHG_FIELD_START_MASK    0x1f
+#define SDMA_AHG_FIELD_START_SHIFT   24
+#define SDMA_AHG_UPDATE_ENABLE_MASK  0x1
+#define SDMA_AHG_UPDATE_ENABLE_SHIFT 31
+
+/* AHG modes */
+
+/*
+ * Be aware the ordering and values
+ * for SDMA_AHG_APPLY_UPDATE[123]
+ * are assumed in generating a skip
+ * count in submit_tx() in sdma.c
+ *
+ * _sdma_txreq_ahgadd() accounts for one, two or three leading
+ * descriptors for SDMA_AHG_APPLY_UPDATE1, 2 and 3 respectively.
+ */
+#define SDMA_AHG_NO_AHG              0
+#define SDMA_AHG_COPY                1
+#define SDMA_AHG_APPLY_UPDATE1       2
+#define SDMA_AHG_APPLY_UPDATE2       3
+#define SDMA_AHG_APPLY_UPDATE3       4
+
+/*
+ * Bits defined in the send DMA descriptor.
+ *
+ * Each multi-bit field comes as a group: _SHIFT (bit position), _WIDTH
+ * (number of bits), _MASK (field mask before shifting) and _SMASK (the
+ * mask shifted into place).  SDMA_DESC1_* values are applied to qw[1]
+ * of a descriptor by _sdma_txreq_ahgadd(); SDMA_DESC0_* presumably
+ * describe qw[0] - confirm against the users in sdma.c.
+ */
+#define SDMA_DESC0_FIRST_DESC_FLAG      BIT_ULL(63)
+#define SDMA_DESC0_LAST_DESC_FLAG       BIT_ULL(62)
+#define SDMA_DESC0_BYTE_COUNT_SHIFT     48
+#define SDMA_DESC0_BYTE_COUNT_WIDTH     14
+#define SDMA_DESC0_BYTE_COUNT_MASK \
+       ((1ULL << SDMA_DESC0_BYTE_COUNT_WIDTH) - 1)
+#define SDMA_DESC0_BYTE_COUNT_SMASK \
+       (SDMA_DESC0_BYTE_COUNT_MASK << SDMA_DESC0_BYTE_COUNT_SHIFT)
+#define SDMA_DESC0_PHY_ADDR_SHIFT       0
+#define SDMA_DESC0_PHY_ADDR_WIDTH       48
+#define SDMA_DESC0_PHY_ADDR_MASK \
+       ((1ULL << SDMA_DESC0_PHY_ADDR_WIDTH) - 1)
+#define SDMA_DESC0_PHY_ADDR_SMASK \
+       (SDMA_DESC0_PHY_ADDR_MASK << SDMA_DESC0_PHY_ADDR_SHIFT)
+
+#define SDMA_DESC1_HEADER_UPDATE1_SHIFT 32
+#define SDMA_DESC1_HEADER_UPDATE1_WIDTH 32
+#define SDMA_DESC1_HEADER_UPDATE1_MASK \
+       ((1ULL << SDMA_DESC1_HEADER_UPDATE1_WIDTH) - 1)
+#define SDMA_DESC1_HEADER_UPDATE1_SMASK \
+       (SDMA_DESC1_HEADER_UPDATE1_MASK << SDMA_DESC1_HEADER_UPDATE1_SHIFT)
+#define SDMA_DESC1_HEADER_MODE_SHIFT    13
+#define SDMA_DESC1_HEADER_MODE_WIDTH    3
+#define SDMA_DESC1_HEADER_MODE_MASK \
+       ((1ULL << SDMA_DESC1_HEADER_MODE_WIDTH) - 1)
+#define SDMA_DESC1_HEADER_MODE_SMASK \
+       (SDMA_DESC1_HEADER_MODE_MASK << SDMA_DESC1_HEADER_MODE_SHIFT)
+#define SDMA_DESC1_HEADER_INDEX_SHIFT   8
+#define SDMA_DESC1_HEADER_INDEX_WIDTH   5
+#define SDMA_DESC1_HEADER_INDEX_MASK \
+       ((1ULL << SDMA_DESC1_HEADER_INDEX_WIDTH) - 1)
+#define SDMA_DESC1_HEADER_INDEX_SMASK \
+       (SDMA_DESC1_HEADER_INDEX_MASK << SDMA_DESC1_HEADER_INDEX_SHIFT)
+#define SDMA_DESC1_HEADER_DWS_SHIFT     4
+#define SDMA_DESC1_HEADER_DWS_WIDTH     4
+#define SDMA_DESC1_HEADER_DWS_MASK \
+       ((1ULL << SDMA_DESC1_HEADER_DWS_WIDTH) - 1)
+#define SDMA_DESC1_HEADER_DWS_SMASK \
+       (SDMA_DESC1_HEADER_DWS_MASK << SDMA_DESC1_HEADER_DWS_SHIFT)
+#define SDMA_DESC1_GENERATION_SHIFT     2
+#define SDMA_DESC1_GENERATION_WIDTH     2
+#define SDMA_DESC1_GENERATION_MASK \
+       ((1ULL << SDMA_DESC1_GENERATION_WIDTH) - 1)
+#define SDMA_DESC1_GENERATION_SMASK \
+       (SDMA_DESC1_GENERATION_MASK << SDMA_DESC1_GENERATION_SHIFT)
+#define SDMA_DESC1_INT_REQ_FLAG         BIT_ULL(1)
+#define SDMA_DESC1_HEAD_TO_HOST_FLAG    BIT_ULL(0)
+
+enum sdma_states { /* states of the per-engine SDMA state machine */
+       sdma_state_s00_hw_down, /* entered on sdma_event_e00_go_hw_down */
+       sdma_state_s10_hw_start_up_halt_wait,
+       sdma_state_s15_hw_start_up_clean_wait,
+       sdma_state_s20_idle, /* chosen on unfreeze when go_s99_running is 0 */
+       sdma_state_s30_sw_clean_up_wait,
+       sdma_state_s40_hw_clean_up_wait,
+       sdma_state_s50_hw_halt_wait, /* from running on e60_hw_halted/e90_sw_halted */
+       sdma_state_s60_idle_halt_wait, /* from running on e70_go_idle */
+       sdma_state_s80_hw_freeze, /* from running on e80_hw_freeze/e85_link_down */
+       sdma_state_s82_freeze_sw_clean, /* entered when e81_hw_frozen is handled */
+       sdma_state_s99_running, /* the state __sdma_running() tests for */
+};
+
+enum sdma_events { /* inputs to sdma_process_event() */
+       sdma_event_e00_go_hw_down,
+       sdma_event_e10_go_hw_start,
+       sdma_event_e15_hw_halt_done,
+       sdma_event_e25_hw_clean_up_done,
+       sdma_event_e30_go_running,
+       sdma_event_e40_sw_cleaned,
+       sdma_event_e50_hw_cleaned,
+       sdma_event_e60_hw_halted,
+       sdma_event_e70_go_idle,
+       sdma_event_e80_hw_freeze, /* sent by sdma_freeze_notify() when link_down is 0 */
+       sdma_event_e81_hw_frozen, /* sent by sdma_freeze() */
+       sdma_event_e82_hw_unfreeze, /* sent by sdma_unfreeze() */
+       sdma_event_e85_link_down, /* sent by sdma_freeze_notify() when link_down is set */
+       sdma_event_e90_sw_halted,
+};
+
+struct sdma_set_state_action { /* one-bit flags; NOTE(review): presumably per-state actions used by sdma_set_state() - confirm in sdma.c */
+       unsigned op_enable:1;
+       unsigned op_intenable:1;
+       unsigned op_halt:1;
+       unsigned op_cleanup:1;
+       unsigned go_s99_running_tofalse:1; /* NOTE(review): presumably clears sdma_state.go_s99_running - confirm */
+       unsigned go_s99_running_totrue:1; /* NOTE(review): presumably sets sdma_state.go_s99_running - confirm */
+};
+
+struct sdma_state { /* embedded in struct sdma_engine as ->state */
+       struct kref          kref;
+       struct completion    comp;
+       enum sdma_states current_state; /* compared with sdma_state_s99_running by __sdma_running() */
+       unsigned             current_op;
+       unsigned             go_s99_running; /* 1: go to s99_running after unfreeze, 0: go to s20_idle */
+       /* debugging/development */
+       enum sdma_states previous_state;
+       unsigned             previous_op;
+       enum sdma_events last_event; /* recorded after each event is processed */
+};
+
+/**
+ * DOC: sdma exported routines
+ *
+ * These sdma routines fit into three categories:
+ * - The SDMA API for building and submitting packets
+ *   to the ring
+ *
+ * - Initialization and tear down routines to buildup
+ *   and tear down SDMA
+ *
+ * - ISR entrances to handle interrupts, state changes
+ *   and errors
+ */
+
+/**
+ * DOC: sdma PSM/verbs API
+ *
+ * The sdma API is designed to be used by both PSM
+ * and verbs to supply packets to the SDMA ring.
+ *
+ * The usage of the API is as follows:
+ *
+ * Embed a struct iowait in the QP or
+ * PQ.  The iowait should be initialized with a
+ * call to iowait_init().
+ *
+ * The user of the API should create an allocation method
+ * for their version of the txreq. slabs, pre-allocated lists,
+ * and dma pools can be used.  Once the user's overload of
+ * the sdma_txreq has been allocated, the sdma_txreq member
+ * must be initialized with sdma_txinit() or sdma_txinit_ahg().
+ *
+ * The txreq must be declared with the sdma_txreq first.
+ *
+ * The tx request, once initialized, is manipulated with calls to
+ * sdma_txadd_daddr(), sdma_txadd_page(), or sdma_txadd_kvaddr()
+ * for each disjoint memory location.  It is the user's responsibility
+ * to understand the packet boundaries and page boundaries to do the
+ * appropriate number of sdma_txadd_* calls.  The user
+ * must be prepared to deal with failures from these routines due to
+ * either memory allocation or dma_mapping failures.
+ *
+ * The mapping specifics for each memory location are recorded
+ * in the tx. Memory locations added with sdma_txadd_page()
+ * and sdma_txadd_kvaddr() are automatically mapped when added
+ * to the tx and unmapped as part of the progress processing in the
+ * SDMA interrupt handling.
+ *
+ * sdma_txadd_daddr() is used to add a dma_addr_t memory location to
+ * the tx.  An example of a use case would be a pre-allocated
+ * set of headers allocated via dma_pool_alloc() or
+ * dma_alloc_coherent().  For these memory locations, it
+ * is the responsibility of the user to handle the unmapping.
+ * (This would usually be at an unload or job termination.)
+ *
+ * The routine sdma_send_txreq() is used to submit
+ * a tx to the ring after the appropriate number of
+ * sdma_txadd_* have been done.
+ *
+ * If it is desired to send a burst of sdma_txreqs, sdma_send_txlist()
+ * can be used to submit a list of packets.
+ *
+ * The user is free to use the link overhead in the struct sdma_txreq as
+ * long as the tx isn't in flight.
+ *
+ * The extreme degenerate case of the number of descriptors
+ * exceeding the ring size is automatically handled as
+ * memory locations are added.  An overflow of the descriptor
+ * array that is part of the sdma_txreq is also automatically
+ * handled.
+ *
+ */
+
+/**
+ * DOC: Infrastructure calls
+ *
+ * sdma_init() is used to initialize data structures and
+ * CSRs for the desired number of SDMA engines.
+ *
+ * sdma_start() is used to kick the SDMA engines initialized
+ * with sdma_init().   Interrupts must be enabled at this
+ * point since aspects of the state machine are interrupt
+ * driven.
+ *
+ * sdma_engine_error() and sdma_engine_interrupt() are
+ * entrances for interrupts.
+ *
+ * sdma_map_init() is for the management of the mapping
+ * table when the number of vls is changed.
+ *
+ */
+
+/*
+ * struct hw_sdma_desc - raw 128 bit SDMA descriptor
+ *
+ * This is the raw descriptor in the SDMA ring: two little-endian
+ * 64-bit quad words.  struct sdma_engine's descq member points at
+ * entries of this type.
+ */
+struct hw_sdma_desc {
+       /* private:  don't use directly */
+       __le64 qw[2];
+};
+
+/**
+ * struct sdma_engine - Data pertaining to each SDMA engine.
+ * @dd: a back-pointer to the device data
+ * @ppd: per port back-pointer
+ * @imask: mask for irq manipulation
+ * @idle_mask: mask for determining if an interrupt is due to sdma_idle
+ *
+ * This structure has the state for each sdma_engine.
+ *
+ * Access to non-public fields is not supported
+ * since the private members are subject to change.
+ */
+struct sdma_engine {
+       /* read mostly */
+       struct hfi1_devdata *dd;
+       struct hfi1_pportdata *ppd;
+       /* private: */
+       void __iomem *tail_csr;
+       u64 imask;                      /* clear interrupt mask */
+       u64 idle_mask;
+       u64 progress_mask; /* written to CCE_INT_FORCE by _sdma_engine_progress_schedule() */
+       u64 int_mask;
+       /* private: */
+       volatile __le64      *head_dma; /* DMA'ed by chip */
+       /* private: */
+       dma_addr_t            head_phys;
+       /* private: */
+       struct hw_sdma_desc *descq;
+       /* private: */
+       unsigned descq_full_count;
+       struct sdma_txreq **tx_ring;
+       /* private: */
+       dma_addr_t            descq_phys;
+       /* private */
+       u32 sdma_mask;
+       /* private */
+       struct sdma_state state; /* state.current_state is what __sdma_running() tests */
+       /* private */
+       int cpu;
+       /* private: */
+       u8 sdma_shift;
+       /* private: */
+       u8 this_idx; /* zero relative engine */
+       /* protect changes to senddmactrl shadow */
+       spinlock_t senddmactrl_lock;
+       /* private: */
+       u64 p_senddmactrl;              /* shadow per-engine SendDmaCtrl */
+
+       /* read/write using tail_lock */
+       spinlock_t            tail_lock ____cacheline_aligned_in_smp;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+       /* private: */
+       u64                   tail_sn;
+#endif
+       /* private: */
+       u32                   descq_tail; /* equals descq_head when the ring is empty, see sdma_empty() */
+       /* private: */
+       unsigned long         ahg_bits; /* allocation bitmap for AHG entries 0-31, see sdma_ahg_alloc() */
+       /* private: */
+       u16                   desc_avail;
+       /* private: */
+       u16                   tx_tail;
+       /* private: */
+       u16 descq_cnt; /* ring size used by sdma_descq_freecnt() */
+
+       /* read/write using head_lock */
+       /* private: */
+       seqlock_t            head_lock ____cacheline_aligned_in_smp;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+       /* private: */
+       u64                   head_sn;
+#endif
+       /* private: */
+       u32                   descq_head;
+       /* private: */
+       u16                   tx_head;
+       /* private: */
+       u64                   last_status;
+       /* private */
+       u64                     err_cnt;
+       /* private */
+       u64                     sdma_int_cnt;
+       u64                     idle_int_cnt;
+       u64                     progress_int_cnt;
+
+       /* private: */
+       struct list_head      dmawait;
+
+       /* CONFIG SDMA for now, just blindly duplicate */
+       /* private: */
+       struct tasklet_struct sdma_hw_clean_up_task
+               ____cacheline_aligned_in_smp;
+
+       /* private: */
+       struct tasklet_struct sdma_sw_clean_up_task
+               ____cacheline_aligned_in_smp; /* scheduled by the state machine, e.g. on e81_hw_frozen */
+       /* private: */
+       struct work_struct err_halt_worker; /* scheduled when a running engine is halted */
+       /* private */
+       struct timer_list     err_progress_check_timer;
+       u32                   progress_check_head;
+       /* private: */
+       struct work_struct flush_worker;
+       /* protect flush list */
+       spinlock_t flushlist_lock;
+       /* private: */
+       struct list_head flushlist;
+};
+
+/* SDMA set-up, tear-down and SPC freeze handling entry points (sdma.c) */
+int sdma_init(struct hfi1_devdata *dd, u8 port);
+void sdma_start(struct hfi1_devdata *dd);
+void sdma_exit(struct hfi1_devdata *dd);
+void sdma_all_running(struct hfi1_devdata *dd);
+void sdma_all_idle(struct hfi1_devdata *dd);
+/* @link_down: non-zero sends the link-down event instead of hw-freeze */
+void sdma_freeze_notify(struct hfi1_devdata *dd, int link_down);
+void sdma_freeze(struct hfi1_devdata *dd);
+void sdma_unfreeze(struct hfi1_devdata *dd);
+void sdma_wait(struct hfi1_devdata *dd);
+
+/**
+ * sdma_empty() - idle engine test
+ * @sde: sdma engine
+ *
+ * Currently used by verbs as a latency optimization.  The engine is
+ * considered empty when its descriptor queue head and tail are equal.
+ *
+ * Return:
+ * 1 - empty, 0 - non-empty
+ */
+static inline int sdma_empty(struct sdma_engine *sde)
+{
+       return sde->descq_head == sde->descq_tail;
+}
+
+/* Number of free descriptor slots; one slot is always kept unused */
+static inline u16 sdma_descq_freecnt(struct sdma_engine *sde)
+{
+       u32 in_use = sde->descq_tail - ACCESS_ONCE(sde->descq_head);
+
+       return sde->descq_cnt - in_use - 1;
+}
+
+/* Ring size minus the free count reported by sdma_descq_freecnt() */
+static inline u16 sdma_descq_inprocess(struct sdma_engine *sde)
+{
+       u16 free_cnt = sdma_descq_freecnt(sde);
+
+       return sde->descq_cnt - free_cnt;
+}
+
+/*
+ * Lockless test for the running state.  The caller must hold either
+ * head_lock or tail_lock to see a steady state.
+ */
+static inline int __sdma_running(struct sdma_engine *engine)
+{
+       const struct sdma_state *ss = &engine->state;
+
+       return ss->current_state == sdma_state_s99_running;
+}
+
+/**
+ * sdma_running() - state suitability test
+ * @engine: sdma engine
+ *
+ * Takes the engine's tail_lock and checks whether the engine is in the
+ * running state, i.e. whether it is suitable for submitting packets.
+ *
+ * Return:
+ * 1 - ok to submit, 0 - not ok to submit
+ *
+ */
+static inline int sdma_running(struct sdma_engine *engine)
+{
+       unsigned long irq_flags;
+       int running;
+
+       spin_lock_irqsave(&engine->tail_lock, irq_flags);
+       running = __sdma_running(engine);
+       spin_unlock_irqrestore(&engine->tail_lock, irq_flags);
+
+       return running;
+}
+
+void _sdma_txreq_ahgadd(
+       struct sdma_txreq *tx,
+       u8 num_ahg,
+       u8 ahg_entry,
+       u32 *ahg,
+       u8 ahg_hlen);
+
+/**
+ * sdma_txinit_ahg() - initialize an sdma_txreq struct with AHG
+ * @tx: tx request to initialize
+ * @flags: flags to key last descriptor additions
+ * @tlen: total packet length (pbc + headers + data)
+ * @ahg_entry: ahg entry to use  (0 - 31)
+ * @num_ahg: ahg descriptor for first descriptor (0 - 9)
+ * @ahg: array of AHG descriptors (up to 9 entries)
+ * @ahg_hlen: number of bytes from ASIC entry to use
+ * @cb: callback
+ *
+ * The allocation of the sdma_txreq and its enclosing structure is user
+ * dependent.  This routine must be called to initialize the user independent
+ * fields.
+ *
+ * The currently supported flags are SDMA_TXREQ_F_URGENT,
+ * SDMA_TXREQ_F_AHG_COPY, and SDMA_TXREQ_F_USE_AHG.
+ *
+ * SDMA_TXREQ_F_URGENT is used for latency sensitive situations where the
+ * completion is desired as soon as possible.
+ *
+ * SDMA_TXREQ_F_AHG_COPY causes the header in the first descriptor to be
+ * copied to chip entry. SDMA_TXREQ_F_USE_AHG causes the code to add in
+ * the AHG descriptors into the first 1 to 3 descriptors.
+ *
+ * Completions of submitted requests can be gotten on selected
+ * txreqs by giving a completion routine callback to sdma_txinit() or
+ * sdma_txinit_ahg().  The environment in which the callback runs
+ * can be from an ISR, a tasklet, or a thread, so no sleeping
+ * kernel routines can be used.   Aspects of the sdma ring may
+ * be locked so care should be taken with locking.
+ *
+ * The callback pointer can be NULL to avoid any callback for the packet
+ * being submitted. The callback will be provided this tx and a status.
+ *
+ * The status will be one of SDMA_TXREQ_S_OK, SDMA_TXREQ_S_SENDERROR,
+ * SDMA_TXREQ_S_ABORTED, or SDMA_TXREQ_S_SHUTDOWN.
+ *
+ * NOTE(review): a flag argument (iowait sdma_busy count reached zero)
+ * was documented here, but the @cb type takes only a tx and a status.
+ *
+ * User data portion of tlen should be precise.   The sdma_txadd_* entrances
+ * will pad with a descriptor referencing 1 - 3 bytes when the number of bytes
+ * specified in tlen have been supplied to the sdma_txreq.
+ *
+ * ahg_hlen is used to determine the number of on-chip entry bytes to
+ * use as the header.   This is for cases where the stored header is
+ * larger than the header to be used in a packet.  This is typical
+ * for verbs where an RDMA_WRITE_FIRST is larger than the packet in
+ * an RDMA_WRITE_MIDDLE.
+ *
+ */
+static inline int sdma_txinit_ahg(
+       struct sdma_txreq *tx,
+       u16 flags,
+       u16 tlen,
+       u8 ahg_entry,
+       u8 num_ahg,
+       u32 *ahg,
+       u8 ahg_hlen,
+       void (*cb)(struct sdma_txreq *, int))
+{
+       if (tlen == 0)
+               return -ENODATA;
+       if (tlen > MAX_SDMA_PKT_SIZE)
+               return -EMSGSIZE;
+       tx->desc_limit = ARRAY_SIZE(tx->descs);
+       tx->descp = &tx->descs[0];
+       INIT_LIST_HEAD(&tx->list);
+       tx->num_desc = 0;
+       tx->flags = flags;
+       tx->complete = cb;
+       tx->coalesce_buf = NULL;
+       tx->wait = NULL;
+       tx->packet_len = tlen;
+       tx->tlen = tx->packet_len;
+       tx->descs[0].qw[0] = SDMA_DESC0_FIRST_DESC_FLAG;
+       tx->descs[0].qw[1] = 0;
+       if (flags & SDMA_TXREQ_F_AHG_COPY)
+               tx->descs[0].qw[1] |=
+                       (((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK)
+                               << SDMA_DESC1_HEADER_INDEX_SHIFT) |
+                       (((u64)SDMA_AHG_COPY & SDMA_DESC1_HEADER_MODE_MASK)
+                               << SDMA_DESC1_HEADER_MODE_SHIFT);
+       else if (flags & SDMA_TXREQ_F_USE_AHG && num_ahg)
+               _sdma_txreq_ahgadd(tx, num_ahg, ahg_entry, ahg, ahg_hlen);
+       return 0;
+}
+
+/**
+ * sdma_txinit() - initialize an sdma_txreq struct (no AHG)
+ * @tx: tx request to initialize
+ * @flags: flags to key last descriptor additions
+ * @tlen: total packet length (pbc + headers + data)
+ * @cb: callback pointer
+ *
+ * The allocation of the sdma_txreq and its enclosing structure is user
+ * dependent.  This routine must be called to initialize the user
+ * independent fields.
+ *
+ * The only currently supported flag is SDMA_TXREQ_F_URGENT.
+ *
+ * SDMA_TXREQ_F_URGENT is used for latency sensitive situations where the
+ * completion is desired as soon as possible.
+ *
+ * Completions of submitted requests can be gotten on selected
+ * txreqs by giving a completion routine callback to sdma_txinit() or
+ * sdma_txinit_ahg().  The environment in which the callback runs
+ * can be from an ISR, a tasklet, or a thread, so no sleeping
+ * kernel routines can be used.   The head size of the sdma ring may
+ * be locked so care should be taken with locking.
+ *
+ * The callback pointer can be NULL to avoid any callback for the packet
+ * being submitted.
+ *
+ * The callback, if non-NULL, will be provided this tx and a status.  The
+ * status will be one of SDMA_TXREQ_S_OK, SDMA_TXREQ_S_SENDERROR,
+ * SDMA_TXREQ_S_ABORTED, or SDMA_TXREQ_S_SHUTDOWN.
+ *
+ */
+static inline int sdma_txinit(
+       struct sdma_txreq *tx,
+       u16 flags,
+       u16 tlen,
+       void (*cb)(struct sdma_txreq *, int))
+{
+       return sdma_txinit_ahg(tx, flags, tlen, 0, 0, NULL, 0, cb);
+}
+
+/* helpers - don't use */
+static inline int sdma_mapping_type(struct sdma_desc *d)
+{
+       return (d->qw[1] & SDMA_DESC1_GENERATION_SMASK)
+               >> SDMA_DESC1_GENERATION_SHIFT;
+}
+
+static inline size_t sdma_mapping_len(struct sdma_desc *d)
+{
+       return (d->qw[0] & SDMA_DESC0_BYTE_COUNT_SMASK)
+               >> SDMA_DESC0_BYTE_COUNT_SHIFT;
+}
+
+static inline dma_addr_t sdma_mapping_addr(struct sdma_desc *d)
+{
+       return (d->qw[0] & SDMA_DESC0_PHY_ADDR_SMASK)
+               >> SDMA_DESC0_PHY_ADDR_SHIFT;
+}
+
+static inline void make_tx_sdma_desc(
+       struct sdma_txreq *tx,
+       int type,
+       dma_addr_t addr,
+       size_t len)
+{
+       struct sdma_desc *desc = &tx->descp[tx->num_desc];
+
+       if (!tx->num_desc) {
+               /* qw[0] zero; qw[1] first, ahg mode already in from init */
+               desc->qw[1] |= ((u64)type & SDMA_DESC1_GENERATION_MASK)
+                               << SDMA_DESC1_GENERATION_SHIFT;
+       } else {
+               desc->qw[0] = 0;
+               desc->qw[1] = ((u64)type & SDMA_DESC1_GENERATION_MASK)
+                               << SDMA_DESC1_GENERATION_SHIFT;
+       }
+       desc->qw[0] |= (((u64)addr & SDMA_DESC0_PHY_ADDR_MASK)
+                               << SDMA_DESC0_PHY_ADDR_SHIFT) |
+                       (((u64)len & SDMA_DESC0_BYTE_COUNT_MASK)
+                               << SDMA_DESC0_BYTE_COUNT_SHIFT);
+}
+
+/* helper to extend txreq */
+int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx,
+                          int type, void *kvaddr, struct page *page,
+                          unsigned long offset, u16 len);
+int _pad_sdma_tx_descs(struct hfi1_devdata *, struct sdma_txreq *);
+void sdma_txclean(struct hfi1_devdata *, struct sdma_txreq *);
+
+/* helpers used by public routines */
+static inline void _sdma_close_tx(struct hfi1_devdata *dd,
+                                 struct sdma_txreq *tx)
+{
+       tx->descp[tx->num_desc].qw[0] |=
+               SDMA_DESC0_LAST_DESC_FLAG;
+       tx->descp[tx->num_desc].qw[1] |=
+               dd->default_desc1;
+       if (tx->flags & SDMA_TXREQ_F_URGENT)
+               tx->descp[tx->num_desc].qw[1] |=
+                       (SDMA_DESC1_HEAD_TO_HOST_FLAG |
+                        SDMA_DESC1_INT_REQ_FLAG);
+}
+
+static inline int _sdma_txadd_daddr(
+       struct hfi1_devdata *dd,
+       int type,
+       struct sdma_txreq *tx,
+       dma_addr_t addr,
+       u16 len)
+{
+       int rval = 0;
+
+       make_tx_sdma_desc(
+               tx,
+               type,
+               addr, len);
+       WARN_ON(len > tx->tlen);
+       tx->tlen -= len;
+       /* special cases for last */
+       if (!tx->tlen) {
+               if (tx->packet_len & (sizeof(u32) - 1)) {
+                       rval = _pad_sdma_tx_descs(dd, tx);
+                       if (rval)
+                               return rval;
+               } else {
+                       _sdma_close_tx(dd, tx);
+               }
+       }
+       tx->num_desc++;
+       return rval;
+}
+
+/**
+ * sdma_txadd_page() - add a page to the sdma_txreq
+ * @dd: the device to use for mapping
+ * @tx: tx request to which the page is added
+ * @page: page to map
+ * @offset: offset within the page
+ * @len: length in bytes
+ *
+ * This is used to add a page/offset/length descriptor.
+ *
+ * The mapping/unmapping of the page/offset/len is automatically handled.
+ *
+ * Return:
+ * 0 - success, -ENOSPC - mapping fail, -ENOMEM - couldn't
+ * extend/coalesce descriptor array
+ */
+static inline int sdma_txadd_page(
+       struct hfi1_devdata *dd,
+       struct sdma_txreq *tx,
+       struct page *page,
+       unsigned long offset,
+       u16 len)
+{
+       dma_addr_t addr;
+       int rval;
+
+       if ((unlikely(tx->num_desc == tx->desc_limit))) {
+               rval = ext_coal_sdma_tx_descs(dd, tx, SDMA_MAP_PAGE,
+                                             NULL, page, offset, len);
+               if (rval <= 0)
+                       return rval;
+       }
+
+       addr = dma_map_page(
+                      &dd->pcidev->dev,
+                      page,
+                      offset,
+                      len,
+                      DMA_TO_DEVICE);
+
+       if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
+               sdma_txclean(dd, tx);
+               return -ENOSPC;
+       }
+
+       return _sdma_txadd_daddr(
+                       dd, SDMA_MAP_PAGE, tx, addr, len);
+}
+
+/**
+ * sdma_txadd_daddr() - add a dma address to the sdma_txreq
+ * @dd: the device to use for mapping
+ * @tx: sdma_txreq to which the descriptor is added
+ * @addr: dma address mapped by caller
+ * @len: length in bytes
+ *
+ * This is used to add a descriptor for memory that is already dma mapped.
+ *
+ * In this case, there is no unmapping as part of the progress processing for
+ * this memory location.
+ *
+ * Return:
+ * 0 - success, -ENOMEM - couldn't extend descriptor array
+ */
+
+static inline int sdma_txadd_daddr(
+       struct hfi1_devdata *dd,
+       struct sdma_txreq *tx,
+       dma_addr_t addr,
+       u16 len)
+{
+       int rval;
+
+       if ((unlikely(tx->num_desc == tx->desc_limit))) {
+               rval = ext_coal_sdma_tx_descs(dd, tx, SDMA_MAP_NONE,
+                                             NULL, NULL, 0, 0);
+               if (rval <= 0)
+                       return rval;
+       }
+
+       return _sdma_txadd_daddr(dd, SDMA_MAP_NONE, tx, addr, len);
+}
+
+/**
+ * sdma_txadd_kvaddr() - add a kernel virtual address to sdma_txreq
+ * @dd: the device to use for mapping
+ * @tx: sdma_txreq to which the descriptor is added
+ * @kvaddr: the kernel virtual address
+ * @len: length in bytes
+ *
+ * This is used to add a descriptor referenced by the indicated kvaddr and
+ * len.
+ *
+ * The mapping/unmapping of the kvaddr and len is automatically handled.
+ *
+ * Return:
+ * 0 - success, -ENOSPC - mapping fail, -ENOMEM - couldn't extend/coalesce
+ * descriptor array
+ */
+static inline int sdma_txadd_kvaddr(
+       struct hfi1_devdata *dd,
+       struct sdma_txreq *tx,
+       void *kvaddr,
+       u16 len)
+{
+       dma_addr_t addr;
+       int rval;
+
+       if ((unlikely(tx->num_desc == tx->desc_limit))) {
+               rval = ext_coal_sdma_tx_descs(dd, tx, SDMA_MAP_SINGLE,
+                                             kvaddr, NULL, 0, len);
+               if (rval <= 0)
+                       return rval;
+       }
+
+       addr = dma_map_single(
+                      &dd->pcidev->dev,
+                      kvaddr,
+                      len,
+                      DMA_TO_DEVICE);
+
+       if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
+               sdma_txclean(dd, tx);
+               return -ENOSPC;
+       }
+
+       return _sdma_txadd_daddr(
+                       dd, SDMA_MAP_SINGLE, tx, addr, len);
+}
+
+struct iowait;
+
+int sdma_send_txreq(struct sdma_engine *sde,
+                   struct iowait *wait,
+                   struct sdma_txreq *tx);
+int sdma_send_txlist(struct sdma_engine *sde,
+                    struct iowait *wait,
+                    struct list_head *tx_list);
+
+int sdma_ahg_alloc(struct sdma_engine *sde);
+void sdma_ahg_free(struct sdma_engine *sde, int ahg_index);
+
+/**
+ * sdma_build_ahg_descriptor() - build ahg descriptor
+ * @data: value placed in the descriptor's value field
+ * @dwindex: value placed in the descriptor's index field
+ * @startbit: value placed in the descriptor's field-start field
+ * @bits: value placed in the descriptor's field-length field
+ *
+ * Build and return a 32 bit descriptor.
+ */
+static inline u32 sdma_build_ahg_descriptor(
+       u16 data,
+       u8 dwindex,
+       u8 startbit,
+       u8 bits)
+{
+       return (u32)(1UL << SDMA_AHG_UPDATE_ENABLE_SHIFT |
+               ((startbit & SDMA_AHG_FIELD_START_MASK) <<
+               SDMA_AHG_FIELD_START_SHIFT) |
+               ((bits & SDMA_AHG_FIELD_LEN_MASK) <<
+               SDMA_AHG_FIELD_LEN_SHIFT) |
+               ((dwindex & SDMA_AHG_INDEX_MASK) <<
+               SDMA_AHG_INDEX_SHIFT) |
+               ((data & SDMA_AHG_VALUE_MASK) <<
+               SDMA_AHG_VALUE_SHIFT));
+}
+
+/**
+ * sdma_progress - use seq number to detect head progress
+ * @sde: sdma_engine to check
+ * @seq: base seq count
+ * @tx: txreq for which we need to check descriptor availability
+ *
+ * This is used in the appropriate spot in the sleep routine
+ * to check for potential ring progress.  This routine gets the
+ * seqcount before queuing the iowait structure for progress.
+ *
+ * If the seqcount indicates that progress needs to be checked,
+ * re-submission is detected by checking whether the descriptor
+ * queue has enough descriptors for the txreq.
+ */
+static inline unsigned sdma_progress(struct sdma_engine *sde, unsigned seq,
+                                    struct sdma_txreq *tx)
+{
+       if (read_seqretry(&sde->head_lock, seq)) {
+               sde->desc_avail = sdma_descq_freecnt(sde);
+               if (tx->num_desc > sde->desc_avail)
+                       return 0;
+               return 1;
+       }
+       return 0;
+}
+
+/**
+ * sdma_iowait_schedule() - schedule wait structure
+ * @sde: sdma_engine to schedule
+ * @wait: wait struct to schedule
+ *
+ * This function schedules the iowait
+ * structure embedded in the QP or PQ.
+ *
+ */
+static inline void sdma_iowait_schedule(
+       struct sdma_engine *sde,
+       struct iowait *wait)
+{
+       struct hfi1_pportdata *ppd = sde->dd->pport;
+
+       iowait_schedule(wait, ppd->hfi1_wq, sde->cpu);
+}
+
+/* for use by interrupt handling */
+void sdma_engine_error(struct sdma_engine *sde, u64 status);
+void sdma_engine_interrupt(struct sdma_engine *sde, u64 status);
+
+/*
+ *
+ * The diagram below details the relationship of the mapping structures
+ *
+ * Since the mapping now allows for non-uniform engines per vl, the
+ * number of engines for a vl is either the vl_engines[vl] or
+ * a computation based on num_sdma/num_vls:
+ *
+ * For example:
+ * nactual = vl_engines ? vl_engines[vl] : num_sdma/num_vls
+ *
+ * n = roundup to next highest power of 2 using nactual
+ *
+ * In the case where num_sdma/num_vls doesn't divide
+ * evenly, the extras are added from the last vl downward.
+ *
+ * For the case where n > nactual, the engines are assigned
+ * in a round robin fashion wrapping back to the first engine
+ * for a particular vl.
+ *
+ *               dd->sdma_map
+ *                    |                                   sdma_map_elem[0]
+ *                    |                                +--------------------+
+ *                    v                                |       mask         |
+ *               sdma_vl_map                           |--------------------|
+ *      +--------------------------+                   | sde[0] -> eng 1    |
+ *      |    list (RCU)            |                   |--------------------|
+ *      |--------------------------|                 ->| sde[1] -> eng 2    |
+ *      |    mask                  |              --/  |--------------------|
+ *      |--------------------------|            -/     |        *           |
+ *      |    actual_vls (max 8)    |          -/       |--------------------|
+ *      |--------------------------|       --/         | sde[n] -> eng n    |
+ *      |    vls (max 8)           |     -/            +--------------------+
+ *      |--------------------------|  --/
+ *      |    map[0]                |-/
+ *      |--------------------------|                   +--------------------+
+ *      |    map[1]                |---                |       mask         |
+ *      |--------------------------|   \----           |--------------------|
+ *      |           *              |        \--        | sde[0] -> eng 1+n  |
+ *      |           *              |           \----   |--------------------|
+ *      |           *              |                \->| sde[1] -> eng 2+n  |
+ *      |--------------------------|                   |--------------------|
+ *      |   map[vls - 1]           |-                  |         *          |
+ *      +--------------------------+ \-                |--------------------|
+ *                                     \-              | sde[m] -> eng m+n  |
+ *                                       \             +--------------------+
+ *                                        \-
+ *                                          \
+ *                                           \-        +--------------------+
+ *                                             \-      |       mask         |
+ *                                               \     |--------------------|
+ *                                                \-   | sde[0] -> eng 1+m+n|
+ *                                                  \- |--------------------|
+ *                                                    >| sde[1] -> eng 2+m+n|
+ *                                                     |--------------------|
+ *                                                     |         *          |
+ *                                                     |--------------------|
+ *                                                     | sde[o] -> eng o+m+n|
+ *                                                     +--------------------+
+ *
+ */
+
+/**
+ * struct sdma_map_elem - mapping for a vl
+ * @mask: selector mask
+ * @sde: array of engines for this vl
+ *
+ * The mask is used to "mod" the selector
+ * to produce index into the trailing
+ * array of sdes.
+ */
+struct sdma_map_elem {
+       u32 mask;
+       struct sdma_engine *sde[0];
+};
+
+/**
+ * struct sdma_vl_map - parent mapping structure for the vls
+ * @engine_to_vl: map of an engine to a vl
+ * @list: rcu head for free callback
+ * @mask: vl mask to "mod" the vl to produce an index to map array
+ * @actual_vls: number of vls
+ * @vls: number of vls rounded to next power of 2
+ * @map: array of sdma_map_elem entries
+ *
+ * This is the parent mapping structure.  The trailing
+ * members of the struct point to sdma_map_elem entries, which
+ * in turn point to an array of sde's for that vl.
+ */
+struct sdma_vl_map {
+       s8 engine_to_vl[TXE_NUM_SDMA_ENGINES];
+       struct rcu_head list;
+       u32 mask;
+       u8 actual_vls;
+       u8 vls;
+       struct sdma_map_elem *map[0];
+};
+
+int sdma_map_init(
+       struct hfi1_devdata *dd,
+       u8 port,
+       u8 num_vls,
+       u8 *vl_engines);
+
+/* slow path */
+void _sdma_engine_progress_schedule(struct sdma_engine *sde);
+
+/**
+ * sdma_engine_progress_schedule() - schedule progress on engine
+ * @sde: sdma_engine to schedule progress
+ *
+ * This is the fast path.
+ *
+ */
+static inline void sdma_engine_progress_schedule(
+       struct sdma_engine *sde)
+{
+       if (!sde || sdma_descq_inprocess(sde) < (sde->descq_cnt / 8))
+               return;
+       _sdma_engine_progress_schedule(sde);
+}
+
+struct sdma_engine *sdma_select_engine_sc(
+       struct hfi1_devdata *dd,
+       u32 selector,
+       u8 sc5);
+
+struct sdma_engine *sdma_select_engine_vl(
+       struct hfi1_devdata *dd,
+       u32 selector,
+       u8 vl);
+
+void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *);
+
+#ifdef CONFIG_SDMA_VERBOSITY
+void sdma_dumpstate(struct sdma_engine *);
+#endif
+static inline char *slashstrip(char *s)
+{
+       char *r = s;
+
+       while (*s)
+               if (*s++ == '/')
+                       r = s;
+       return r;
+}
+
+u16 sdma_get_descq_cnt(void);
+
+extern uint mod_num_sdma;
+
+void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid);
+
+#endif
diff --git a/drivers/infiniband/hw/hfi1/sdma_txreq.h b/drivers/infiniband/hw/hfi1/sdma_txreq.h
new file mode 100644 (file)
index 0000000..bf7d777
--- /dev/null
@@ -0,0 +1,135 @@
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef HFI1_SDMA_TXREQ_H
+#define HFI1_SDMA_TXREQ_H
+
+/* increased for AHG */
+#define NUM_DESC 6
+
+/*
+ * struct sdma_desc - canonical fragment descriptor
+ *
+ * This is the descriptor carried in the tx request
+ * corresponding to each fragment.
+ *
+ */
+struct sdma_desc {
+       /* private:  don't use directly */
+       u64 qw[2];
+};
+
+/**
+ * struct sdma_txreq - the sdma_txreq structure (one per packet)
+ * @list: for use by user and by queuing for wait
+ *
+ * This is the representation of a packet which consists of some
+ * number of fragments.   Storage is provided within the structure
+ * for all fragments.
+ *
+ * The storage for the descriptors is automatically extended as needed
+ * when the current allocation is exceeded.
+ *
+ * The user (Verbs or PSM) may overload this structure with fields
+ * specific to their use by putting this struct first in their struct.
+ * The method of allocation of the overloaded structure is user dependent.
+ *
+ * The list is the only public field in the structure.
+ *
+ */
+
+#define SDMA_TXREQ_S_OK        0
+#define SDMA_TXREQ_S_SENDERROR 1
+#define SDMA_TXREQ_S_ABORTED   2
+#define SDMA_TXREQ_S_SHUTDOWN  3
+
+/* flags bits */
+#define SDMA_TXREQ_F_URGENT       0x0001
+#define SDMA_TXREQ_F_AHG_COPY     0x0002
+#define SDMA_TXREQ_F_USE_AHG      0x0004
+
+struct sdma_txreq;
+typedef void (*callback_t)(struct sdma_txreq *, int);
+
+struct iowait;
+struct sdma_txreq {
+       struct list_head list;
+       /* private: */
+       struct sdma_desc *descp;
+       /* private: */
+       void *coalesce_buf;
+       /* private: */
+       struct iowait *wait;
+       /* private: */
+       callback_t                  complete;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+       u64 sn;
+#endif
+       /* private: - used in coalesce/pad processing */
+       u16                         packet_len;
+       /* private: - down-counted to trigger last */
+       u16                         tlen;
+       /* private: */
+       u16                         num_desc;
+       /* private: */
+       u16                         desc_limit;
+       /* private: */
+       u16                         next_descq_idx;
+       /* private: */
+       u16 coalesce_idx;
+       /* private: flags */
+       u16                         flags;
+       /* private: */
+       struct sdma_desc descs[NUM_DESC];
+};
+
+static inline int sdma_txreq_built(struct sdma_txreq *tx)
+{
+       return tx->num_desc;
+}
+
+#endif                          /* HFI1_SDMA_TXREQ_H */
diff --git a/drivers/infiniband/hw/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c
new file mode 100644 (file)
index 0000000..91fc2ae
--- /dev/null
@@ -0,0 +1,785 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/ctype.h>
+
+#include "hfi.h"
+#include "mad.h"
+#include "trace.h"
+
+/*
+ * Start of per-port congestion control structures and support code
+ */
+
+/*
+ * sysfs binary read handler for "cc_table_bin": the congestion control
+ * table size followed by the table entries.
+ *
+ * Copies up to @count bytes, starting @pos bytes into cc_state->cct, into
+ * @buf.  The readable length is one __be16 plus total_cct_entry shadow
+ * table entries.
+ *
+ * Returns the number of bytes copied (0 when nothing is left), or -EINVAL
+ * when @pos lies beyond the data or no congestion control state exists.
+ */
+static ssize_t read_cc_table_bin(struct file *filp, struct kobject *kobj,
+                                struct bin_attribute *bin_attr,
+                                char *buf, loff_t pos, size_t count)
+{
+       struct hfi1_pportdata *ppd =
+               container_of(kobj, struct hfi1_pportdata, pport_cc_kobj);
+       struct cc_state *state;
+       ssize_t copied = -EINVAL;
+       int avail;
+
+       avail = sizeof(__be16) +
+               ppd->total_cct_entry * sizeof(struct ib_cc_table_entry_shadow);
+
+       if (pos > avail)
+               return -EINVAL;
+
+       /* clamp the request to what remains after @pos */
+       if (count > avail - pos)
+               count = avail - pos;
+
+       if (count == 0)
+               return 0;
+
+       /* the state pointer is only dereferenced inside the RCU section */
+       rcu_read_lock();
+       state = get_cc_state(ppd);
+       if (state) {
+               memcpy(buf, (void *)&state->cct + pos, count);
+               copied = count;
+       }
+       rcu_read_unlock();
+
+       return copied;
+}
+
+/* kobject release hook shared by the per-port kobj_types in this file */
+static void port_release(struct kobject *kobj)
+{
+       /* nothing to do since memory is freed by hfi1_free_devdata() */
+}
+
+/*
+ * Read-only (0444) binary attribute "cc_table_bin", served by
+ * read_cc_table_bin(); its advertised size is PAGE_SIZE.
+ */
+static struct bin_attribute cc_table_bin_attr = {
+       .attr = {.name = "cc_table_bin", .mode = 0444},
+       .read = read_cc_table_bin,
+       .size = PAGE_SIZE,
+};
+
+/*
+ * sysfs binary read handler for "cc_settings_bin".
+ *
+ * Congestion settings: port control, control map and an array of 16
+ * entries for the congestion entries - increase, timer, event log
+ * trigger threshold and the minimum injection rate delay.
+ *
+ * Copies up to @count bytes, starting @pos bytes into
+ * cc_state->cong_setting, into @buf.  Returns the number of bytes copied
+ * (0 when nothing is left), or -EINVAL when @pos lies beyond the data or
+ * no congestion control state exists.
+ */
+static ssize_t read_cc_setting_bin(struct file *filp, struct kobject *kobj,
+                                  struct bin_attribute *bin_attr,
+                                  char *buf, loff_t pos, size_t count)
+{
+       struct hfi1_pportdata *ppd =
+               container_of(kobj, struct hfi1_pportdata, pport_cc_kobj);
+       struct cc_state *state;
+       ssize_t copied = -EINVAL;
+       int avail = sizeof(struct opa_congestion_setting_attr_shadow);
+
+       if (pos > avail)
+               return -EINVAL;
+
+       /* clamp the request to what remains after @pos */
+       if (count > avail - pos)
+               count = avail - pos;
+
+       if (count == 0)
+               return 0;
+
+       /* the state pointer is only dereferenced inside the RCU section */
+       rcu_read_lock();
+       state = get_cc_state(ppd);
+       if (state) {
+               memcpy(buf, (void *)&state->cong_setting + pos, count);
+               copied = count;
+       }
+       rcu_read_unlock();
+
+       return copied;
+}
+
+/*
+ * Read-only (0444) binary attribute served by read_cc_setting_bin().
+ * Note the file name is "cc_settings_bin" (plural), unlike the variable.
+ */
+static struct bin_attribute cc_setting_bin_attr = {
+       .attr = {.name = "cc_settings_bin", .mode = 0444},
+       .read = read_cc_setting_bin,
+       .size = PAGE_SIZE,
+};
+
+/*
+ * A per-port attribute: a struct attribute plus show/store handlers that
+ * take the port data rather than a kobject.
+ */
+struct hfi1_port_attr {
+       struct attribute attr;
+       ssize_t (*show)(struct hfi1_pportdata *, char *);
+       ssize_t (*store)(struct hfi1_pportdata *, const char *, size_t);
+};
+
+/* Report the port's cc_prescan setting as "on" or "off". */
+static ssize_t cc_prescan_show(struct hfi1_pportdata *ppd, char *buf)
+{
+       const char *state = "off";
+
+       if (ppd->cc_prescan)
+               state = "on";
+
+       return sprintf(buf, "%s\n", state);
+}
+
+/*
+ * Update cc_prescan from user input: a buffer beginning with "on" enables
+ * it, one beginning with "off" disables it.  Any other input leaves the
+ * setting untouched.  @count is returned in every case.
+ */
+static ssize_t cc_prescan_store(struct hfi1_pportdata *ppd, const char *buf,
+                               size_t count)
+{
+       if (memcmp(buf, "on", 2) == 0) {
+               ppd->cc_prescan = true;
+               return count;
+       }
+
+       if (memcmp(buf, "off", 3) == 0)
+               ppd->cc_prescan = false;
+
+       return count;
+}
+
+/* "cc_prescan" attribute, mode 0600, backed by the show/store pair above */
+static struct hfi1_port_attr cc_prescan_attr =
+               __ATTR(cc_prescan, 0600, cc_prescan_show, cc_prescan_store);
+
+/*
+ * sysfs_ops show entry for the congestion control kobject: recover the
+ * port data and the hfi1_port_attr wrapper, then call its ->show handler.
+ */
+static ssize_t cc_attr_show(struct kobject *kobj, struct attribute *attr,
+                           char *buf)
+{
+       struct hfi1_pportdata *port =
+               container_of(kobj, struct hfi1_pportdata, pport_cc_kobj);
+       struct hfi1_port_attr *pattr =
+               container_of(attr, struct hfi1_port_attr, attr);
+
+       return pattr->show(port, buf);
+}
+
+/*
+ * sysfs_ops store entry for the congestion control kobject: recover the
+ * port data and the hfi1_port_attr wrapper, then call its ->store handler.
+ */
+static ssize_t cc_attr_store(struct kobject *kobj, struct attribute *attr,
+                            const char *buf, size_t count)
+{
+       struct hfi1_pportdata *port =
+               container_of(kobj, struct hfi1_pportdata, pport_cc_kobj);
+       struct hfi1_port_attr *pattr =
+               container_of(attr, struct hfi1_port_attr, attr);
+
+       return pattr->store(port, buf, count);
+}
+
+/* show/store dispatchers used by port_cc_ktype */
+static const struct sysfs_ops port_cc_sysfs_ops = {
+       .show = cc_attr_show,
+       .store = cc_attr_store
+};
+
+/*
+ * Default attributes created for each port's congestion control kobject.
+ * The list handed to kobj_type.default_attrs is walked until a NULL entry
+ * is found, so it must be NULL-terminated like the other attribute lists
+ * in this file.
+ */
+static struct attribute *port_cc_default_attributes[] = {
+       &cc_prescan_attr.attr,
+       NULL
+};
+
+/* kobj_type of the per-port congestion control kobject (pport_cc_kobj) */
+static struct kobj_type port_cc_ktype = {
+       .release = port_release,
+       .sysfs_ops = &port_cc_sysfs_ops,
+       .default_attrs = port_cc_default_attributes
+};
+
+/* Start sc2vl */
+/*
+ * Define a read-only (0444) attribute hfi1_sc2vl_attr_N whose file name is
+ * the decimal text of N and whose .sc field is N.
+ */
+#define HFI1_SC2VL_ATTR(N)                                 \
+       static struct hfi1_sc2vl_attr hfi1_sc2vl_attr_##N = { \
+               .attr = { .name = __stringify(N), .mode = 0444 }, \
+               .sc = N \
+       }
+
+/* an attribute tagged with the sc index that sc2vl_attr_show() looks up */
+struct hfi1_sc2vl_attr {
+       struct attribute attr;
+       int sc;
+};
+
+/* one attribute per sc value, 0 through 31 */
+HFI1_SC2VL_ATTR(0);
+HFI1_SC2VL_ATTR(1);
+HFI1_SC2VL_ATTR(2);
+HFI1_SC2VL_ATTR(3);
+HFI1_SC2VL_ATTR(4);
+HFI1_SC2VL_ATTR(5);
+HFI1_SC2VL_ATTR(6);
+HFI1_SC2VL_ATTR(7);
+HFI1_SC2VL_ATTR(8);
+HFI1_SC2VL_ATTR(9);
+HFI1_SC2VL_ATTR(10);
+HFI1_SC2VL_ATTR(11);
+HFI1_SC2VL_ATTR(12);
+HFI1_SC2VL_ATTR(13);
+HFI1_SC2VL_ATTR(14);
+HFI1_SC2VL_ATTR(15);
+HFI1_SC2VL_ATTR(16);
+HFI1_SC2VL_ATTR(17);
+HFI1_SC2VL_ATTR(18);
+HFI1_SC2VL_ATTR(19);
+HFI1_SC2VL_ATTR(20);
+HFI1_SC2VL_ATTR(21);
+HFI1_SC2VL_ATTR(22);
+HFI1_SC2VL_ATTR(23);
+HFI1_SC2VL_ATTR(24);
+HFI1_SC2VL_ATTR(25);
+HFI1_SC2VL_ATTR(26);
+HFI1_SC2VL_ATTR(27);
+HFI1_SC2VL_ATTR(28);
+HFI1_SC2VL_ATTR(29);
+HFI1_SC2VL_ATTR(30);
+HFI1_SC2VL_ATTR(31);
+
+/* NULL-terminated default attribute list for hfi1_sc2vl_ktype */
+static struct attribute *sc2vl_default_attributes[] = {
+       &hfi1_sc2vl_attr_0.attr,
+       &hfi1_sc2vl_attr_1.attr,
+       &hfi1_sc2vl_attr_2.attr,
+       &hfi1_sc2vl_attr_3.attr,
+       &hfi1_sc2vl_attr_4.attr,
+       &hfi1_sc2vl_attr_5.attr,
+       &hfi1_sc2vl_attr_6.attr,
+       &hfi1_sc2vl_attr_7.attr,
+       &hfi1_sc2vl_attr_8.attr,
+       &hfi1_sc2vl_attr_9.attr,
+       &hfi1_sc2vl_attr_10.attr,
+       &hfi1_sc2vl_attr_11.attr,
+       &hfi1_sc2vl_attr_12.attr,
+       &hfi1_sc2vl_attr_13.attr,
+       &hfi1_sc2vl_attr_14.attr,
+       &hfi1_sc2vl_attr_15.attr,
+       &hfi1_sc2vl_attr_16.attr,
+       &hfi1_sc2vl_attr_17.attr,
+       &hfi1_sc2vl_attr_18.attr,
+       &hfi1_sc2vl_attr_19.attr,
+       &hfi1_sc2vl_attr_20.attr,
+       &hfi1_sc2vl_attr_21.attr,
+       &hfi1_sc2vl_attr_22.attr,
+       &hfi1_sc2vl_attr_23.attr,
+       &hfi1_sc2vl_attr_24.attr,
+       &hfi1_sc2vl_attr_25.attr,
+       &hfi1_sc2vl_attr_26.attr,
+       &hfi1_sc2vl_attr_27.attr,
+       &hfi1_sc2vl_attr_28.attr,
+       &hfi1_sc2vl_attr_29.attr,
+       &hfi1_sc2vl_attr_30.attr,
+       &hfi1_sc2vl_attr_31.attr,
+       NULL
+};
+
+/*
+ * Show handler for the sc2vl attributes: prints, as an unsigned decimal,
+ * the byte at offset ->sc of the device's sc2vl table.
+ */
+static ssize_t sc2vl_attr_show(struct kobject *kobj, struct attribute *attr,
+                              char *buf)
+{
+       struct hfi1_pportdata *ppd =
+               container_of(kobj, struct hfi1_pportdata, sc2vl_kobj);
+       struct hfi1_sc2vl_attr *sc_attr =
+               container_of(attr, struct hfi1_sc2vl_attr, attr);
+       /* the table is addressed one byte per sc */
+       u8 *table = (u8 *)ppd->dd->sc2vl;
+
+       return sprintf(buf, "%u\n", table[sc_attr->sc]);
+}
+
+/* read-only: the sc2vl attributes have a show handler and no store */
+static const struct sysfs_ops hfi1_sc2vl_ops = {
+       .show = sc2vl_attr_show,
+};
+
+/* kobj_type of the per-port "sc2vl" kobject */
+static struct kobj_type hfi1_sc2vl_ktype = {
+       .release = port_release,
+       .sysfs_ops = &hfi1_sc2vl_ops,
+       .default_attrs = sc2vl_default_attributes
+};
+
+/* End sc2vl */
+
+/* Start sl2sc */
+/*
+ * Define a read-only (0444) attribute hfi1_sl2sc_attr_N whose file name is
+ * the decimal text of N and whose .sl field is N.
+ */
+#define HFI1_SL2SC_ATTR(N)                                 \
+       static struct hfi1_sl2sc_attr hfi1_sl2sc_attr_##N = {     \
+               .attr = { .name = __stringify(N), .mode = 0444 }, \
+               .sl = N                                           \
+       }
+
+/* an attribute tagged with the sl index that sl2sc_attr_show() looks up */
+struct hfi1_sl2sc_attr {
+       struct attribute attr;
+       int sl;
+};
+
+/* one attribute per sl value, 0 through 31 */
+HFI1_SL2SC_ATTR(0);
+HFI1_SL2SC_ATTR(1);
+HFI1_SL2SC_ATTR(2);
+HFI1_SL2SC_ATTR(3);
+HFI1_SL2SC_ATTR(4);
+HFI1_SL2SC_ATTR(5);
+HFI1_SL2SC_ATTR(6);
+HFI1_SL2SC_ATTR(7);
+HFI1_SL2SC_ATTR(8);
+HFI1_SL2SC_ATTR(9);
+HFI1_SL2SC_ATTR(10);
+HFI1_SL2SC_ATTR(11);
+HFI1_SL2SC_ATTR(12);
+HFI1_SL2SC_ATTR(13);
+HFI1_SL2SC_ATTR(14);
+HFI1_SL2SC_ATTR(15);
+HFI1_SL2SC_ATTR(16);
+HFI1_SL2SC_ATTR(17);
+HFI1_SL2SC_ATTR(18);
+HFI1_SL2SC_ATTR(19);
+HFI1_SL2SC_ATTR(20);
+HFI1_SL2SC_ATTR(21);
+HFI1_SL2SC_ATTR(22);
+HFI1_SL2SC_ATTR(23);
+HFI1_SL2SC_ATTR(24);
+HFI1_SL2SC_ATTR(25);
+HFI1_SL2SC_ATTR(26);
+HFI1_SL2SC_ATTR(27);
+HFI1_SL2SC_ATTR(28);
+HFI1_SL2SC_ATTR(29);
+HFI1_SL2SC_ATTR(30);
+HFI1_SL2SC_ATTR(31);
+
+/* NULL-terminated default attribute list for hfi1_sl2sc_ktype */
+static struct attribute *sl2sc_default_attributes[] = {
+       &hfi1_sl2sc_attr_0.attr,
+       &hfi1_sl2sc_attr_1.attr,
+       &hfi1_sl2sc_attr_2.attr,
+       &hfi1_sl2sc_attr_3.attr,
+       &hfi1_sl2sc_attr_4.attr,
+       &hfi1_sl2sc_attr_5.attr,
+       &hfi1_sl2sc_attr_6.attr,
+       &hfi1_sl2sc_attr_7.attr,
+       &hfi1_sl2sc_attr_8.attr,
+       &hfi1_sl2sc_attr_9.attr,
+       &hfi1_sl2sc_attr_10.attr,
+       &hfi1_sl2sc_attr_11.attr,
+       &hfi1_sl2sc_attr_12.attr,
+       &hfi1_sl2sc_attr_13.attr,
+       &hfi1_sl2sc_attr_14.attr,
+       &hfi1_sl2sc_attr_15.attr,
+       &hfi1_sl2sc_attr_16.attr,
+       &hfi1_sl2sc_attr_17.attr,
+       &hfi1_sl2sc_attr_18.attr,
+       &hfi1_sl2sc_attr_19.attr,
+       &hfi1_sl2sc_attr_20.attr,
+       &hfi1_sl2sc_attr_21.attr,
+       &hfi1_sl2sc_attr_22.attr,
+       &hfi1_sl2sc_attr_23.attr,
+       &hfi1_sl2sc_attr_24.attr,
+       &hfi1_sl2sc_attr_25.attr,
+       &hfi1_sl2sc_attr_26.attr,
+       &hfi1_sl2sc_attr_27.attr,
+       &hfi1_sl2sc_attr_28.attr,
+       &hfi1_sl2sc_attr_29.attr,
+       &hfi1_sl2sc_attr_30.attr,
+       &hfi1_sl2sc_attr_31.attr,
+       NULL
+};
+
+/*
+ * Show handler for the sl2sc attributes: prints, as an unsigned decimal,
+ * entry ->sl of the port's sl_to_sc[] table.
+ */
+static ssize_t sl2sc_attr_show(struct kobject *kobj, struct attribute *attr,
+                              char *buf)
+{
+       struct hfi1_pportdata *ppd =
+               container_of(kobj, struct hfi1_pportdata, sl2sc_kobj);
+       struct hfi1_sl2sc_attr *sl_attr =
+               container_of(attr, struct hfi1_sl2sc_attr, attr);
+
+       return sprintf(buf, "%u\n", ppd->ibport_data.sl_to_sc[sl_attr->sl]);
+}
+
+/* read-only: the sl2sc attributes have a show handler and no store */
+static const struct sysfs_ops hfi1_sl2sc_ops = {
+       .show = sl2sc_attr_show,
+};
+
+/* kobj_type of the per-port "sl2sc" kobject */
+static struct kobj_type hfi1_sl2sc_ktype = {
+       .release = port_release,
+       .sysfs_ops = &hfi1_sl2sc_ops,
+       .default_attrs = sl2sc_default_attributes
+};
+
+/* End sl2sc */
+
+/* Start vl2mtu */
+
+/*
+ * Define a read-only (0444) attribute hfi1_vl2mtu_attr_N whose file name is
+ * the decimal text of N and whose .vl field is N.
+ */
+#define HFI1_VL2MTU_ATTR(N) \
+       static struct hfi1_vl2mtu_attr hfi1_vl2mtu_attr_##N = { \
+               .attr = { .name = __stringify(N), .mode = 0444 }, \
+               .vl = N                                           \
+       }
+
+/* an attribute tagged with the vl index that vl2mtu_attr_show() looks up */
+struct hfi1_vl2mtu_attr {
+       struct attribute attr;
+       int vl;
+};
+
+/* one attribute per vl value, 0 through 15 */
+HFI1_VL2MTU_ATTR(0);
+HFI1_VL2MTU_ATTR(1);
+HFI1_VL2MTU_ATTR(2);
+HFI1_VL2MTU_ATTR(3);
+HFI1_VL2MTU_ATTR(4);
+HFI1_VL2MTU_ATTR(5);
+HFI1_VL2MTU_ATTR(6);
+HFI1_VL2MTU_ATTR(7);
+HFI1_VL2MTU_ATTR(8);
+HFI1_VL2MTU_ATTR(9);
+HFI1_VL2MTU_ATTR(10);
+HFI1_VL2MTU_ATTR(11);
+HFI1_VL2MTU_ATTR(12);
+HFI1_VL2MTU_ATTR(13);
+HFI1_VL2MTU_ATTR(14);
+HFI1_VL2MTU_ATTR(15);
+
+/* NULL-terminated default attribute list for hfi1_vl2mtu_ktype */
+static struct attribute *vl2mtu_default_attributes[] = {
+       &hfi1_vl2mtu_attr_0.attr,
+       &hfi1_vl2mtu_attr_1.attr,
+       &hfi1_vl2mtu_attr_2.attr,
+       &hfi1_vl2mtu_attr_3.attr,
+       &hfi1_vl2mtu_attr_4.attr,
+       &hfi1_vl2mtu_attr_5.attr,
+       &hfi1_vl2mtu_attr_6.attr,
+       &hfi1_vl2mtu_attr_7.attr,
+       &hfi1_vl2mtu_attr_8.attr,
+       &hfi1_vl2mtu_attr_9.attr,
+       &hfi1_vl2mtu_attr_10.attr,
+       &hfi1_vl2mtu_attr_11.attr,
+       &hfi1_vl2mtu_attr_12.attr,
+       &hfi1_vl2mtu_attr_13.attr,
+       &hfi1_vl2mtu_attr_14.attr,
+       &hfi1_vl2mtu_attr_15.attr,
+       NULL
+};
+
+/*
+ * Show handler for the vl2mtu attributes: prints, as an unsigned decimal,
+ * the mtu stored in the device's vld[] entry for ->vl.
+ */
+static ssize_t vl2mtu_attr_show(struct kobject *kobj, struct attribute *attr,
+                               char *buf)
+{
+       struct hfi1_pportdata *ppd =
+               container_of(kobj, struct hfi1_pportdata, vl2mtu_kobj);
+       struct hfi1_vl2mtu_attr *vl_attr =
+               container_of(attr, struct hfi1_vl2mtu_attr, attr);
+
+       return sprintf(buf, "%u\n", ppd->dd->vld[vl_attr->vl].mtu);
+}
+
+/* read-only: the vl2mtu attributes have a show handler and no store */
+static const struct sysfs_ops hfi1_vl2mtu_ops = {
+       .show = vl2mtu_attr_show,
+};
+
+/* kobj_type of the per-port "vl2mtu" kobject */
+static struct kobj_type hfi1_vl2mtu_ktype = {
+       .release = port_release,
+       .sysfs_ops = &hfi1_vl2mtu_ops,
+       .default_attrs = vl2mtu_default_attributes
+};
+
+/* end of per-port file structures and support code */
+
+/*
+ * Start of per-unit (or driver, in some cases, but replicated
+ * per unit) functions (these get a device *)
+ */
+/* "hw_rev" show handler: prints the device's minrev in hexadecimal. */
+static ssize_t show_rev(struct device *device, struct device_attribute *attr,
+                       char *buf)
+{
+       struct hfi1_ibdev *ibd =
+               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+       struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+       return sprintf(buf, "%x\n", dd->minrev);
+}
+
+/*
+ * "board_id" show handler: prints the board name followed by a newline,
+ * or returns -EINVAL when no board name is set.
+ */
+static ssize_t show_hfi(struct device *device, struct device_attribute *attr,
+                       char *buf)
+{
+       struct hfi1_ibdev *ibd =
+               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+       struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+       if (!dd->boardname)
+               return -EINVAL;
+
+       return scnprintf(buf, PAGE_SIZE, "%s\n", dd->boardname);
+}
+
+/*
+ * "boardversion" show handler.  No newline is appended because the
+ * stored string is already newline-terminated.
+ */
+static ssize_t show_boardversion(struct device *device,
+                                struct device_attribute *attr, char *buf)
+{
+       struct hfi1_ibdev *ibd =
+               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+       const char *version = dd_from_dev(ibd)->boardversion;
+
+       return scnprintf(buf, PAGE_SIZE, "%s", version);
+}
+
+/*
+ * "nctxts" show handler.
+ *
+ * Prints the smaller of the receive-context count (num_rcv_contexts minus
+ * first_user_ctxt) and the SC_USER send-context count.  Normally, user
+ * level applications require both a send and a receive context, so the
+ * smaller of the two counts gives a more accurate picture of the total
+ * contexts available.
+ */
+static ssize_t show_nctxts(struct device *device,
+                          struct device_attribute *attr, char *buf)
+{
+       struct hfi1_ibdev *ibd =
+               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+       struct hfi1_devdata *dd = dd_from_dev(ibd);
+       u32 nctxts;
+
+       nctxts = min(dd->num_rcv_contexts - dd->first_user_ctxt,
+                    (u32)dd->sc_sizes[SC_USER].count);
+
+       return scnprintf(buf, PAGE_SIZE, "%u\n", nctxts);
+}
+
+/*
+ * "nfreectxts" show handler: prints the number of free user ports
+ * (contexts) available, taken from dd->freectxts.
+ */
+static ssize_t show_nfreectxts(struct device *device,
+                              struct device_attribute *attr, char *buf)
+{
+       struct hfi1_ibdev *ibd =
+               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+       struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+       return scnprintf(buf, PAGE_SIZE, "%u\n", dd->freectxts);
+}
+
+/* "serial" show handler: prints dd->serial as-is, adding no newline. */
+static ssize_t show_serial(struct device *device,
+                          struct device_attribute *attr, char *buf)
+{
+       struct hfi1_ibdev *ibd =
+               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+       struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+       return scnprintf(buf, PAGE_SIZE, "%s", dd->serial);
+}
+
+/*
+ * "chip_reset" store handler.
+ *
+ * Resets the unit when the written text begins with "reset" and
+ * dd->diag_client is set.  Returns @count on success, -EINVAL when either
+ * condition fails, or the negative value from hfi1_reset_device().
+ */
+static ssize_t store_chip_reset(struct device *device,
+                               struct device_attribute *attr, const char *buf,
+                               size_t count)
+{
+       struct hfi1_ibdev *ibd =
+               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+       struct hfi1_devdata *dd = dd_from_dev(ibd);
+       int err;
+
+       if (count < 5 || memcmp(buf, "reset", 5) || !dd->diag_client)
+               return -EINVAL;
+
+       err = hfi1_reset_device(dd->unit);
+       if (err < 0)
+               return err;
+
+       return count;
+}
+
+/*
+ * Format a temperature reported as an integer in units of 0.25C as
+ * decimal text with two fractional digits followed by a space
+ * (e.g. 5 -> "1.25 ").  No floating point is involved: the low two bits
+ * select the fraction and the remaining bits the whole degrees.
+ *
+ * Writes at (buf) + (idx), bounded by (size) - (idx), and evaluates to
+ * scnprintf()'s return value.  Note that (temp) is evaluated twice.
+ */
+#define temp2str(temp, buf, size, idx)                                 \
+       scnprintf((buf) + (idx), (size) - (idx), "%u.%02u ",            \
+                             ((temp) >> 2), ((temp) & 0x3) * 25)
+
+/*
+ * "tempsense" show handler: dump tempsense values, in decimal, to ease
+ * shell-scripts.
+ *
+ * The output is the current, low-limit, high-limit and critical-limit
+ * temperatures, then the three low trigger bits, each printed as its
+ * masked value (0 or 1, 0 or 2, 0 or 4).  Returns the number of bytes
+ * written, or the non-zero result of hfi1_tempsense_rd() on failure.
+ */
+static ssize_t show_tempsense(struct device *device,
+                             struct device_attribute *attr, char *buf)
+{
+       struct hfi1_ibdev *ibd =
+               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+       struct hfi1_devdata *dd = dd_from_dev(ibd);
+       struct hfi1_temp temp;
+       int len = 0;
+       int ret;
+
+       ret = hfi1_tempsense_rd(dd, &temp);
+       if (ret)
+               return ret;
+
+       len += temp2str(temp.curr, buf, PAGE_SIZE, len);
+       len += temp2str(temp.lo_lim, buf, PAGE_SIZE, len);
+       len += temp2str(temp.hi_lim, buf, PAGE_SIZE, len);
+       len += temp2str(temp.crit_lim, buf, PAGE_SIZE, len);
+       len += scnprintf(buf + len, PAGE_SIZE - len,
+                        "%u %u %u\n", temp.triggers & 0x1,
+                        temp.triggers & 0x2, temp.triggers & 0x4);
+
+       return len;
+}
+
+/*
+ * end of per-unit (or driver, in some cases, but replicated
+ * per unit) functions
+ */
+
+/* start of per-unit file structures and support code */
+/* all attributes are world-readable except chip_reset, which is owner-write-only */
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_hfi, NULL);
+static DEVICE_ATTR(nctxts, S_IRUGO, show_nctxts, NULL);
+static DEVICE_ATTR(nfreectxts, S_IRUGO, show_nfreectxts, NULL);
+static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL);
+static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL);
+static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL);
+static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset);
+
+/*
+ * Per-unit attributes created by hfi1_verbs_register_sysfs().  The list is
+ * iterated with ARRAY_SIZE(), so it carries no NULL terminator.
+ */
+static struct device_attribute *hfi1_attributes[] = {
+       &dev_attr_hw_rev,
+       &dev_attr_board_id,
+       &dev_attr_nctxts,
+       &dev_attr_nfreectxts,
+       &dev_attr_serial,
+       &dev_attr_boardversion,
+       &dev_attr_tempsense,
+       &dev_attr_chip_reset,
+};
+
+/*
+ * Create the per-port sysfs directories (sc2vl, sl2sc, vl2mtu, CCMgtA) and
+ * the congestion control binary files under @kobj for port @port_num.
+ *
+ * @ibdev: the IB device the port belongs to
+ * @port_num: 1-based port number
+ * @kobj: parent kobject the new directories are added under
+ *
+ * Returns 0 on success, -ENODEV for an invalid port number, or the error
+ * from the failing kobject/sysfs call.  On failure everything set up so
+ * far is released again.
+ */
+int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
+                          struct kobject *kobj)
+{
+       struct hfi1_pportdata *ppd;
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       int ret;
+
+       if (!port_num || port_num > dd->num_pports) {
+               dd_dev_err(dd,
+                          "Skipping infiniband class with invalid port %u\n",
+                          port_num);
+               return -ENODEV;
+       }
+       ppd = &dd->pport[port_num - 1];
+
+       /*
+        * kobject_init_and_add() requires kobject_put() even when it
+        * fails, so every failure below jumps to the label that puts the
+        * kobject which just failed, not only the earlier ones.
+        */
+       ret = kobject_init_and_add(&ppd->sc2vl_kobj, &hfi1_sc2vl_ktype, kobj,
+                                  "sc2vl");
+       if (ret) {
+               dd_dev_err(dd,
+                          "Skipping sc2vl sysfs info, (err %d) port %u\n",
+                          ret, port_num);
+               goto bail_sc2vl;
+       }
+       kobject_uevent(&ppd->sc2vl_kobj, KOBJ_ADD);
+
+       ret = kobject_init_and_add(&ppd->sl2sc_kobj, &hfi1_sl2sc_ktype, kobj,
+                                  "sl2sc");
+       if (ret) {
+               dd_dev_err(dd,
+                          "Skipping sl2sc sysfs info, (err %d) port %u\n",
+                          ret, port_num);
+               goto bail_sl2sc;
+       }
+       kobject_uevent(&ppd->sl2sc_kobj, KOBJ_ADD);
+
+       ret = kobject_init_and_add(&ppd->vl2mtu_kobj, &hfi1_vl2mtu_ktype, kobj,
+                                  "vl2mtu");
+       if (ret) {
+               dd_dev_err(dd,
+                          "Skipping vl2mtu sysfs info, (err %d) port %u\n",
+                          ret, port_num);
+               goto bail_vl2mtu;
+       }
+       kobject_uevent(&ppd->vl2mtu_kobj, KOBJ_ADD);
+
+       ret = kobject_init_and_add(&ppd->pport_cc_kobj, &port_cc_ktype,
+                                  kobj, "CCMgtA");
+       if (ret) {
+               dd_dev_err(dd,
+                          "Skipping Congestion Control sysfs info, (err %d) port %u\n",
+                          ret, port_num);
+               goto bail_cc;
+       }
+
+       kobject_uevent(&ppd->pport_cc_kobj, KOBJ_ADD);
+
+       ret = sysfs_create_bin_file(&ppd->pport_cc_kobj, &cc_setting_bin_attr);
+       if (ret) {
+               dd_dev_err(dd,
+                          "Skipping Congestion Control setting sysfs info, (err %d) port %u\n",
+                          ret, port_num);
+               goto bail_cc;
+       }
+
+       ret = sysfs_create_bin_file(&ppd->pport_cc_kobj, &cc_table_bin_attr);
+       if (ret) {
+               dd_dev_err(dd,
+                          "Skipping Congestion Control table sysfs info, (err %d) port %u\n",
+                          ret, port_num);
+               goto bail_cc_entry_bin;
+       }
+
+       dd_dev_info(dd,
+                   "Congestion Control Agent enabled for port %d\n",
+                   port_num);
+
+       return 0;
+
+       /* unwind in reverse order of creation */
+bail_cc_entry_bin:
+       sysfs_remove_bin_file(&ppd->pport_cc_kobj,
+                             &cc_setting_bin_attr);
+bail_cc:
+       kobject_put(&ppd->pport_cc_kobj);
+bail_vl2mtu:
+       kobject_put(&ppd->vl2mtu_kobj);
+bail_sl2sc:
+       kobject_put(&ppd->sl2sc_kobj);
+bail_sc2vl:
+       kobject_put(&ppd->sc2vl_kobj);
+       return ret;
+}
+
+/*
+ * Register and create our files in /sys/class/infiniband.
+ *
+ * Creates every entry of hfi1_attributes[] on the verbs device.  If any
+ * creation fails, all entries of the list are removed again and the error
+ * is returned; otherwise returns 0.
+ */
+int hfi1_verbs_register_sysfs(struct hfi1_devdata *dd)
+{
+       struct ib_device *ibdev = &dd->verbs_dev.rdi.ibdev;
+       int ret = 0;
+       int idx;
+
+       for (idx = 0; idx < ARRAY_SIZE(hfi1_attributes); ++idx) {
+               ret = device_create_file(&ibdev->dev, hfi1_attributes[idx]);
+               if (ret)
+                       break;
+       }
+
+       if (!ret)
+               return 0;
+
+       /* failure: remove the whole list, as the original error path did */
+       for (idx = 0; idx < ARRAY_SIZE(hfi1_attributes); ++idx)
+               device_remove_file(&ibdev->dev, hfi1_attributes[idx]);
+
+       return ret;
+}
+
+/*
+ * Unregister and remove our files in /sys/class/infiniband.
+ *
+ * For every port: remove the two congestion control binary files, then
+ * drop the references on the CCMgtA, vl2mtu, sl2sc and sc2vl kobjects.
+ */
+void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *dd)
+{
+       int port;
+
+       for (port = 0; port < dd->num_pports; port++) {
+               struct hfi1_pportdata *ppd = &dd->pport[port];
+               struct kobject *cc_kobj = &ppd->pport_cc_kobj;
+
+               sysfs_remove_bin_file(cc_kobj, &cc_setting_bin_attr);
+               sysfs_remove_bin_file(cc_kobj, &cc_table_bin_attr);
+               kobject_put(cc_kobj);
+               kobject_put(&ppd->vl2mtu_kobj);
+               kobject_put(&ppd->sl2sc_kobj);
+               kobject_put(&ppd->sc2vl_kobj);
+       }
+}
diff --git a/drivers/infiniband/hw/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c
new file mode 100644 (file)
index 0000000..79b2952
--- /dev/null
@@ -0,0 +1,243 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+/*
+ * Return the number of header bytes beyond the first 20 (12 + 8) for the
+ * packet's opcode, per hdr_len_by_opcode[], or 0 when the table holds no
+ * length for that opcode.
+ *
+ * The other-headers block is found via the low two bits of lrh[0]; the
+ * opcode is the top byte of bth[0].
+ * NOTE(review): 12 + 8 presumably covers the BTH and LRH sizes - confirm.
+ */
+u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr)
+{
+       u8 lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
+       struct hfi1_other_headers *ohdr =
+               (lnh == HFI1_LRH_BTH) ? &hdr->u.oth : &hdr->u.l.oth;
+       u8 opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+
+       if (hdr_len_by_opcode[opcode] == 0)
+               return 0;
+
+       return hdr_len_by_opcode[opcode] - (12 + 8);
+}
+
+/* printf-style format fragments used by parse_everbs_hdrs(), one per header */
+#define IMM_PRN  "imm %d"
+#define RETH_PRN "reth vaddr 0x%.16llx rkey 0x%.8x dlen 0x%.8x"
+#define AETH_PRN "aeth syn 0x%.2x %s msn 0x%.8x"
+#define DETH_PRN "deth qkey 0x%.8x sqpn 0x%.6x"
+#define IETH_PRN "ieth rkey 0x%.8x"
+#define ATOMICACKETH_PRN "origdata %lld"
+#define ATOMICETH_PRN "vaddr 0x%llx rkey 0x%.8x sdata %lld cdata %lld"
+
+/* build an IB_OPCODE_<transport>_<op> constant name by token pasting */
+#define OP(transport, op) IB_OPCODE_## transport ## _ ## op
+
+/*
+ * Assemble a CPU-order u64 from two consecutive big-endian 32-bit words,
+ * with p[0] supplying the upper half and p[1] the lower half.
+ */
+static u64 ib_u64_get(__be32 *p)
+{
+       u64 upper = be32_to_cpu(p[0]);
+       u64 lower = be32_to_cpu(p[1]);
+
+       return (upper << 32) | lower;
+}
+
+/*
+ * Name the top three bits of a syndrome byte: 0 is "ACK", 1 is "RNRNAK",
+ * 3 is "NAK"; every other value yields an empty string.
+ */
+static const char *parse_syndrome(u8 syndrome)
+{
+       u8 type = syndrome >> 5;
+
+       if (type == 0)
+               return "ACK";
+       if (type == 1)
+               return "RNRNAK";
+       if (type == 3)
+               return "NAK";
+
+       return "";
+}
+
+/*
+ * Format the extended headers in @ehdrs for @opcode into the trace
+ * sequence @p.
+ *
+ * @p: trace sequence the text is appended to
+ * @opcode: opcode selecting which extended header layout applies
+ * @ehdrs: pointer to the extended headers, read as union ib_ehdrs
+ *
+ * All header fields are big-endian on the wire and are converted to CPU
+ * order before printing.  Opcodes without a case here produce an empty
+ * string.  Returns a pointer to the start of the NUL-terminated text
+ * inside @p's buffer.
+ */
+const char *parse_everbs_hdrs(
+       struct trace_seq *p,
+       u8 opcode,
+       void *ehdrs)
+{
+       union ib_ehdrs *eh = ehdrs;
+       const char *ret = trace_seq_buffer_ptr(p);
+
+       switch (opcode) {
+       /* imm */
+       case OP(RC, SEND_LAST_WITH_IMMEDIATE):
+       case OP(UC, SEND_LAST_WITH_IMMEDIATE):
+       case OP(RC, SEND_ONLY_WITH_IMMEDIATE):
+       case OP(UC, SEND_ONLY_WITH_IMMEDIATE):
+       case OP(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE):
+       case OP(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE):
+               trace_seq_printf(p, IMM_PRN,
+                                be32_to_cpu(eh->imm_data));
+               break;
+       /* reth + imm */
+       case OP(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+       case OP(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+               trace_seq_printf(p, RETH_PRN " " IMM_PRN,
+                                (unsigned long long)ib_u64_get(
+                                (__be32 *)&eh->rc.reth.vaddr),
+                                be32_to_cpu(eh->rc.reth.rkey),
+                                be32_to_cpu(eh->rc.reth.length),
+                                be32_to_cpu(eh->rc.imm_data));
+               break;
+       /* reth */
+       case OP(RC, RDMA_READ_REQUEST):
+       case OP(RC, RDMA_WRITE_FIRST):
+       case OP(UC, RDMA_WRITE_FIRST):
+       case OP(RC, RDMA_WRITE_ONLY):
+       case OP(UC, RDMA_WRITE_ONLY):
+               trace_seq_printf(p, RETH_PRN,
+                                (unsigned long long)ib_u64_get(
+                                (__be32 *)&eh->rc.reth.vaddr),
+                                be32_to_cpu(eh->rc.reth.rkey),
+                                be32_to_cpu(eh->rc.reth.length));
+               break;
+       /* aeth */
+       case OP(RC, RDMA_READ_RESPONSE_FIRST):
+       case OP(RC, RDMA_READ_RESPONSE_LAST):
+       case OP(RC, RDMA_READ_RESPONSE_ONLY):
+       case OP(RC, ACKNOWLEDGE):
+               trace_seq_printf(p, AETH_PRN, be32_to_cpu(eh->aeth) >> 24,
+                                parse_syndrome(be32_to_cpu(eh->aeth) >> 24),
+                                be32_to_cpu(eh->aeth) & HFI1_MSN_MASK);
+               break;
+       /* aeth + atomicacketh */
+       case OP(RC, ATOMIC_ACKNOWLEDGE):
+               trace_seq_printf(p, AETH_PRN " " ATOMICACKETH_PRN,
+                                be32_to_cpu(eh->at.aeth) >> 24,
+                                parse_syndrome(be32_to_cpu(eh->at.aeth) >> 24),
+                                be32_to_cpu(eh->at.aeth) & HFI1_MSN_MASK,
+                                (unsigned long long)
+                                ib_u64_get(eh->at.atomic_ack_eth));
+               break;
+       /* atomiceth */
+       case OP(RC, COMPARE_SWAP):
+       case OP(RC, FETCH_ADD):
+               /* rkey is converted like every other rkey printed here */
+               trace_seq_printf(p, ATOMICETH_PRN,
+                                (unsigned long long)ib_u64_get(
+                                eh->atomic_eth.vaddr),
+                                be32_to_cpu(eh->atomic_eth.rkey),
+                                (unsigned long long)ib_u64_get(
+                                (__be32 *)&eh->atomic_eth.swap_data),
+                                (unsigned long long)ib_u64_get(
+                                (__be32 *)&eh->atomic_eth.compare_data));
+               break;
+       /* deth */
+       case OP(UD, SEND_ONLY):
+       case OP(UD, SEND_ONLY_WITH_IMMEDIATE):
+               trace_seq_printf(p, DETH_PRN,
+                                be32_to_cpu(eh->ud.deth[0]),
+                                be32_to_cpu(eh->ud.deth[1]) & RVT_QPN_MASK);
+               break;
+       /* ieth */
+       case OP(RC, SEND_LAST_WITH_INVALIDATE):
+       case OP(RC, SEND_ONLY_WITH_INVALIDATE):
+               trace_seq_printf(p, IETH_PRN,
+                                be32_to_cpu(eh->ieth));
+               break;
+       }
+       /* terminate so the returned pointer is a complete C string */
+       trace_seq_putc(p, 0);
+       return ret;
+}
+
+/*
+ * Format the flag bits of an SDMA descriptor into the trace sequence @p.
+ *
+ * @p: trace sequence the text is appended to
+ * @desc0: first descriptor quadword (FIRST/LAST flags)
+ * @desc1: second descriptor quadword (INT_REQ/HEAD_TO_HOST flags and the
+ *         header mode, index and dws fields)
+ *
+ * Emits four characters, 'I', 'H', 'F', 'L' or '-' for a clear flag, and,
+ * when the FIRST flag is set, the header mode/index/dws fields.  Returns a
+ * pointer to the start of the NUL-terminated text inside @p's buffer.
+ */
+const char *parse_sdma_flags(
+       struct trace_seq *p,
+       u64 desc0, u64 desc1)
+{
+       const char *ret = trace_seq_buffer_ptr(p);
+       char flags[5] = { 'x', 'x', 'x', 'x', 0 };
+
+       flags[0] = (desc1 & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
+       flags[1] = (desc1 & SDMA_DESC1_HEAD_TO_HOST_FLAG) ?  'H' : '-';
+       flags[2] = (desc0 & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
+       flags[3] = (desc0 & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
+       trace_seq_printf(p, "%s", flags);
+       if (desc0 & SDMA_DESC0_FIRST_DESC_FLAG)
+               trace_seq_printf(p, " amode:%u aidx:%u alen:%u",
+                                (u8)((desc1 >> SDMA_DESC1_HEADER_MODE_SHIFT) &
+                                     SDMA_DESC1_HEADER_MODE_MASK),
+                                (u8)((desc1 >> SDMA_DESC1_HEADER_INDEX_SHIFT) &
+                                     SDMA_DESC1_HEADER_INDEX_MASK),
+                                (u8)((desc1 >> SDMA_DESC1_HEADER_DWS_SHIFT) &
+                                     SDMA_DESC1_HEADER_DWS_MASK));
+       /*
+        * Terminate explicitly, as the other helpers in this file do, so
+        * the returned string stays bounded if more text is later appended
+        * to the same trace sequence.
+        */
+       trace_seq_putc(p, 0);
+       return ret;
+}
+
+/*
+ * Append the @len elements of @arr to @p as space-separated "%#x" values
+ * and NUL-terminate.  Returns a pointer to the start of that text inside
+ * @p's buffer.
+ */
+const char *print_u32_array(
+       struct trace_seq *p,
+       u32 *arr, int len)
+{
+       const char *start = trace_seq_buffer_ptr(p);
+       const char *sep = "";
+       int idx;
+
+       for (idx = 0; idx < len; idx++) {
+               trace_seq_printf(p, "%s%#x", sep, arr[idx]);
+               /* every element after the first is preceded by a space */
+               sep = " ";
+       }
+       trace_seq_putc(p, 0);
+
+       return start;
+}
+
+/*
+ * Append the @len elements of @arr to @p as space-separated
+ * "0x%016llx" values and NUL-terminate.  Returns a pointer to the start
+ * of that text inside @p's buffer.
+ */
+const char *print_u64_array(
+       struct trace_seq *p,
+       u64 *arr, int len)
+{
+       const char *start = trace_seq_buffer_ptr(p);
+       const char *sep = "";
+       int idx;
+
+       for (idx = 0; idx < len; idx++) {
+               trace_seq_printf(p, "%s0x%016llx", sep, arr[idx]);
+               /* every element after the first is preceded by a space */
+               sep = " ";
+       }
+       trace_seq_putc(p, 0);
+
+       return start;
+}
+
+/*
+ * Instantiate __hfi1_trace_fn once per trace category.
+ * NOTE(review): the macro is defined outside this file (presumably in
+ * trace.h) - confirm what each instantiation expands to.
+ */
+__hfi1_trace_fn(PKT);
+__hfi1_trace_fn(PROC);
+__hfi1_trace_fn(SDMA);
+__hfi1_trace_fn(LINKVERB);
+__hfi1_trace_fn(DEBUG);
+__hfi1_trace_fn(SNOOP);
+__hfi1_trace_fn(CNTR);
+__hfi1_trace_fn(PIO);
+__hfi1_trace_fn(DC8051);
+__hfi1_trace_fn(FIRMWARE);
+__hfi1_trace_fn(RCVCTRL);
+__hfi1_trace_fn(TID);
+__hfi1_trace_fn(MMU);
+__hfi1_trace_fn(IOCTL);
diff --git a/drivers/infiniband/hw/hfi1/trace.h b/drivers/infiniband/hw/hfi1/trace.h
new file mode 100644 (file)
index 0000000..28c1d08
--- /dev/null
@@ -0,0 +1,1372 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#undef TRACE_SYSTEM_VAR
+#define TRACE_SYSTEM_VAR hfi1
+
+#if !defined(__HFI1_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "hfi.h"
+#include "mad.h"
+#include "sdma.h"
+
+/*
+ * Helpers that store the device name of @dd (dev_name() of its PCI
+ * device) in a trace entry as the string field "dev".  DD_DEV_ENTRY
+ * declares the field inside TP_STRUCT__entry; DD_DEV_ASSIGN fills it
+ * inside TP_fast_assign.  TP_printk reads it back with __get_str(dev).
+ */
+#define DD_DEV_ENTRY(dd)       __string(dev, dev_name(&(dd)->pcidev->dev))
+#define DD_DEV_ASSIGN(dd)      __assign_str(dev, dev_name(&(dd)->pcidev->dev))
+
+/*
+ * packettype_name() builds one { value, "name" } pair from an
+ * RHF_RCV_TYPE_* constant; show_packettype() uses those pairs with
+ * __print_symbolic() to print the name of a receive packet type.
+ */
+#define packettype_name(etype) { RHF_RCV_TYPE_##etype, #etype }
+#define show_packettype(etype)                  \
+__print_symbolic(etype,                         \
+       packettype_name(EXPECTED),              \
+       packettype_name(EAGER),                 \
+       packettype_name(IB),                    \
+       packettype_name(ERROR),                 \
+       packettype_name(BYPASS))
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_rx
+
+/*
+ * hfi1_rcvhdr - record the values passed by the caller for one received
+ * header: ctxt, eflags, etype, hlen, tlen, updegr and etail, plus the
+ * device name of @dd.  etype is printed both as a number and, through
+ * show_packettype(), as a symbolic name.
+ */
+TRACE_EVENT(hfi1_rcvhdr,
+           TP_PROTO(struct hfi1_devdata *dd,
+                    u32 ctxt,
+                    u64 eflags,
+                    u32 etype,
+                    u32 hlen,
+                    u32 tlen,
+                    u32 updegr,
+                    u32 etail
+                    ),
+           TP_ARGS(dd, ctxt, eflags, etype, hlen, tlen, updegr, etail),
+           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+                            __field(u64, eflags)
+                            __field(u32, ctxt)
+                            __field(u32, etype)
+                            __field(u32, hlen)
+                            __field(u32, tlen)
+                            __field(u32, updegr)
+                            __field(u32, etail)
+                            ),
+           TP_fast_assign(DD_DEV_ASSIGN(dd);
+                          __entry->eflags = eflags;
+                          __entry->ctxt = ctxt;
+                          __entry->etype = etype;
+                          __entry->hlen = hlen;
+                          __entry->tlen = tlen;
+                          __entry->updegr = updegr;
+                          __entry->etail = etail;
+                          ),
+           TP_printk(
+                     "[%s] ctxt %d eflags 0x%llx etype %d,%s hlen %d tlen %d updegr %d etail %d",
+                     __get_str(dev),
+                     __entry->ctxt,
+                     __entry->eflags,
+                     __entry->etype, show_packettype(__entry->etype),
+                     __entry->hlen,
+                     __entry->tlen,
+                     __entry->updegr,
+                     __entry->etail
+                     )
+);
+
+TRACE_EVENT(hfi1_receive_interrupt,
+           TP_PROTO(struct hfi1_devdata *dd, u32 ctxt),
+           TP_ARGS(dd, ctxt),
+           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+                            __field(u32, ctxt)
+                            __field(u8, slow_path)
+                            __field(u8, dma_rtail)
+                            ),
+           TP_fast_assign(DD_DEV_ASSIGN(dd);
+                          __entry->ctxt = ctxt;
+                          if (dd->rcd[ctxt]->do_interrupt ==
+                              &handle_receive_interrupt) {
+                               __entry->slow_path = 1;
+                               __entry->dma_rtail = 0xFF;
+                          } else if (dd->rcd[ctxt]->do_interrupt ==
+                                     &handle_receive_interrupt_dma_rtail){
+                               __entry->dma_rtail = 1;
+                               __entry->slow_path = 0;
+                          } else if (dd->rcd[ctxt]->do_interrupt ==
+                                     &handle_receive_interrupt_nodma_rtail) {
+                               __entry->dma_rtail = 0;
+                               __entry->slow_path = 0;
+                          }
+                          ),
+           TP_printk("[%s] ctxt %d SlowPath: %d DmaRtail: %d",
+                     __get_str(dev),
+                     __entry->ctxt,
+                     __entry->slow_path,
+                     __entry->dma_rtail
+                     )
+);
+
+/*
+ * hfi1_exp_tid_reg_unreg - event class shared by hfi1_exp_tid_reg and
+ * hfi1_exp_tid_unreg.
+ *
+ * Both events take the same arguments and print the same line, so they
+ * share one class instead of two duplicated TRACE_EVENT bodies.  The
+ * entry stores ctxt, subctxt, rarr, npages, va, pa and dma exactly as
+ * passed in.  In the printed line the address after "@" is pa, followed
+ * by va and dma.
+ */
+DECLARE_EVENT_CLASS(hfi1_exp_tid_reg_unreg,
+                   TP_PROTO(unsigned ctxt, u16 subctxt, u32 rarr,
+                            u32 npages, unsigned long va, unsigned long pa,
+                            dma_addr_t dma),
+                   TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
+                   TP_STRUCT__entry(
+                           __field(unsigned, ctxt)
+                           __field(u16, subctxt)
+                           __field(u32, rarr)
+                           __field(u32, npages)
+                           __field(unsigned long, va)
+                           __field(unsigned long, pa)
+                           __field(dma_addr_t, dma)
+                           ),
+                   TP_fast_assign(
+                           __entry->ctxt = ctxt;
+                           __entry->subctxt = subctxt;
+                           __entry->rarr = rarr;
+                           __entry->npages = npages;
+                           __entry->va = va;
+                           __entry->pa = pa;
+                           __entry->dma = dma;
+                           ),
+                   TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
+                             __entry->ctxt,
+                             __entry->subctxt,
+                             __entry->rarr,
+                             __entry->npages,
+                             __entry->pa,
+                             __entry->va,
+                             __entry->dma
+                             )
+);
+
+DEFINE_EVENT(hfi1_exp_tid_reg_unreg, hfi1_exp_tid_reg,
+            TP_PROTO(unsigned ctxt, u16 subctxt, u32 rarr,
+                     u32 npages, unsigned long va, unsigned long pa,
+                     dma_addr_t dma),
+            TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma));
+
+DEFINE_EVENT(hfi1_exp_tid_reg_unreg, hfi1_exp_tid_unreg,
+            TP_PROTO(unsigned ctxt, u16 subctxt, u32 rarr,
+                     u32 npages, unsigned long va, unsigned long pa,
+                     dma_addr_t dma),
+            TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma));
+
+/*
+ * hfi1_exp_tid_inval - record ctxt, subctxt, va, rarr, npages and dma
+ * as passed in.  In the printed line the address after "@" is va.
+ */
+TRACE_EVENT(hfi1_exp_tid_inval,
+           TP_PROTO(unsigned ctxt, u16 subctxt, unsigned long va, u32 rarr,
+                    u32 npages, dma_addr_t dma),
+           TP_ARGS(ctxt, subctxt, va, rarr, npages, dma),
+           TP_STRUCT__entry(
+                   __field(unsigned, ctxt)
+                   __field(u16, subctxt)
+                   __field(unsigned long, va)
+                   __field(u32, rarr)
+                   __field(u32, npages)
+                   __field(dma_addr_t, dma)
+                   ),
+           TP_fast_assign(
+                   __entry->ctxt = ctxt;
+                   __entry->subctxt = subctxt;
+                   __entry->va = va;
+                   __entry->rarr = rarr;
+                   __entry->npages = npages;
+                   __entry->dma = dma;
+                   ),
+           TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx",
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->rarr,
+                     __entry->npages,
+                     __entry->va,
+                     __entry->dma
+                   )
+       );
+
+/*
+ * hfi1_mmu_invalidate - record ctxt, subctxt, a copy of the @type
+ * string, and the start/end values passed by the caller.
+ */
+TRACE_EVENT(hfi1_mmu_invalidate,
+           TP_PROTO(unsigned ctxt, u16 subctxt, const char *type,
+                    unsigned long start, unsigned long end),
+           TP_ARGS(ctxt, subctxt, type, start, end),
+           TP_STRUCT__entry(
+                   __field(unsigned, ctxt)
+                   __field(u16, subctxt)
+                   __string(type, type)
+                   __field(unsigned long, start)
+                   __field(unsigned long, end)
+                   ),
+           TP_fast_assign(
+                   __entry->ctxt = ctxt;
+                   __entry->subctxt = subctxt;
+                   __assign_str(type, type);
+                   __entry->start = start;
+                   __entry->end = end;
+                   ),
+           TP_printk("[%3u:%02u] MMU Invalidate (%s) 0x%lx - 0x%lx",
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __get_str(type),
+                     __entry->start,
+                     __entry->end
+                   )
+       );
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_tx
+
+/*
+ * hfi1_piofree - record sw_index and hw_context of send context @sc,
+ * the caller-supplied @extra value, and the device name of sc->dd.
+ * Printed as "ctxt <sw_index>(<hw_context>) extra <extra>".
+ */
+TRACE_EVENT(hfi1_piofree,
+           TP_PROTO(struct send_context *sc, int extra),
+           TP_ARGS(sc, extra),
+           TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd)
+                            __field(u32, sw_index)
+                            __field(u32, hw_context)
+                            __field(int, extra)
+                            ),
+           TP_fast_assign(DD_DEV_ASSIGN(sc->dd);
+                          __entry->sw_index = sc->sw_index;
+                          __entry->hw_context = sc->hw_context;
+                          __entry->extra = extra;
+                          ),
+           TP_printk("[%s] ctxt %u(%u) extra %d",
+                     __get_str(dev),
+                     __entry->sw_index,
+                     __entry->hw_context,
+                     __entry->extra
+                     )
+);
+
+/*
+ * hfi1_wantpiointr - record sw_index and hw_context of send context
+ * @sc together with the caller-supplied @needint and @credit_ctrl
+ * values and the device name of sc->dd.  @needint is printed after
+ * "on"; credit_ctrl is printed in hex.
+ */
+TRACE_EVENT(hfi1_wantpiointr,
+           TP_PROTO(struct send_context *sc, u32 needint, u64 credit_ctrl),
+           TP_ARGS(sc, needint, credit_ctrl),
+           TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd)
+                            __field(u32, sw_index)
+                            __field(u32, hw_context)
+                            __field(u32, needint)
+                            __field(u64, credit_ctrl)
+                            ),
+           TP_fast_assign(DD_DEV_ASSIGN(sc->dd);
+                          __entry->sw_index = sc->sw_index;
+                          __entry->hw_context = sc->hw_context;
+                          __entry->needint = needint;
+                          __entry->credit_ctrl = credit_ctrl;
+                          ),
+           TP_printk("[%s] ctxt %u(%u) on %d credit_ctrl 0x%llx",
+                     __get_str(dev),
+                     __entry->sw_index,
+                     __entry->hw_context,
+                     __entry->needint,
+                     (unsigned long long)__entry->credit_ctrl
+                      )
+);
+
+/*
+ * hfi1_qpsleepwakeup_template - event class recording, for queue pair
+ * @qp: its QP number (qp->ibqp.qp_num), its current qp->s_flags, the
+ * caller-supplied @flags, and the name of the device obtained through
+ * dd_from_ibdev(qp->ibqp.device).
+ */
+DECLARE_EVENT_CLASS(hfi1_qpsleepwakeup_template,
+                   TP_PROTO(struct rvt_qp *qp, u32 flags),
+                   TP_ARGS(qp, flags),
+                   TP_STRUCT__entry(
+                           DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+                           __field(u32, qpn)
+                           __field(u32, flags)
+                           __field(u32, s_flags)
+                           ),
+                   TP_fast_assign(
+                           /*
+                            * Terminate the statement explicitly, as every
+                            * other DD_DEV_ASSIGN() user in this file does,
+                            * rather than relying on how __assign_str()
+                            * happens to expand.
+                            */
+                           DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+                           __entry->flags = flags;
+                           __entry->qpn = qp->ibqp.qp_num;
+                           __entry->s_flags = qp->s_flags;
+                           ),
+                   TP_printk(
+                           "[%s] qpn 0x%x flags 0x%x s_flags 0x%x",
+                           __get_str(dev),
+                           __entry->qpn,
+                           __entry->flags,
+                           __entry->s_flags
+                           )
+);
+
+/* hfi1_qpwakeup and hfi1_qpsleep: the two events of the sleep/wakeup class. */
+DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpwakeup,
+            TP_PROTO(struct rvt_qp *qp, u32 flags),
+            TP_ARGS(qp, flags));
+
+DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpsleep,
+            TP_PROTO(struct rvt_qp *qp, u32 flags),
+            TP_ARGS(qp, flags));
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_ibhdrs
+
+/*
+ * Helpers used by the header and SDMA events below.  ibhdr_exhdr_len()
+ * supplies the byte count used to size and copy the extended headers.
+ * parse_everbs_hdrs() and parse_sdma_flags() write into a trace_seq and
+ * return a string for a "%s" in TP_printk.
+ *
+ * The __parse_*() wrappers pass an identifier named "p" that must be in
+ * scope where they are expanded.
+ * NOTE(review): presumably this is the trace_seq the tracing framework
+ * provides inside TP_printk - confirm.
+ */
+u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr);
+const char *parse_everbs_hdrs(struct trace_seq *p, u8 opcode, void *ehdrs);
+
+#define __parse_ib_ehdrs(op, ehdrs) parse_everbs_hdrs(p, op, ehdrs)
+
+const char *parse_sdma_flags(struct trace_seq *p, u64 desc0, u64 desc1);
+
+#define __parse_sdma_flags(desc0, desc1) parse_sdma_flags(p, desc0, desc1)
+
+/*
+ * lrh_name() builds one { HFI1_<name>, "<name>" } pair; show_lnh() uses
+ * the LRH_BTH and LRH_GRH pairs to print an lnh value symbolically.
+ */
+#define lrh_name(lrh) { HFI1_##lrh, #lrh }
+#define show_lnh(lrh)                    \
+__print_symbolic(lrh,                    \
+       lrh_name(LRH_BTH),               \
+       lrh_name(LRH_GRH))
+
+/*
+ * ib_opcode_name() builds one { IB_OPCODE_<name>, "<name>" } pair;
+ * show_ib_opcode() prints an opcode symbolically using the RC, UC and
+ * UD opcodes and CNP listed below.
+ */
+#define ib_opcode_name(opcode) { IB_OPCODE_##opcode, #opcode  }
+#define show_ib_opcode(opcode)                             \
+__print_symbolic(opcode,                                   \
+       ib_opcode_name(RC_SEND_FIRST),                     \
+       ib_opcode_name(RC_SEND_MIDDLE),                    \
+       ib_opcode_name(RC_SEND_LAST),                      \
+       ib_opcode_name(RC_SEND_LAST_WITH_IMMEDIATE),       \
+       ib_opcode_name(RC_SEND_ONLY),                      \
+       ib_opcode_name(RC_SEND_ONLY_WITH_IMMEDIATE),       \
+       ib_opcode_name(RC_RDMA_WRITE_FIRST),               \
+       ib_opcode_name(RC_RDMA_WRITE_MIDDLE),              \
+       ib_opcode_name(RC_RDMA_WRITE_LAST),                \
+       ib_opcode_name(RC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
+       ib_opcode_name(RC_RDMA_WRITE_ONLY),                \
+       ib_opcode_name(RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
+       ib_opcode_name(RC_RDMA_READ_REQUEST),              \
+       ib_opcode_name(RC_RDMA_READ_RESPONSE_FIRST),       \
+       ib_opcode_name(RC_RDMA_READ_RESPONSE_MIDDLE),      \
+       ib_opcode_name(RC_RDMA_READ_RESPONSE_LAST),        \
+       ib_opcode_name(RC_RDMA_READ_RESPONSE_ONLY),        \
+       ib_opcode_name(RC_ACKNOWLEDGE),                    \
+       ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE),             \
+       ib_opcode_name(RC_COMPARE_SWAP),                   \
+       ib_opcode_name(RC_FETCH_ADD),                      \
+       ib_opcode_name(RC_SEND_LAST_WITH_INVALIDATE),      \
+       ib_opcode_name(RC_SEND_ONLY_WITH_INVALIDATE),      \
+       ib_opcode_name(UC_SEND_FIRST),                     \
+       ib_opcode_name(UC_SEND_MIDDLE),                    \
+       ib_opcode_name(UC_SEND_LAST),                      \
+       ib_opcode_name(UC_SEND_LAST_WITH_IMMEDIATE),       \
+       ib_opcode_name(UC_SEND_ONLY),                      \
+       ib_opcode_name(UC_SEND_ONLY_WITH_IMMEDIATE),       \
+       ib_opcode_name(UC_RDMA_WRITE_FIRST),               \
+       ib_opcode_name(UC_RDMA_WRITE_MIDDLE),              \
+       ib_opcode_name(UC_RDMA_WRITE_LAST),                \
+       ib_opcode_name(UC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
+       ib_opcode_name(UC_RDMA_WRITE_ONLY),                \
+       ib_opcode_name(UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
+       ib_opcode_name(UD_SEND_ONLY),                      \
+       ib_opcode_name(UD_SEND_ONLY_WITH_IMMEDIATE),       \
+       ib_opcode_name(CNP))
+
+/*
+ * Format-string fragments for the LRH fields, the BTH fields and the
+ * extended-header string; hfi1_ibhdr_template concatenates them in its
+ * TP_printk, so argument order there must follow these fragments.
+ */
+#define LRH_PRN "vl %d lver %d sl %d lnh %d,%s dlid %.4x len %d slid %.4x"
+#define BTH_PRN \
+       "op 0x%.2x,%s se %d m %d pad %d tver %d pkey 0x%.4x " \
+       "f %d b %d qpn 0x%.6x a %d psn 0x%.8x"
+#define EHDR_PRN "%s"
+
+/*
+ * hfi1_ibhdr_template - event class that decodes an hfi1_ib_header.
+ *
+ * LRH fields are extracted from hdr->lrh[0..3] after big-endian to CPU
+ * conversion.  The BTH is then located from the decoded lnh value:
+ * hdr->u.oth when lnh == HFI1_LRH_BTH, otherwise hdr->u.l.oth.  BTH
+ * fields are extracted from ohdr->bth[0..2], and ibhdr_exhdr_len(hdr)
+ * bytes starting at ohdr->u are copied into the dynamic array "ehdrs",
+ * which TP_printk hands to __parse_ib_ehdrs() together with the opcode.
+ */
+DECLARE_EVENT_CLASS(hfi1_ibhdr_template,
+                   TP_PROTO(struct hfi1_devdata *dd,
+                            struct hfi1_ib_header *hdr),
+                   TP_ARGS(dd, hdr),
+                   TP_STRUCT__entry(
+                           DD_DEV_ENTRY(dd)
+                           /* LRH */
+                           __field(u8, vl)
+                           __field(u8, lver)
+                           __field(u8, sl)
+                           __field(u8, lnh)
+                           __field(u16, dlid)
+                           __field(u16, len)
+                           __field(u16, slid)
+                           /* BTH */
+                           __field(u8, opcode)
+                           __field(u8, se)
+                           __field(u8, m)
+                           __field(u8, pad)
+                           __field(u8, tver)
+                           __field(u16, pkey)
+                           __field(u8, f)
+                           __field(u8, b)
+                           __field(u32, qpn)
+                           __field(u8, a)
+                           __field(u32, psn)
+                           /* extended headers */
+                           __dynamic_array(u8, ehdrs, ibhdr_exhdr_len(hdr))
+                           ),
+                   TP_fast_assign(
+                          struct hfi1_other_headers *ohdr;
+
+                          DD_DEV_ASSIGN(dd);
+                          /* LRH */
+                          __entry->vl =
+                          (u8)(be16_to_cpu(hdr->lrh[0]) >> 12);
+                          __entry->lver =
+                          (u8)(be16_to_cpu(hdr->lrh[0]) >> 8) & 0xf;
+                          __entry->sl =
+                          (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf;
+                          __entry->lnh =
+                          (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
+                          __entry->dlid =
+                          be16_to_cpu(hdr->lrh[1]);
+                          /* allow for larger len */
+                          __entry->len =
+                          be16_to_cpu(hdr->lrh[2]);
+                          __entry->slid =
+                          be16_to_cpu(hdr->lrh[3]);
+                          /* BTH */
+                          /* lnh must already be decoded: it selects where the BTH lives */
+                          if (__entry->lnh == HFI1_LRH_BTH)
+                               ohdr = &hdr->u.oth;
+                          else
+                               ohdr = &hdr->u.l.oth;
+                         __entry->opcode =
+                         (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+                         __entry->se =
+                         (be32_to_cpu(ohdr->bth[0]) >> 23) & 1;
+                         __entry->m =
+                         (be32_to_cpu(ohdr->bth[0]) >> 22) & 1;
+                         __entry->pad =
+                         (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+                         __entry->tver =
+                         (be32_to_cpu(ohdr->bth[0]) >> 16) & 0xf;
+                         __entry->pkey =
+                         be32_to_cpu(ohdr->bth[0]) & 0xffff;
+                         __entry->f =
+                         (be32_to_cpu(ohdr->bth[1]) >> HFI1_FECN_SHIFT) &
+                         HFI1_FECN_MASK;
+                         __entry->b =
+                         (be32_to_cpu(ohdr->bth[1]) >> HFI1_BECN_SHIFT) &
+                         HFI1_BECN_MASK;
+                         __entry->qpn =
+                         be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
+                         __entry->a =
+                         (be32_to_cpu(ohdr->bth[2]) >> 31) & 1;
+                         /* allow for larger PSN */
+                         __entry->psn =
+                         be32_to_cpu(ohdr->bth[2]) & 0x7fffffff;
+                         /* extended headers */
+                         /* same length expression as the __dynamic_array() above */
+                         memcpy(__get_dynamic_array(ehdrs), &ohdr->u,
+                                ibhdr_exhdr_len(hdr));
+                        ),
+                   TP_printk("[%s] " LRH_PRN " " BTH_PRN " " EHDR_PRN,
+                             __get_str(dev),
+                             /* LRH */
+                             __entry->vl,
+                             __entry->lver,
+                             __entry->sl,
+                             __entry->lnh, show_lnh(__entry->lnh),
+                             __entry->dlid,
+                             __entry->len,
+                             __entry->slid,
+                             /* BTH */
+                             __entry->opcode, show_ib_opcode(__entry->opcode),
+                             __entry->se,
+                             __entry->m,
+                             __entry->pad,
+                             __entry->tver,
+                             __entry->pkey,
+                             __entry->f,
+                             __entry->b,
+                             __entry->qpn,
+                             __entry->a,
+                             __entry->psn,
+                             /* extended headers */
+                             __parse_ib_ehdrs(
+                                       __entry->opcode,
+                                       (void *)__get_dynamic_array(ehdrs))
+                            )
+);
+
+/*
+ * Events of the hfi1_ibhdr_template class: input_ibhdr,
+ * pio_output_ibhdr, ack_output_ibhdr and sdma_output_ibhdr.  All four
+ * take the same (dd, hdr) arguments and differ only in name.
+ */
+DEFINE_EVENT(hfi1_ibhdr_template, input_ibhdr,
+            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
+            TP_ARGS(dd, hdr));
+
+DEFINE_EVENT(hfi1_ibhdr_template, pio_output_ibhdr,
+            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
+            TP_ARGS(dd, hdr));
+
+DEFINE_EVENT(hfi1_ibhdr_template, ack_output_ibhdr,
+            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
+            TP_ARGS(dd, hdr));
+
+DEFINE_EVENT(hfi1_ibhdr_template, sdma_output_ibhdr,
+            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
+            TP_ARGS(dd, hdr));
+
+/* Format string for the snoop_capture event; its TP_printk prepends "[%s] ". */
+#define SNOOP_PRN \
+       "slid %.4x dlid %.4x qpn 0x%.6x opcode 0x%.2x,%s " \
+       "svc lvl %d pkey 0x%.4x [header = %d bytes] [data = %d bytes]"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_snoop
+
+/*
+ * snoop_capture - record a summary of a packet plus raw copies of it.
+ *
+ * slid, dlid, sl and lnh come from hdr->lrh[]; qpn, opcode and pkey
+ * come from the BTH, located via lnh (hdr->u.oth when lnh ==
+ * HFI1_LRH_BTH, otherwise hdr->u.l.oth).  @hdr_len bytes of @hdr and
+ * @data_len bytes of @data are copied into the dynamic arrays raw_hdr
+ * and raw_pkt.  TP_printk prints only the summary and the two lengths;
+ * lnh, raw_hdr and raw_pkt are stored in the entry but not printed.
+ */
+TRACE_EVENT(snoop_capture,
+           TP_PROTO(struct hfi1_devdata *dd,
+                    int hdr_len,
+                    struct hfi1_ib_header *hdr,
+                    int data_len,
+                    void *data),
+           TP_ARGS(dd, hdr_len, hdr, data_len, data),
+           TP_STRUCT__entry(
+               DD_DEV_ENTRY(dd)
+               __field(u16, slid)
+               __field(u16, dlid)
+               __field(u32, qpn)
+               __field(u8, opcode)
+               __field(u8, sl)
+               __field(u16, pkey)
+               __field(u32, hdr_len)
+               __field(u32, data_len)
+               __field(u8, lnh)
+               __dynamic_array(u8, raw_hdr, hdr_len)
+               __dynamic_array(u8, raw_pkt, data_len)
+               ),
+           TP_fast_assign(
+               struct hfi1_other_headers *ohdr;
+
+               /* lnh is decoded first because it selects where the BTH lives */
+               __entry->lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
+               if (__entry->lnh == HFI1_LRH_BTH)
+                       ohdr = &hdr->u.oth;
+               else
+                       ohdr = &hdr->u.l.oth;
+               DD_DEV_ASSIGN(dd);
+               __entry->slid = be16_to_cpu(hdr->lrh[3]);
+               __entry->dlid = be16_to_cpu(hdr->lrh[1]);
+               __entry->qpn = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
+               __entry->opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+               __entry->sl = (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf;
+               __entry->pkey = be32_to_cpu(ohdr->bth[0]) & 0xffff;
+               __entry->hdr_len = hdr_len;
+               __entry->data_len = data_len;
+               memcpy(__get_dynamic_array(raw_hdr), hdr, hdr_len);
+               memcpy(__get_dynamic_array(raw_pkt), data, data_len);
+               ),
+           TP_printk(
+               "[%s] " SNOOP_PRN,
+               __get_str(dev),
+               __entry->slid,
+               __entry->dlid,
+               __entry->qpn,
+               __entry->opcode,
+               show_ib_opcode(__entry->opcode),
+               __entry->sl,
+               __entry->pkey,
+               __entry->hdr_len,
+               __entry->data_len
+               )
+);
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_ctxts
+
+/*
+ * hfi1_uctxtdata - snapshot of selected fields of context @uctxt.
+ *
+ * Values are read from uctxt and its send context (uctxt->sc): ctxt,
+ * sc->credits, sc->hw_free and sc->base_addr (both cast to u64),
+ * rcvhdrq_cnt, rcvhdrq_phys, egrbufs.alloced and the phys value of
+ * egrbufs.rcvtids[0].  UCTXT_FMT labels them, in that order, as cred,
+ * credaddr, piobase, rcvhdr_cnt, rcvbase, rcvegrc and rcvegrb.
+ */
+#define UCTXT_FMT \
+       "cred:%u, credaddr:0x%llx, piobase:0x%llx, rcvhdr_cnt:%u, "     \
+       "rcvbase:0x%llx, rcvegrc:%u, rcvegrb:0x%llx"
+TRACE_EVENT(hfi1_uctxtdata,
+           TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt),
+           TP_ARGS(dd, uctxt),
+           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+                            __field(unsigned, ctxt)
+                            __field(u32, credits)
+                            __field(u64, hw_free)
+                            __field(u64, piobase)
+                            __field(u16, rcvhdrq_cnt)
+                            __field(u64, rcvhdrq_phys)
+                            __field(u32, eager_cnt)
+                            __field(u64, rcvegr_phys)
+                            ),
+           TP_fast_assign(DD_DEV_ASSIGN(dd);
+                          __entry->ctxt = uctxt->ctxt;
+                          __entry->credits = uctxt->sc->credits;
+                          __entry->hw_free = (u64)uctxt->sc->hw_free;
+                          __entry->piobase = (u64)uctxt->sc->base_addr;
+                          __entry->rcvhdrq_cnt = uctxt->rcvhdrq_cnt;
+                          __entry->rcvhdrq_phys = uctxt->rcvhdrq_phys;
+                          __entry->eager_cnt = uctxt->egrbufs.alloced;
+                          __entry->rcvegr_phys =
+                          uctxt->egrbufs.rcvtids[0].phys;
+                          ),
+           TP_printk("[%s] ctxt %u " UCTXT_FMT,
+                     __get_str(dev),
+                     __entry->ctxt,
+                     __entry->credits,
+                     __entry->hw_free,
+                     __entry->piobase,
+                     __entry->rcvhdrq_cnt,
+                     __entry->rcvhdrq_phys,
+                     __entry->eager_cnt,
+                     __entry->rcvegr_phys
+                     )
+);
+
+/*
+ * hfi1_ctxt_info - record fields of a struct hfi1_ctxt_info (passed by
+ * value) for context @ctxt / sub-context @subctxt.
+ *
+ * The print order follows CINFO_FMT rather than the entry layout:
+ * egrtids, rcvegr_size (egr_size), rcvhdrq_cnt (hdrq_cnt), rcvhdrq_size
+ * (hdrq_size, taken from cinfo.rcvhdrq_entsize) and sdma_ring_size.
+ */
+#define CINFO_FMT \
+       "egrtids:%u, egr_size:%u, hdrq_cnt:%u, hdrq_size:%u, sdma_ring_size:%u"
+TRACE_EVENT(hfi1_ctxt_info,
+           TP_PROTO(struct hfi1_devdata *dd, unsigned ctxt, unsigned subctxt,
+                    struct hfi1_ctxt_info cinfo),
+           TP_ARGS(dd, ctxt, subctxt, cinfo),
+           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+                            __field(unsigned, ctxt)
+                            __field(unsigned, subctxt)
+                            __field(u16, egrtids)
+                            __field(u16, rcvhdrq_cnt)
+                            __field(u16, rcvhdrq_size)
+                            __field(u16, sdma_ring_size)
+                            __field(u32, rcvegr_size)
+                            ),
+           TP_fast_assign(DD_DEV_ASSIGN(dd);
+                           __entry->ctxt = ctxt;
+                           __entry->subctxt = subctxt;
+                           __entry->egrtids = cinfo.egrtids;
+                           __entry->rcvhdrq_cnt = cinfo.rcvhdrq_cnt;
+                           __entry->rcvhdrq_size = cinfo.rcvhdrq_entsize;
+                           __entry->sdma_ring_size = cinfo.sdma_ring_size;
+                           __entry->rcvegr_size = cinfo.rcvegr_size;
+                           ),
+           TP_printk("[%s] ctxt %u:%u " CINFO_FMT,
+                     __get_str(dev),
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->egrtids,
+                     __entry->rcvegr_size,
+                     __entry->rcvhdrq_cnt,
+                     __entry->rcvhdrq_size,
+                     __entry->sdma_ring_size
+                     )
+);
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_sma
+
+/*
+ * BCT_FORMAT prints the overall shared limit followed by one
+ * [dedicated,shared] pair for each of VLs 0-7 and for VL 15.
+ *
+ * BCT(field) reads @field from the struct buffer_control copy stored in
+ * the entry's "bct" dynamic array and converts it from big-endian
+ * 16-bit to CPU order.
+ */
+#define BCT_FORMAT \
+       "shared_limit %x vls 0-7 [%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x] 15 [%x,%x]"
+
+#define BCT(field) \
+       be16_to_cpu( \
+               ((struct buffer_control *)__get_dynamic_array(bct))->field \
+       )
+
+/*
+ * hfi1_bct_template - event class that stores a byte copy of the whole
+ * struct buffer_control @bc and decodes it at print time through BCT().
+ * The device name is recorded in the entry, but BCT_FORMAT has no "%s"
+ * for it, so it is not part of the printed line.
+ */
+DECLARE_EVENT_CLASS(hfi1_bct_template,
+                   TP_PROTO(struct hfi1_devdata *dd,
+                            struct buffer_control *bc),
+                   TP_ARGS(dd, bc),
+                   TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+                                    __dynamic_array(u8, bct, sizeof(*bc))
+                                    ),
+                   TP_fast_assign(DD_DEV_ASSIGN(dd);
+                                  memcpy(__get_dynamic_array(bct), bc,
+                                         sizeof(*bc));
+                                  ),
+                   TP_printk(BCT_FORMAT,
+                             BCT(overall_shared_limit),
+
+                             BCT(vl[0].dedicated),
+                             BCT(vl[0].shared),
+
+                             BCT(vl[1].dedicated),
+                             BCT(vl[1].shared),
+
+                             BCT(vl[2].dedicated),
+                             BCT(vl[2].shared),
+
+                             BCT(vl[3].dedicated),
+                             BCT(vl[3].shared),
+
+                             BCT(vl[4].dedicated),
+                             BCT(vl[4].shared),
+
+                             BCT(vl[5].dedicated),
+                             BCT(vl[5].shared),
+
+                             BCT(vl[6].dedicated),
+                             BCT(vl[6].shared),
+
+                             BCT(vl[7].dedicated),
+                             BCT(vl[7].shared),
+
+                             BCT(vl[15].dedicated),
+                             BCT(vl[15].shared)
+                             )
+);
+
+/* bct_set and bct_get: two events sharing the hfi1_bct_template class. */
+DEFINE_EVENT(hfi1_bct_template, bct_set,
+            TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
+            TP_ARGS(dd, bc));
+
+DEFINE_EVENT(hfi1_bct_template, bct_get,
+            TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
+            TP_ARGS(dd, bc));
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_sdma
+
+/*
+ * Trace one SDMA descriptor.  Records both raw 64-bit descriptor words, the
+ * engine index (sde->this_idx), the caller-supplied pointer descp and the
+ * value e.  The flags string, address, generation and byte count shown in
+ * the output are decoded from desc0/desc1 at print time, not stored.
+ */
+TRACE_EVENT(hfi1_sdma_descriptor,
+           TP_PROTO(struct sdma_engine *sde,
+                    u64 desc0,
+                    u64 desc1,
+                    u16 e,
+                    void *descp),
+       TP_ARGS(sde, desc0, desc1, e, descp),
+       TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+                        __field(void *, descp)
+                        __field(u64, desc0)
+                        __field(u64, desc1)
+                        __field(u16, e)
+                        __field(u8, idx)
+                        ),
+       TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+                      __entry->desc0 = desc0;
+                      __entry->desc1 = desc1;
+                      __entry->idx = sde->this_idx;
+                      __entry->descp = descp;
+                      __entry->e = e;
+                      ),
+       TP_printk(
+                 "[%s] SDE(%u) flags:%s addr:0x%016llx gen:%u len:%u d0:%016llx d1:%016llx to %p,%u",
+                 __get_str(dev),
+                 __entry->idx,
+                 __parse_sdma_flags(__entry->desc0, __entry->desc1),
+                 (__entry->desc0 >> SDMA_DESC0_PHY_ADDR_SHIFT) &
+                 SDMA_DESC0_PHY_ADDR_MASK,
+                 (u8)((__entry->desc1 >> SDMA_DESC1_GENERATION_SHIFT) &
+                      SDMA_DESC1_GENERATION_MASK),
+                 (u16)((__entry->desc0 >> SDMA_DESC0_BYTE_COUNT_SHIFT) &
+                       SDMA_DESC0_BYTE_COUNT_MASK),
+                 __entry->desc0,
+                 __entry->desc1,
+                 __entry->descp,
+                 __entry->e
+                 )
+);
+
+/*
+ * Trace an SDMA engine selection: records the selector value (sel), the VL
+ * and the engine index (idx) passed in by the caller.
+ */
+TRACE_EVENT(hfi1_sdma_engine_select,
+           TP_PROTO(struct hfi1_devdata *dd, u32 sel, u8 vl, u8 idx),
+           TP_ARGS(dd, sel, vl, idx),
+           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+                            __field(u32, sel)
+                            __field(u8, vl)
+                            __field(u8, idx)
+                            ),
+           TP_fast_assign(DD_DEV_ASSIGN(dd);
+                          __entry->sel = sel;
+                          __entry->vl = vl;
+                          __entry->idx = idx;
+                          ),
+           TP_printk("[%s] selecting SDE %u sel 0x%x vl %u",
+                     __get_str(dev),
+                     __entry->idx,
+                     __entry->sel,
+                     __entry->vl
+                     )
+);
+
+/*
+ * Event class carrying an SDMA engine index (sde->this_idx) and a 64-bit
+ * status value, printed in hex.
+ */
+DECLARE_EVENT_CLASS(hfi1_sdma_engine_class,
+                   TP_PROTO(struct sdma_engine *sde, u64 status),
+                   TP_ARGS(sde, status),
+                   TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+                                    __field(u64, status)
+                                    __field(u8, idx)
+                                    ),
+                   TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+                                  __entry->status = status;
+                                  __entry->idx = sde->this_idx;
+                                  ),
+                   TP_printk("[%s] SDE(%u) status %llx",
+                             __get_str(dev),
+                             __entry->idx,
+                             (unsigned long long)__entry->status
+                             )
+);
+
+/* Engine interrupt and progress events; both use hfi1_sdma_engine_class. */
+DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_interrupt,
+            TP_PROTO(struct sdma_engine *sde, u64 status),
+            TP_ARGS(sde, status)
+);
+
+DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_progress,
+            TP_PROTO(struct sdma_engine *sde, u64 status),
+            TP_ARGS(sde, status)
+);
+
+/*
+ * Event class carrying an SDMA engine index (sde->this_idx) and the signed
+ * integer aidx supplied by the caller.
+ * NOTE(review): aidx is presumably the AHG entry index - confirm at callers.
+ */
+DECLARE_EVENT_CLASS(hfi1_sdma_ahg_ad,
+                   TP_PROTO(struct sdma_engine *sde, int aidx),
+                   TP_ARGS(sde, aidx),
+                   TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+                                    __field(int, aidx)
+                                    __field(u8, idx)
+                                    ),
+                   TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+                                  __entry->idx = sde->this_idx;
+                                  __entry->aidx = aidx;
+                                  ),
+                   TP_printk("[%s] SDE(%u) aidx %d",
+                             __get_str(dev),
+                             __entry->idx,
+                             __entry->aidx
+                             )
+);
+
+/* AHG allocate and deallocate events; both use the hfi1_sdma_ahg_ad class. */
+DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_allocate,
+            TP_PROTO(struct sdma_engine *sde, int aidx),
+            TP_ARGS(sde, aidx));
+
+DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_deallocate,
+            TP_PROTO(struct sdma_engine *sde, int aidx),
+            TP_ARGS(sde, aidx));
+
+/*
+ * hfi1_sdma_progress has two definitions with the same prototype.  When
+ * CONFIG_HFI1_DEBUG_SDMA_ORDER is defined the event additionally records
+ * and prints txp->sn.  In both variants a NULL txp is recorded as ~0 in
+ * the fields that would otherwise be read from txp.
+ */
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+TRACE_EVENT(hfi1_sdma_progress,
+           TP_PROTO(struct sdma_engine *sde,
+                    u16 hwhead,
+                    u16 swhead,
+                    struct sdma_txreq *txp
+                    ),
+           TP_ARGS(sde, hwhead, swhead, txp),
+           TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+                            __field(u64, sn)
+                            __field(u16, hwhead)
+                            __field(u16, swhead)
+                            __field(u16, txnext)
+                            __field(u16, tx_tail)
+                            __field(u16, tx_head)
+                            __field(u8, idx)
+                            ),
+           TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+                          __entry->hwhead = hwhead;
+                          __entry->swhead = swhead;
+                          __entry->tx_tail = sde->tx_tail;
+                          __entry->tx_head = sde->tx_head;
+                          __entry->txnext = txp ? txp->next_descq_idx : ~0;
+                          __entry->idx = sde->this_idx;
+                          __entry->sn = txp ? txp->sn : ~0;
+                          ),
+           TP_printk(
+                     "[%s] SDE(%u) sn %llu hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u",
+                     __get_str(dev),
+                     __entry->idx,
+                     __entry->sn,
+                     __entry->hwhead,
+                     __entry->swhead,
+                     __entry->txnext,
+                     __entry->tx_head,
+                     __entry->tx_tail
+                     )
+);
+#else
+TRACE_EVENT(hfi1_sdma_progress,
+           TP_PROTO(struct sdma_engine *sde,
+                    u16 hwhead, u16 swhead,
+                    struct sdma_txreq *txp
+           ),
+       TP_ARGS(sde, hwhead, swhead, txp),
+       TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+                        __field(u16, hwhead)
+                        __field(u16, swhead)
+                        __field(u16, txnext)
+                        __field(u16, tx_tail)
+                        __field(u16, tx_head)
+                        __field(u8, idx)
+                        ),
+       TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+                      __entry->hwhead = hwhead;
+                      __entry->swhead = swhead;
+                      __entry->tx_tail = sde->tx_tail;
+                      __entry->tx_head = sde->tx_head;
+                      __entry->txnext = txp ? txp->next_descq_idx : ~0;
+                      __entry->idx = sde->this_idx;
+                      ),
+       TP_printk(
+                 "[%s] SDE(%u) hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u",
+                 __get_str(dev),
+                 __entry->idx,
+                 __entry->hwhead,
+                 __entry->swhead,
+                 __entry->txnext,
+                 __entry->tx_head,
+                 __entry->tx_tail
+                 )
+);
+#endif
+
+/*
+ * Event class carrying an SDMA engine index (sde->this_idx) and a 64-bit
+ * sequence number sn, printed in decimal.
+ */
+DECLARE_EVENT_CLASS(hfi1_sdma_sn,
+                   TP_PROTO(struct sdma_engine *sde, u64 sn),
+                   TP_ARGS(sde, sn),
+                   TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+                                    __field(u64, sn)
+                                    __field(u8, idx)
+                                    ),
+                   TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+                                  __entry->sn = sn;
+                                  __entry->idx = sde->this_idx;
+                                  ),
+                   TP_printk("[%s] SDE(%u) sn %llu",
+                             __get_str(dev),
+                             __entry->idx,
+                             __entry->sn
+                             )
+);
+
+/* Outbound and inbound sequence-number events; both use hfi1_sdma_sn. */
+DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_out_sn,
+            TP_PROTO(
+               struct sdma_engine *sde,
+               u64 sn
+            ),
+            TP_ARGS(sde, sn)
+);
+
+DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_in_sn,
+            TP_PROTO(struct sdma_engine *sde, u64 sn),
+            TP_ARGS(sde, sn)
+);
+
+#define USDMA_HDR_FORMAT \
+       "[%s:%u:%u:%u] PBC=(0x%x 0x%x) LRH=(0x%x 0x%x) BTH=(0x%x 0x%x 0x%x) KDETH=(0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x) TIDVal=0x%x"
+
+/*
+ * Trace the header of a user SDMA packet.
+ *
+ * The header is read as 32-bit words: two little-endian PBC words, two
+ * big-endian LRH words, three big-endian BTH words and nine little-endian
+ * KDETH words.  Every word is converted to CPU byte order when it is
+ * recorded, so the entry holds plain u32 values and the 0x%x output is the
+ * same on little- and big-endian hosts.  PBC is printed with word 1 first.
+ */
+TRACE_EVENT(hfi1_sdma_user_header,
+           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req,
+                    struct hfi1_pkt_header *hdr, u32 tidval),
+           TP_ARGS(dd, ctxt, subctxt, req, hdr, tidval),
+           TP_STRUCT__entry(
+                   DD_DEV_ENTRY(dd)
+                   __field(u16, ctxt)
+                   __field(u8, subctxt)
+                   __field(u16, req)
+                   __field(u32, pbc0)
+                   __field(u32, pbc1)
+                   __field(u32, lrh0)
+                   __field(u32, lrh1)
+                   __field(u32, bth0)
+                   __field(u32, bth1)
+                   __field(u32, bth2)
+                   __field(u32, kdeth0)
+                   __field(u32, kdeth1)
+                   __field(u32, kdeth2)
+                   __field(u32, kdeth3)
+                   __field(u32, kdeth4)
+                   __field(u32, kdeth5)
+                   __field(u32, kdeth6)
+                   __field(u32, kdeth7)
+                   __field(u32, kdeth8)
+                   __field(u32, tidval)
+                   ),
+           TP_fast_assign(
+                   __le32 *pbc = (__le32 *)hdr->pbc;
+                   __be32 *lrh = (__be32 *)hdr->lrh;
+                   __be32 *bth = (__be32 *)hdr->bth;
+                   __le32 *kdeth = (__le32 *)&hdr->kdeth;
+
+                   DD_DEV_ASSIGN(dd);
+                   __entry->ctxt = ctxt;
+                   __entry->subctxt = subctxt;
+                   __entry->req = req;
+                   __entry->pbc0 = le32_to_cpu(pbc[0]);
+                   __entry->pbc1 = le32_to_cpu(pbc[1]);
+                   __entry->lrh0 = be32_to_cpu(lrh[0]);
+                   __entry->lrh1 = be32_to_cpu(lrh[1]);
+                   __entry->bth0 = be32_to_cpu(bth[0]);
+                   __entry->bth1 = be32_to_cpu(bth[1]);
+                   __entry->bth2 = be32_to_cpu(bth[2]);
+                   __entry->kdeth0 = le32_to_cpu(kdeth[0]);
+                   __entry->kdeth1 = le32_to_cpu(kdeth[1]);
+                   __entry->kdeth2 = le32_to_cpu(kdeth[2]);
+                   __entry->kdeth3 = le32_to_cpu(kdeth[3]);
+                   __entry->kdeth4 = le32_to_cpu(kdeth[4]);
+                   __entry->kdeth5 = le32_to_cpu(kdeth[5]);
+                   __entry->kdeth6 = le32_to_cpu(kdeth[6]);
+                   __entry->kdeth7 = le32_to_cpu(kdeth[7]);
+                   __entry->kdeth8 = le32_to_cpu(kdeth[8]);
+                   __entry->tidval = tidval;
+                   ),
+           TP_printk(USDMA_HDR_FORMAT,
+                     __get_str(dev),
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->req,
+                     __entry->pbc1,
+                     __entry->pbc0,
+                     __entry->lrh0,
+                     __entry->lrh1,
+                     __entry->bth0,
+                     __entry->bth1,
+                     __entry->bth2,
+                     __entry->kdeth0,
+                     __entry->kdeth1,
+                     __entry->kdeth2,
+                     __entry->kdeth3,
+                     __entry->kdeth4,
+                     __entry->kdeth5,
+                     __entry->kdeth6,
+                     __entry->kdeth7,
+                     __entry->kdeth8,
+                     __entry->tidval
+                   )
+       );
+
+#define SDMA_UREQ_FMT \
+       "[%s:%u:%u] ver/op=0x%x, iovcnt=%u, npkts=%u, frag=%u, idx=%u"
+/*
+ * Trace a user SDMA request info block, passed as an array of 16-bit words:
+ * i[0] low byte is the version/opcode, i[0] high byte the iovec count,
+ * i[1] the packet count, i[2] the fragment size and i[3] the completion
+ * index.
+ */
+TRACE_EVENT(hfi1_sdma_user_reqinfo,
+           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 *i),
+           TP_ARGS(dd, ctxt, subctxt, i),
+           TP_STRUCT__entry(
+                   DD_DEV_ENTRY(dd)
+                   __field(u16, ctxt)
+                   __field(u8, subctxt)
+                   __field(u8, ver_opcode)
+                   __field(u8, iovcnt)
+                   __field(u16, npkts)
+                   __field(u16, fragsize)
+                   __field(u16, comp_idx)
+                   ),
+           TP_fast_assign(
+                   DD_DEV_ASSIGN(dd);
+                   __entry->ctxt = ctxt;
+                   __entry->subctxt = subctxt;
+                   __entry->ver_opcode = i[0] & 0xff;
+                   __entry->iovcnt = (i[0] >> 8) & 0xff;
+                   __entry->npkts = i[1];
+                   __entry->fragsize = i[2];
+                   __entry->comp_idx = i[3];
+                   ),
+           TP_printk(SDMA_UREQ_FMT,
+                     __get_str(dev),
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->ver_opcode,
+                     __entry->iovcnt,
+                     __entry->npkts,
+                     __entry->fragsize,
+                     __entry->comp_idx
+                   )
+       );
+
+/* Build one { value, "name" } pair for __print_symbolic(). */
+#define usdma_complete_name(st) { st, #st }
+/* Map a completion state value to the name FREE/QUEUED/COMPLETE/ERROR. */
+#define show_usdma_complete_state(st)                  \
+       __print_symbolic(st,                            \
+                        usdma_complete_name(FREE),     \
+                        usdma_complete_name(QUEUED),   \
+                        usdma_complete_name(COMPLETE), \
+                        usdma_complete_name(ERROR))
+
+/*
+ * Trace a user SDMA completion: records the context, sub-context, the
+ * index idx, the state (printed symbolically) and the integer code.
+ */
+TRACE_EVENT(hfi1_sdma_user_completion,
+           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 idx,
+                    u8 state, int code),
+           TP_ARGS(dd, ctxt, subctxt, idx, state, code),
+           TP_STRUCT__entry(
+                   DD_DEV_ENTRY(dd)
+                   __field(u16, ctxt)
+                   __field(u8, subctxt)
+                   __field(u16, idx)
+                   __field(u8, state)
+                   __field(int, code)
+                   ),
+           TP_fast_assign(
+                   DD_DEV_ASSIGN(dd);
+                   __entry->ctxt = ctxt;
+                   __entry->subctxt = subctxt;
+                   __entry->idx = idx;
+                   __entry->state = state;
+                   __entry->code = code;
+                   ),
+           TP_printk("[%s:%u:%u:%u] SDMA completion state %s (%d)",
+                     __get_str(dev), __entry->ctxt, __entry->subctxt,
+                     __entry->idx, show_usdma_complete_state(__entry->state),
+                     __entry->code)
+       );
+
+/* Formats len u32 values from the array into the trace_seq; defined elsewhere. */
+const char *print_u32_array(struct trace_seq *, u32 *, int);
+/* p is the trace_seq pointer in scope where TP_printk() is expanded. */
+#define __print_u32_hex(arr, len) print_u32_array(p, arr, len)
+
+/*
+ * Trace an AHG header update for a user SDMA request.
+ *
+ * Up to ARRAY_SIZE(__entry->ahg) words of ahg[] are copied into the entry.
+ * The recorded length is clamped to that range so that neither the copy at
+ * trace time nor the dump at print time can run past the fixed-size array
+ * if a caller ever passes an out-of-range len.
+ */
+TRACE_EVENT(hfi1_sdma_user_header_ahg,
+           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req,
+                    u8 sde, u8 ahgidx, u32 *ahg, int len, u32 tidval),
+           TP_ARGS(dd, ctxt, subctxt, req, sde, ahgidx, ahg, len, tidval),
+           TP_STRUCT__entry(
+                   DD_DEV_ENTRY(dd)
+                   __field(u16, ctxt)
+                   __field(u8, subctxt)
+                   __field(u16, req)
+                   __field(u8, sde)
+                   __field(u8, idx)
+                   __field(int, len)
+                   __field(u32, tidval)
+                   __array(u32, ahg, 10)
+                   ),
+           TP_fast_assign(
+                   DD_DEV_ASSIGN(dd);
+                   __entry->ctxt = ctxt;
+                   __entry->subctxt = subctxt;
+                   __entry->req = req;
+                   __entry->sde = sde;
+                   __entry->idx = ahgidx;
+                   __entry->len = clamp_t(int, len, 0,
+                                          ARRAY_SIZE(__entry->ahg));
+                   __entry->tidval = tidval;
+                   memcpy(__entry->ahg, ahg,
+                          __entry->len * sizeof(u32));
+                   ),
+           TP_printk("[%s:%u:%u:%u] (SDE%u/AHG%u) ahg[0-%d]=(%s) TIDVal=0x%x",
+                     __get_str(dev),
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->req,
+                     __entry->sde,
+                     __entry->idx,
+                     __entry->len - 1,
+                     __print_u32_hex(__entry->ahg, __entry->len),
+                     __entry->tidval
+                   )
+       );
+
+/*
+ * Trace an SDMA state change.  Both state names are passed in as strings
+ * and copied into the entry, so the caller's buffers need not outlive the
+ * call.
+ */
+TRACE_EVENT(hfi1_sdma_state,
+           TP_PROTO(struct sdma_engine *sde,
+                    const char *cstate,
+                    const char *nstate
+                    ),
+           TP_ARGS(sde, cstate, nstate),
+           TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+                            __string(curstate, cstate)
+                            __string(newstate, nstate)
+                            ),
+       TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+                      __assign_str(curstate, cstate);
+                      __assign_str(newstate, nstate);
+                      ),
+       TP_printk("[%s] current state %s new state %s",
+                 __get_str(dev),
+                 __get_str(curstate),
+                 __get_str(newstate)
+                 )
+);
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_rc
+
+/*
+ * Event class for RC protocol traces.  Records the caller-supplied psn
+ * together with a snapshot of the QP number, s_flags and the send/receive
+ * PSN fields read from the QP when the event fires.
+ */
+DECLARE_EVENT_CLASS(hfi1_rc_template,
+                   TP_PROTO(struct rvt_qp *qp, u32 psn),
+                   TP_ARGS(qp, psn),
+                   TP_STRUCT__entry(
+                       DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+                       __field(u32, qpn)
+                       __field(u32, s_flags)
+                       __field(u32, psn)
+                       __field(u32, s_psn)
+                       __field(u32, s_next_psn)
+                       __field(u32, s_sending_psn)
+                       __field(u32, s_sending_hpsn)
+                       __field(u32, r_psn)
+                       ),
+                   TP_fast_assign(
+                       DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+                       __entry->qpn = qp->ibqp.qp_num;
+                       __entry->s_flags = qp->s_flags;
+                       __entry->psn = psn;
+                       __entry->s_psn = qp->s_psn;
+                       __entry->s_next_psn = qp->s_next_psn;
+                       __entry->s_sending_psn = qp->s_sending_psn;
+                       __entry->s_sending_hpsn = qp->s_sending_hpsn;
+                       __entry->r_psn = qp->r_psn;
+                       ),
+                   TP_printk(
+                       "[%s] qpn 0x%x s_flags 0x%x psn 0x%x s_psn 0x%x s_next_psn 0x%x s_sending_psn 0x%x sending_hpsn 0x%x r_psn 0x%x",
+                       __get_str(dev),
+                       __entry->qpn,
+                       __entry->s_flags,
+                       __entry->psn,
+                       __entry->s_psn,
+                       __entry->s_next_psn,
+                       __entry->s_sending_psn,
+                       __entry->s_sending_hpsn,
+                       __entry->r_psn
+                       )
+);
+
+/*
+ * RC events for send completion, ACK, timeout and receive error; all four
+ * use the hfi1_rc_template class.
+ */
+DEFINE_EVENT(hfi1_rc_template, hfi1_rc_sendcomplete,
+            TP_PROTO(struct rvt_qp *qp, u32 psn),
+            TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(hfi1_rc_template, hfi1_rc_ack,
+            TP_PROTO(struct rvt_qp *qp, u32 psn),
+            TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(hfi1_rc_template, hfi1_rc_timeout,
+            TP_PROTO(struct rvt_qp *qp, u32 psn),
+            TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(hfi1_rc_template, hfi1_rc_rcv_error,
+            TP_PROTO(struct rvt_qp *qp, u32 psn),
+            TP_ARGS(qp, psn)
+);
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_misc
+
+/*
+ * Trace an interrupt source.  The source name is produced at trace time by
+ * the table entry's is_name() callback, which is given the entry buffer,
+ * its size and the source number relative to the entry's start; the
+ * absolute source number is recorded alongside it.
+ */
+TRACE_EVENT(hfi1_interrupt,
+           TP_PROTO(struct hfi1_devdata *dd, const struct is_table *is_entry,
+                    int src),
+           TP_ARGS(dd, is_entry, src),
+           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+                            __array(char, buf, 64)
+                            __field(int, src)
+                            ),
+           TP_fast_assign(DD_DEV_ASSIGN(dd);
+                          is_entry->is_name(__entry->buf,
+                                            sizeof(__entry->buf),
+                                            src - is_entry->start);
+                          __entry->src = src;
+                          ),
+           TP_printk("[%s] source: %s [%d]", __get_str(dev), __entry->buf,
+                     __entry->src)
+);
+
+/*
+ * Note:
+ * This produces a REALLY ugly trace in the console output when the string is
+ * too long.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_trace
+
+/* Size in bytes of the buffer reserved for each formatted trace message. */
+#define MAX_MSG_LEN 512
+
+/*
+ * Event class for free-form debug messages.  The message is formatted with
+ * vsnprintf() from vaf->fmt / *vaf->va into a MAX_MSG_LEN-byte dynamic
+ * array when the event fires.  WARN_ON_ONCE() triggers if vsnprintf()
+ * reports that the message needed MAX_MSG_LEN bytes or more.
+ */
+DECLARE_EVENT_CLASS(hfi1_trace_template,
+                   TP_PROTO(const char *function, struct va_format *vaf),
+                   TP_ARGS(function, vaf),
+                   TP_STRUCT__entry(__string(function, function)
+                                    __dynamic_array(char, msg, MAX_MSG_LEN)
+                                    ),
+                   TP_fast_assign(__assign_str(function, function);
+                                  WARN_ON_ONCE(vsnprintf
+                                               (__get_dynamic_array(msg),
+                                                MAX_MSG_LEN, vaf->fmt,
+                                                *vaf->va) >=
+                                               MAX_MSG_LEN);
+                                  ),
+                   TP_printk("(%s) %s",
+                             __get_str(function),
+                             __get_str(msg))
+);
+
+/*
+ * The variadic wrappers cannot be produced by a plain function-like macro
+ * at the call site because va_start()/va_end() need a real function, so
+ * each level gets a prototype plus event (__hfi1_trace_def) and a function
+ * body (__hfi1_trace_fn) generated from these two macros.
+ */
+#define __hfi1_trace_def(lvl) \
+void __hfi1_trace_##lvl(const char *func, char *fmt, ...);             \
+                                                                       \
+DEFINE_EVENT(hfi1_trace_template, hfi1_ ##lvl,                         \
+       TP_PROTO(const char *function, struct va_format *vaf),          \
+       TP_ARGS(function, vaf))
+
+/* Bundle fmt and the variadic arguments in a va_format and emit the event. */
+#define __hfi1_trace_fn(lvl) \
+void __hfi1_trace_##lvl(const char *func, char *fmt, ...)              \
+{                                                                      \
+       va_list args;                                                   \
+       struct va_format vaf;                                           \
+                                                                       \
+       va_start(args, fmt);                                            \
+       vaf.fmt = fmt;                                                  \
+       vaf.va = &args;                                                 \
+       trace_hfi1_ ##lvl(func, &vaf);                                  \
+       va_end(args);                                                   \
+}
+
+/*
+ * To create a new trace level simply define it below and as a __hfi1_trace_fn
+ * in trace.c. This will create all the hooks for calling
+ * hfi1_cdbg(LVL, fmt, ...); as well as take care of all
+ * the debugfs stuff.
+ *
+ * Each line below declares __hfi1_trace_<LVL>() and defines the event
+ * hfi1_<LVL> from the hfi1_trace_template class.
+ */
+__hfi1_trace_def(PKT);
+__hfi1_trace_def(PROC);
+__hfi1_trace_def(SDMA);
+__hfi1_trace_def(LINKVERB);
+__hfi1_trace_def(DEBUG);
+__hfi1_trace_def(SNOOP);
+__hfi1_trace_def(CNTR);
+__hfi1_trace_def(PIO);
+__hfi1_trace_def(DC8051);
+__hfi1_trace_def(FIRMWARE);
+__hfi1_trace_def(RCVCTRL);
+__hfi1_trace_def(TID);
+__hfi1_trace_def(MMU);
+__hfi1_trace_def(IOCTL);
+
+/*
+ * hfi1_cdbg - emit a debug message at trace level 'which'; expands to a call
+ * of __hfi1_trace_<which>() with the calling function's name prepended.
+ */
+#define hfi1_cdbg(which, fmt, ...) \
+       __hfi1_trace_##which(__func__, fmt, ##__VA_ARGS__)
+
+/* hfi1_dbg - shorthand for hfi1_cdbg() at the DEBUG level. */
+#define hfi1_dbg(fmt, ...) \
+       hfi1_cdbg(DEBUG, fmt, ##__VA_ARGS__)
+
+/*
+ * Define HFI1_EARLY_DBG at compile time or here to enable early trace
+ * messages. Do not check in an enablement for this.
+ */
+
+/* Without HFI1_EARLY_DBG, hfi1_dbg_early() expands to nothing. */
+#ifdef HFI1_EARLY_DBG
+#define hfi1_dbg_early(fmt, ...) \
+       trace_printk(fmt, ##__VA_ARGS__)
+#else
+#define hfi1_dbg_early(fmt, ...)
+#endif
+
+#endif /* __HFI1_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/hw/hfi1/twsi.c b/drivers/infiniband/hw/hfi1/twsi.c
new file mode 100644 (file)
index 0000000..e82e52a
--- /dev/null
@@ -0,0 +1,489 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+
+#include "hfi.h"
+#include "twsi.h"
+
+/*
+ * "Two Wire Serial Interface" support.
+ *
+ * Originally written for a not-quite-i2c serial eeprom, which is
+ * still used on some supported boards. Later boards have added a
+ * variety of other uses, most board-specific, so the bit-boffing
+ * part has been split off to this file, while the other parts
+ * have been moved to chip-specific files.
+ *
+ * We have also dropped all pretense of fully generic (e.g. pretend
+ * we don't know whether '1' is the higher voltage) interface, as
+ * the restrictions of the generic i2c interface (e.g. no access from
+ * driver itself) make it unsuitable for this use.
+ */
+
+#define READ_CMD 1
+#define WRITE_CMD 0
+
+/**
+ * i2c_wait_for_writes - wait for a write
+ * @dd: the hfi1_ib device
+ * @target: target argument handed unchanged to hfi1_gpio_mod()
+ *
+ * We use this instead of udelay directly, so we can make sure
+ * that previous register writes have been flushed all the way
+ * to the chip.  Since we are delaying anyway, the cost doesn't
+ * hurt, and makes the bit twiddling more regular
+ */
+static void i2c_wait_for_writes(struct hfi1_devdata *dd, u32 target)
+{
+       /*
+        * implicit read of EXTStatus is as good as explicit
+        * read of scratch, if all we want to do is flush
+        * writes.
+        *
+        * The all-zero call is the same form scl_in()/sda_in() use to
+        * sample the pins; here the value read back is discarded.
+        */
+       hfi1_gpio_mod(dd, target, 0, 0, 0);
+       rmb(); /* inlined, so prevent compiler reordering */
+}
+
+/*
+ * QSFP modules are allowed to hold SCL low for 500uSec. Allow twice that
+ * for "almost compliant" modules
+ */
+#define SCL_WAIT_USEC 1000
+
+/* BUF_WAIT is the time the bus must stay free between a STOP or ACK and the
+ * next START.  Should be 20 usec, but some chips need more.
+ */
+#define TWSI_BUF_WAIT_USEC 60
+
+/*
+ * scl_out - set the state of the SCL (clock) line
+ * @dd: the hfi1_ib device
+ * @target: target argument handed to hfi1_gpio_mod()
+ * @bit: non-zero to release SCL, zero to pull it low
+ *
+ * After releasing the line, polls it every 2 usec for up to SCL_WAIT_USEC
+ * until it reads back high, and logs an error if it never does.
+ */
+static void scl_out(struct hfi1_devdata *dd, u32 target, u8 bit)
+{
+       const u32 clk = QSFP_HFI0_I2CCLK;
+       int remaining;
+
+       udelay(1);
+
+       /* SCL is meant to be bare-drain, so never set "OUT", just DIR */
+       hfi1_gpio_mod(dd, target, 0, bit ? 0 : clk, clk);
+
+       /*
+        * Allow for slow slaves by simple
+        * delay for falling edge, sampling on rise.
+        */
+       if (bit) {
+               remaining = SCL_WAIT_USEC;
+               while (remaining > 0) {
+                       if (hfi1_gpio_mod(dd, target, 0, 0, 0) & clk)
+                               break;
+                       udelay(2);
+                       remaining -= 2;
+               }
+               if (remaining <= 0)
+                       dd_dev_err(dd, "SCL interface stuck low > %d uSec\n",
+                                  SCL_WAIT_USEC);
+       } else {
+               udelay(2);
+       }
+       i2c_wait_for_writes(dd, target);
+}
+
+/*
+ * scl_in - release SCL and sample it
+ * @dd: the hfi1_ib device
+ * @target: target argument handed to hfi1_gpio_mod()
+ * @wait: non-zero to call i2c_wait_for_writes() after sampling
+ *
+ * Returns the sampled SCL bit shifted down by GPIO_SCL_NUM.
+ */
+static u8 scl_in(struct hfi1_devdata *dd, u32 target, int wait)
+{
+       const u32 clk = QSFP_HFI0_I2CCLK;
+       u32 pins;
+
+       /* SCL is meant to be bare-drain, so never set "OUT", just DIR */
+       hfi1_gpio_mod(dd, target, 0, 0, clk);
+       pins = hfi1_gpio_mod(dd, target, 0, 0, 0);
+       if (wait)
+               i2c_wait_for_writes(dd, target);
+
+       return (pins & clk) >> GPIO_SCL_NUM;
+}
+
+/*
+ * sda_out - set the state of the SDA (data) line
+ * @dd: the hfi1_ib device
+ * @target: target argument handed to hfi1_gpio_mod()
+ * @bit: non-zero to release SDA, zero to pull it low
+ *
+ * Flushes the write and then delays 2 usec before returning.
+ */
+static void sda_out(struct hfi1_devdata *dd, u32 target, u8 bit)
+{
+       const u32 dat = QSFP_HFI0_I2CDAT;
+       u32 dir = dat;
+
+       if (bit)
+               dir = 0;
+
+       /* SDA is meant to be bare-drain, so never set "OUT", just DIR */
+       hfi1_gpio_mod(dd, target, 0, dir, dat);
+
+       i2c_wait_for_writes(dd, target);
+       udelay(2);
+}
+
+/*
+ * sda_in - release SDA and sample it
+ * @dd: the hfi1_ib device
+ * @target: target argument handed to hfi1_gpio_mod()
+ * @wait: non-zero to call i2c_wait_for_writes() after sampling
+ *
+ * Returns the sampled SDA bit shifted down by GPIO_SDA_NUM.
+ */
+static u8 sda_in(struct hfi1_devdata *dd, u32 target, int wait)
+{
+       const u32 dat = QSFP_HFI0_I2CDAT;
+       u32 pins;
+
+       /* SDA is meant to be bare-drain, so never set "OUT", just DIR */
+       hfi1_gpio_mod(dd, target, 0, 0, dat);
+       pins = hfi1_gpio_mod(dd, target, 0, 0, 0);
+       if (wait)
+               i2c_wait_for_writes(dd, target);
+
+       return (pins & dat) >> GPIO_SDA_NUM;
+}
+
+/**
+ * i2c_ackrcv - see if ack following write is true
+ * @dd: the hfi1_ib device
+ * @target: target argument handed to the SDA/SCL helpers
+ *
+ * Clocks one bit time with SDA released and returns non-zero if SDA was
+ * sampled low while SCL was high (ACK), zero otherwise.
+ */
+static int i2c_ackrcv(struct hfi1_devdata *dd, u32 target)
+{
+       u8 acked;
+
+       /* AT ENTRY SCL = LOW */
+       /* change direction, ignore data */
+       sda_in(dd, target, 1);
+       scl_out(dd, target, 1);
+       acked = !sda_in(dd, target, 1);
+       scl_out(dd, target, 0);
+
+       return acked;
+}
+
+static void stop_cmd(struct hfi1_devdata *dd, u32 target);
+
+/**
+ * rd_byte - clock in eight bits, most significant bit first
+ * @dd: the hfi1_ib device
+ * @target: passed through to the GPIO helpers
+ * @last: non-zero to finish with STOP, zero to ACK the byte
+ * Returns byte shifted out of device
+ */
+static int rd_byte(struct hfi1_devdata *dd, u32 target, int last)
+{
+       int value = 0;
+       int nbits = 8;
+
+       while (nbits--) {
+               scl_out(dd, target, 1);
+               value = (value << 1) | sda_in(dd, target, 0);
+               scl_out(dd, target, 0);
+       }
+       if (!last) {
+               /* ACK: hold SDA low across one clock, then let it go */
+               sda_out(dd, target, 0);
+               scl_out(dd, target, 1);
+               scl_out(dd, target, 0);
+               sda_out(dd, target, 1);
+               return value;
+       }
+       scl_out(dd, target, 1);
+       stop_cmd(dd, target);
+       return value;
+}
+
+/**
+ * wr_byte - shift a byte out MSB first, then check for the ack
+ * @dd: the hfi1_ib device
+ * @target: passed through to the GPIO helpers
+ * @data: the byte to write
+ * Returns 0 if we got the following ack, otherwise 1
+ */
+static int wr_byte(struct hfi1_devdata *dd, u32 target, u8 data)
+{
+       u8 probe;
+
+       for (probe = 0x80; probe; probe >>= 1) {
+               sda_out(dd, target, (data & probe) ? 1 : 0);
+               scl_out(dd, target, 1);
+               scl_out(dd, target, 0);
+       }
+       if (i2c_ackrcv(dd, target))
+               return 0;
+       return 1;
+}
+
+/*
+ * Issue the TWSI START sequence: SDA high, SCL high, then SDA low while
+ * SCL is still high; SCL is taken low after a further 1 usec delay.
+ */
+static void start_seq(struct hfi1_devdata *dd, u32 target)
+{
+       sda_out(dd, target, 1);
+       scl_out(dd, target, 1);
+       sda_out(dd, target, 0);
+       udelay(1);
+       scl_out(dd, target, 0);
+}
+
+/**
+ * stop_seq - transmit the stop sequence
+ * @dd: the hfi1_ib device
+ * @target: passed through to sda_out()/scl_out()
+ * (both clock/data low, clock high, data high while clock is high)
+ */
+static void stop_seq(struct hfi1_devdata *dd, u32 target)
+{
+       scl_out(dd, target, 0);
+       sda_out(dd, target, 0);
+       scl_out(dd, target, 1);
+       sda_out(dd, target, 1);
+}
+
+/**
+ * stop_cmd - transmit the stop condition
+ * @dd: the hfi1_ib device
+ * @target: passed through to stop_seq()
+ * Runs stop_seq(), then delays TWSI_BUF_WAIT_USEC before returning.
+ */
+static void stop_cmd(struct hfi1_devdata *dd, u32 target)
+{
+       stop_seq(dd, target);
+       udelay(TWSI_BUF_WAIT_USEC);
+}
+
+/**
+ * hfi1_twsi_reset - reset I2C communication
+ * @dd: the hfi1_ib device
+ * @target: passed through to the GPIO helpers
+ * returns 0 if SCL and SDA both read high, -EIO if either stays low
+ */
+int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target)
+{
+       int clock_cycles_left = 9;
+       u32 mask;
+
+       /* Both SCL and SDA should be high. If not, there
+        * is something wrong.
+        */
+       mask = QSFP_HFI0_I2CCLK | QSFP_HFI0_I2CDAT;
+
+       /*
+        * Force pins to desired innocuous state.
+        * This is the default power-on state with out=0 and dir=0,
+        * so tri-stated and should be floating high (barring HW problems)
+        */
+       hfi1_gpio_mod(dd, target, 0, 0, mask);
+
+       /* Check if SCL is low, if it is low then we have a slave device
+        * misbehaving and there is not much we can do.
+        */
+       if (!scl_in(dd, target, 0))
+               return -EIO;
+
+       /* Check if SDA is low, if it is low then we have to pulse SCL
+        * up to 9 times for the device to release the bus
+        */
+       while (clock_cycles_left--) {
+               if (sda_in(dd, target, 0))
+                       return 0;
+               scl_out(dd, target, 0);
+               scl_out(dd, target, 1);
+       }
+       /* sample once more: the final pulse may have freed SDA */
+       return sda_in(dd, target, 0) ? 0 : -EIO;
+}
+
+#define HFI1_TWSI_START 0x100
+#define HFI1_TWSI_STOP 0x200
+
+/*
+ * Send one byte on the TWSI bus. HFI1_TWSI_START in @flags issues a
+ * START first; HFI1_TWSI_STOP issues a STOP afterwards.
+ * returns 0 if OK (ACK received), else != 0
+ */
+static int twsi_wr(struct hfi1_devdata *dd, u32 target, int data, int flags)
+{
+       int nak;
+
+       if (flags & HFI1_TWSI_START)
+               start_seq(dd, target);
+
+       /* wr_byte() finishes in i2c_ackrcv(), which leaves SCL low */
+       nak = wr_byte(dd, target, data);
+       if (flags & HFI1_TWSI_STOP)
+               stop_cmd(dd, target);
+       return nak;
+}
+
+/* Added functionality for IBA7220-based cards */
+#define HFI1_TEMP_DEV 0x98
+
+/*
+ * hfi1_twsi_blk_rd
+ * General interface for data transfer from twsi devices.
+ * @dev holds the device code in bits 7:0 and the offset size (0, 1 or
+ * 2 bytes) in bits 15:8. For the legacy HFI1_TWSI_NO_DEV code, @addr
+ * (shifted left by one, with the read bit) is the only byte sent. For
+ * all other devices the device code is followed by offset-size bytes
+ * of @addr, low byte first, selecting the "register" or "offset" from
+ * which @len bytes are read into @buffer. A NAK of any address or
+ * offset byte ends in a STOP. Returns 0 on success, non-zero on failure.
+ */
+int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr,
+                    void *buffer, int len)
+{
+       u8 *bp = buffer;
+       int ret = 1;
+       int i;
+       int offset_size;
+
+       /* obtain the offset size, strip it from the device address */
+       offset_size = (dev >> 8) & 0xff;
+       dev &= 0xff;
+
+       /* allow at most a 2 byte offset */
+       if (offset_size > 2)
+               goto bail;
+
+       if (dev == HFI1_TWSI_NO_DEV) {
+               /* legacy not-really-I2C */
+               addr = (addr << 1) | READ_CMD;
+               ret = twsi_wr(dd, target, addr, HFI1_TWSI_START);
+       } else {
+               /* Actual I2C */
+               if (offset_size) {
+                       ret = twsi_wr(dd, target,
+                                     dev | WRITE_CMD, HFI1_TWSI_START);
+                       if (ret) {
+                               stop_cmd(dd, target);
+                               goto bail;
+                       }
+                       for (i = 0; i < offset_size; i++) {
+                               ret = twsi_wr(dd, target,
+                                             (addr >> (i * 8)) & 0xff, 0);
+                               udelay(TWSI_BUF_WAIT_USEC);
+                               if (ret) {
+                                       dd_dev_err(dd, "Failed to write byte %d of offset 0x%04X\n",
+                                                  i, addr);
+                                       /* don't leave the transfer open */
+                                       stop_cmd(dd, target);
+                                       goto bail;
+                               }
+                       }
+               }
+               ret = twsi_wr(dd, target, dev | READ_CMD, HFI1_TWSI_START);
+       }
+       if (ret) {
+               stop_cmd(dd, target);
+               goto bail;
+       }
+
+       /*
+        * block devices keeps clocking data out as long as we ack,
+        * automatically incrementing the address. Some have "pages"
+        * whose boundaries will not be crossed, but the handling
+        * of these is left to the caller, who is in a better
+        * position to know.
+        */
+       while (len-- > 0) {
+               /*
+                * Get and store data, sending ACK if length remaining,
+                * else STOP
+                */
+               *bp++ = rd_byte(dd, target, !len);
+       }
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+/*
+ * hfi1_twsi_blk_wr
+ * General interface for data transfer to twsi devices.
+ * @dev holds the device code in bits 7:0 and the offset size (0, 1 or
+ * 2 bytes) in bits 15:8. The first byte sent is the device code, or
+ * @addr shifted left by one for the legacy HFI1_TWSI_NO_DEV code, with
+ * the write command bit. It is followed by offset-size bytes of @addr,
+ * low byte first, and then @len bytes from @buffer.
+ * Any started transfer ends in a STOP, whether or not it succeeded.
+ * Returns 0 if every byte was ACKed, non-zero otherwise.
+ */
+int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr,
+                    const void *buffer, int len)
+{
+       const u8 *bp = buffer;
+       int ret = 1;
+       int i;
+       int offset_size;
+
+       /* obtain the offset size, strip it from the device address */
+       offset_size = (dev >> 8) & 0xff;
+       dev &= 0xff;
+
+       /* allow at most a 2 byte offset */
+       if (offset_size > 2)
+               goto bail;
+
+       /* legacy parts take the address where real I2C takes the device */
+       if (dev == HFI1_TWSI_NO_DEV)
+               ret = twsi_wr(dd, target, (addr << 1) | WRITE_CMD,
+                             HFI1_TWSI_START);
+       else
+               ret = twsi_wr(dd, target, dev | WRITE_CMD, HFI1_TWSI_START);
+       if (ret)
+               goto failed_write;
+
+       for (i = 0; i < offset_size; i++) {
+               ret = twsi_wr(dd, target, (addr >> (i * 8)) & 0xff, 0);
+               udelay(TWSI_BUF_WAIT_USEC);
+               if (ret) {
+                       dd_dev_err(dd, "Failed to write byte %d of offset 0x%04X\n",
+                                  i, addr);
+                       goto failed_write;
+               }
+       }
+
+       for (i = 0; i < len; i++) {
+               /* keep the NAK in ret so a failed data byte is reported */
+               ret = twsi_wr(dd, target, *bp++, 0);
+               if (ret)
+                       goto failed_write;
+       }
+
+failed_write:
+       /* also reached on success: every started transfer ends in STOP */
+       stop_cmd(dd, target);
+
+bail:
+       return ret;
+}
diff --git a/drivers/infiniband/hw/hfi1/twsi.h b/drivers/infiniband/hw/hfi1/twsi.h
new file mode 100644 (file)
index 0000000..5b8a5b5
--- /dev/null
@@ -0,0 +1,65 @@
+#ifndef _TWSI_H
+#define _TWSI_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define HFI1_TWSI_NO_DEV 0xFF
+
+struct hfi1_devdata;
+
+/* Bit position of SDA/SCL pins in ASIC_QSFP* registers */
+#define  GPIO_SDA_NUM 1
+#define  GPIO_SCL_NUM 0
+
+/* call these with qsfp_lock held; each returns 0 on success */
+int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target);
+int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr,
+                    void *buffer, int len);
+int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr,
+                    const void *buffer, int len);
+
+#endif /* _TWSI_H */
diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c
new file mode 100644 (file)
index 0000000..df773d4
--- /dev/null
@@ -0,0 +1,604 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "hfi.h"
+#include "verbs_txreq.h"
+#include "qp.h"
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_UC_##x
+
+/* mask of the UC "only" opcodes (low 5 bits each) for adaptive pio */
+const u32 uc_only_opcode =
+       BIT(OP(SEND_ONLY) & 0x1f) |
+       BIT(OP(SEND_ONLY_WITH_IMMEDIATE) & 0x1f) |
+       BIT(OP(RDMA_WRITE_ONLY) & 0x1f) |
+       BIT(OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE) & 0x1f);
+
+/**
+ * hfi1_make_uc_req - construct a request packet (SEND, RDMA write)
+ * @qp: a pointer to the QP
+ * @ps: packet state; ps->s_txreq is obtained here and NULLed on bail
+ *
+ * Assume s_lock is held.
+ * Return 1 if constructed or if a WQE was flushed; otherwise, return 0.
+ */
+int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+       struct hfi1_other_headers *ohdr;
+       struct rvt_swqe *wqe;
+       u32 hwords = 5;
+       u32 bth0 = 0;
+       u32 len;
+       u32 pmtu = qp->pmtu;
+       int middle = 0;
+
+       ps->s_txreq = get_txreq(ps->dev, qp);
+       if (IS_ERR(ps->s_txreq))
+               goto bail_no_tx;
+
+       if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
+               if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
+                       goto bail;
+               /* We are in the error state, flush the work request. */
+               smp_read_barrier_depends(); /* see post_one_send() */
+               if (qp->s_last == ACCESS_ONCE(qp->s_head))
+                       goto bail;
+               /* If DMAs are in progress, we can't flush immediately. */
+               if (iowait_sdma_pending(&priv->s_iowait)) {
+                       qp->s_flags |= RVT_S_WAIT_DMA;
+                       goto bail;
+               }
+               clear_ahg(qp);
+               wqe = rvt_get_swqe_ptr(qp, qp->s_last);
+               hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+               goto done_free_tx;
+       }
+
+       ohdr = &ps->s_txreq->phdr.hdr.u.oth;
+       if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+               ohdr = &ps->s_txreq->phdr.hdr.u.l.oth;
+
+       /* Get the next send request. */
+       wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
+       qp->s_wqe = NULL;
+       switch (qp->s_state) {
+       default:
+               if (!(ib_rvt_state_ops[qp->state] &
+                   RVT_PROCESS_NEXT_SEND_OK))
+                       goto bail;
+               /* Check if send work queue is empty. */
+               smp_read_barrier_depends(); /* see post_one_send() */
+               if (qp->s_cur == ACCESS_ONCE(qp->s_head)) {
+                       clear_ahg(qp);
+                       goto bail;
+               }
+               /*
+                * Start a new request.
+                */
+               qp->s_psn = wqe->psn;
+               qp->s_sge.sge = wqe->sg_list[0];
+               qp->s_sge.sg_list = wqe->sg_list + 1;
+               qp->s_sge.num_sge = wqe->wr.num_sge;
+               qp->s_sge.total_len = wqe->length;
+               len = wqe->length;
+               qp->s_len = len;
+               switch (wqe->wr.opcode) {
+               case IB_WR_SEND:
+               case IB_WR_SEND_WITH_IMM:
+                       if (len > pmtu) {
+                               qp->s_state = OP(SEND_FIRST);
+                               len = pmtu;
+                               break;
+                       }
+                       if (wqe->wr.opcode == IB_WR_SEND) {
+                               qp->s_state = OP(SEND_ONLY);
+                       } else {
+                               qp->s_state =
+                                       OP(SEND_ONLY_WITH_IMMEDIATE);
+                               /* Immediate data comes after the BTH */
+                               ohdr->u.imm_data = wqe->wr.ex.imm_data;
+                               hwords += 1;
+                       }
+                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                               bth0 |= IB_BTH_SOLICITED;
+                       qp->s_wqe = wqe;
+                       if (++qp->s_cur >= qp->s_size)
+                               qp->s_cur = 0;
+                       break;
+
+               case IB_WR_RDMA_WRITE:
+               case IB_WR_RDMA_WRITE_WITH_IMM:
+                       ohdr->u.rc.reth.vaddr =
+                               cpu_to_be64(wqe->rdma_wr.remote_addr);
+                       ohdr->u.rc.reth.rkey =
+                               cpu_to_be32(wqe->rdma_wr.rkey);
+                       ohdr->u.rc.reth.length = cpu_to_be32(len);
+                       hwords += sizeof(struct ib_reth) / 4;
+                       if (len > pmtu) {
+                               qp->s_state = OP(RDMA_WRITE_FIRST);
+                               len = pmtu;
+                               break;
+                       }
+                       if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
+                               qp->s_state = OP(RDMA_WRITE_ONLY);
+                       } else {
+                               qp->s_state =
+                                       OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
+                               /* Immediate data comes after the RETH */
+                               ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
+                               hwords += 1;
+                               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                                       bth0 |= IB_BTH_SOLICITED;
+                       }
+                       qp->s_wqe = wqe;
+                       if (++qp->s_cur >= qp->s_size)
+                               qp->s_cur = 0;
+                       break;
+
+               default:
+                       goto bail;
+               }
+               break;
+
+       case OP(SEND_FIRST):
+               qp->s_state = OP(SEND_MIDDLE);
+               /* FALLTHROUGH */
+       case OP(SEND_MIDDLE):
+               len = qp->s_len;
+               if (len > pmtu) {
+                       len = pmtu;
+                       middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+                       break;
+               }
+               if (wqe->wr.opcode == IB_WR_SEND) {
+                       qp->s_state = OP(SEND_LAST);
+               } else {
+                       qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
+                       /* Immediate data comes after the BTH */
+                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
+                       hwords += 1;
+               }
+               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                       bth0 |= IB_BTH_SOLICITED;
+               qp->s_wqe = wqe;
+               if (++qp->s_cur >= qp->s_size)
+                       qp->s_cur = 0;
+               break;
+
+       case OP(RDMA_WRITE_FIRST):
+               qp->s_state = OP(RDMA_WRITE_MIDDLE);
+               /* FALLTHROUGH */
+       case OP(RDMA_WRITE_MIDDLE):
+               len = qp->s_len;
+               if (len > pmtu) {
+                       len = pmtu;
+                       middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+                       break;
+               }
+               if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
+                       qp->s_state = OP(RDMA_WRITE_LAST);
+               } else {
+                       qp->s_state =
+                               OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
+                       /* Immediate data comes after the BTH */
+                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
+                       hwords += 1;
+                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                               bth0 |= IB_BTH_SOLICITED;
+               }
+               qp->s_wqe = wqe;
+               if (++qp->s_cur >= qp->s_size)
+                       qp->s_cur = 0;
+               break;
+       }
+       qp->s_len -= len;
+       qp->s_hdrwords = hwords;
+       ps->s_txreq->sde = priv->s_sde;
+       qp->s_cur_sge = &qp->s_sge;
+       qp->s_cur_size = len;
+       hfi1_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24),
+                            mask_psn(qp->s_psn++), middle, ps);
+       /* two more dwords on top of the header words, for the pbc */
+       ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
+       return 1;
+
+done_free_tx:
+       hfi1_put_txreq(ps->s_txreq);
+       ps->s_txreq = NULL;
+       return 1;
+
+bail:
+       hfi1_put_txreq(ps->s_txreq);
+
+bail_no_tx:
+       ps->s_txreq = NULL;
+       qp->s_flags &= ~RVT_S_BUSY;
+       qp->s_hdrwords = 0;
+       return 0;
+}
+
+/**
+ * hfi1_uc_rcv - handle an incoming UC packet
+ * @ibp: the port the packet came in on
+ * @hdr: the header of the packet
+ * @rcv_flags: flags relevant to rcv processing
+ * @data: the packet data
+ * @tlen: the length of the packet
+ * @qp: the QP for this packet.
+ *
+ * This is called from qp_rcv() to process an incoming UC packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void hfi1_uc_rcv(struct hfi1_packet *packet)
+{
+       struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
+       struct hfi1_ib_header *hdr = packet->hdr;
+       u32 rcv_flags = packet->rcv_flags;
+       void *data = packet->ebuf;
+       u32 tlen = packet->tlen;
+       struct rvt_qp *qp = packet->qp;
+       struct hfi1_other_headers *ohdr = packet->ohdr;
+       u32 bth0, opcode;
+       u32 hdrsize = packet->hlen;
+       u32 psn;
+       u32 pad;
+       struct ib_wc wc;
+       u32 pmtu = qp->pmtu;
+       struct ib_reth *reth;
+       int has_grh = rcv_flags & HFI1_HAS_GRH;
+       int ret;
+       u32 bth1;
+
+       bth0 = be32_to_cpu(ohdr->bth[0]);
+       if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, bth0))
+               return;
+
+       bth1 = be32_to_cpu(ohdr->bth[1]);
+       if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) {
+               if (bth1 & HFI1_BECN_SMASK) {
+                       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+                       u32 rqpn, lqpn;
+                       u16 rlid = be16_to_cpu(hdr->lrh[3]);
+                       u8 sl, sc5;
+
+                       lqpn = bth1 & RVT_QPN_MASK;
+                       rqpn = qp->remote_qpn;
+
+                       sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+                       sl = ibp->sc_to_sl[sc5];
+
+                       process_becn(ppd, sl, rlid, lqpn, rqpn,
+                                    IB_CC_SVCTYPE_UC);
+               }
+
+               if (bth1 & HFI1_FECN_SMASK) {
+                       struct ib_grh *grh = NULL;
+                       u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]);
+                       u16 slid = be16_to_cpu(hdr->lrh[3]);
+                       u16 dlid = be16_to_cpu(hdr->lrh[1]);
+                       u32 src_qp = qp->remote_qpn;
+                       u8 sc5;
+
+                       sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+                       if (has_grh)
+                               grh = &hdr->u.l.grh;
+
+                       return_cnp(ibp, qp, src_qp, pkey, dlid, slid, sc5,
+                                  grh);
+               }
+       }
+
+       psn = be32_to_cpu(ohdr->bth[2]);
+       opcode = (bth0 >> 24) & 0xff;
+
+       /* Compare the PSN verses the expected PSN. */
+       if (unlikely(cmp_psn(psn, qp->r_psn) != 0)) {
+               /*
+                * Handle a sequence error.
+                * Silently drop any current message.
+                */
+               qp->r_psn = psn;
+inv:
+               if (qp->r_state == OP(SEND_FIRST) ||
+                   qp->r_state == OP(SEND_MIDDLE)) {
+                       set_bit(RVT_R_REWIND_SGE, &qp->r_aflags);
+                       qp->r_sge.num_sge = 0;
+               } else {
+                       rvt_put_ss(&qp->r_sge);
+               }
+               qp->r_state = OP(SEND_LAST);
+               switch (opcode) {
+               case OP(SEND_FIRST):
+               case OP(SEND_ONLY):
+               case OP(SEND_ONLY_WITH_IMMEDIATE):
+                       goto send_first;
+
+               case OP(RDMA_WRITE_FIRST):
+               case OP(RDMA_WRITE_ONLY):
+               case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+                       goto rdma_first;
+
+               default:
+                       goto drop;
+               }
+       }
+
+       /* Check for opcode sequence errors. */
+       switch (qp->r_state) {
+       case OP(SEND_FIRST):
+       case OP(SEND_MIDDLE):
+               if (opcode == OP(SEND_MIDDLE) ||
+                   opcode == OP(SEND_LAST) ||
+                   opcode == OP(SEND_LAST_WITH_IMMEDIATE))
+                       break;
+               goto inv;
+
+       case OP(RDMA_WRITE_FIRST):
+       case OP(RDMA_WRITE_MIDDLE):
+               if (opcode == OP(RDMA_WRITE_MIDDLE) ||
+                   opcode == OP(RDMA_WRITE_LAST) ||
+                   opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+                       break;
+               goto inv;
+
+       default:
+               if (opcode == OP(SEND_FIRST) ||
+                   opcode == OP(SEND_ONLY) ||
+                   opcode == OP(SEND_ONLY_WITH_IMMEDIATE) ||
+                   opcode == OP(RDMA_WRITE_FIRST) ||
+                   opcode == OP(RDMA_WRITE_ONLY) ||
+                   opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+                       break;
+               goto inv;
+       }
+
+       if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
+               qp_comm_est(qp);
+
+       /* OK, process the packet. */
+       switch (opcode) {
+       case OP(SEND_FIRST):
+       case OP(SEND_ONLY):
+       case OP(SEND_ONLY_WITH_IMMEDIATE):
+send_first:
+               if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) {
+                       qp->r_sge = qp->s_rdma_read_sge;
+               } else {
+                       ret = hfi1_rvt_get_rwqe(qp, 0);
+                       if (ret < 0)
+                               goto op_err;
+                       if (!ret)
+                               goto drop;
+                       /*
+                        * qp->s_rdma_read_sge will be the owner
+                        * of the mr references.
+                        */
+                       qp->s_rdma_read_sge = qp->r_sge;
+               }
+               qp->r_rcv_len = 0;
+               if (opcode == OP(SEND_ONLY))
+                       goto no_immediate_data;
+               else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE))
+                       goto send_last_imm;
+               /* FALLTHROUGH */
+       case OP(SEND_MIDDLE):
+               /* Check for invalid length PMTU or posted rwqe len. */
+               if (unlikely(tlen != (hdrsize + pmtu + 4)))
+                       goto rewind;
+               qp->r_rcv_len += pmtu;
+               if (unlikely(qp->r_rcv_len > qp->r_len))
+                       goto rewind;
+               hfi1_copy_sge(&qp->r_sge, data, pmtu, 0, 0);
+               break;
+
+       case OP(SEND_LAST_WITH_IMMEDIATE):
+send_last_imm:
+               wc.ex.imm_data = ohdr->u.imm_data;
+               wc.wc_flags = IB_WC_WITH_IMM;
+               goto send_last;
+       case OP(SEND_LAST):
+no_immediate_data:
+               wc.ex.imm_data = 0;
+               wc.wc_flags = 0;
+send_last:
+               /* Get the number of bytes the message was padded by. */
+               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+               /* Check for invalid length. */
+               /* LAST len should be >= 1 */
+               if (unlikely(tlen < (hdrsize + pad + 4)))
+                       goto rewind;
+               /* Don't count the CRC. */
+               tlen -= (hdrsize + pad + 4);
+               wc.byte_len = tlen + qp->r_rcv_len;
+               if (unlikely(wc.byte_len > qp->r_len))
+                       goto rewind;
+               wc.opcode = IB_WC_RECV;
+               hfi1_copy_sge(&qp->r_sge, data, tlen, 0, 0);
+               rvt_put_ss(&qp->s_rdma_read_sge);
+last_imm:
+               wc.wr_id = qp->r_wr_id;
+               wc.status = IB_WC_SUCCESS;
+               wc.qp = &qp->ibqp;
+               wc.src_qp = qp->remote_qpn;
+               wc.slid = qp->remote_ah_attr.dlid;
+               /*
+                * It seems that IB mandates the presence of an SL in a
+                * work completion only for the UD transport (see section
+                * 11.4.2 of IBTA Vol. 1).
+                *
+                * However, the way the SL is chosen below is consistent
+                * with the way that IB/qib works and is trying avoid
+                * introducing incompatibilities.
+                *
+                * See also OPA Vol. 1, section 9.7.6, and table 9-17.
+                */
+               wc.sl = qp->remote_ah_attr.sl;
+               /* zero fields that are N/A */
+               wc.vendor_err = 0;
+               wc.pkey_index = 0;
+               wc.dlid_path_bits = 0;
+               wc.port_num = 0;
+               /* Signal completion event if the solicited bit is set. */
+               rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
+                            (ohdr->bth[0] &
+                             cpu_to_be32(IB_BTH_SOLICITED)) != 0);
+               break;
+
+       case OP(RDMA_WRITE_FIRST):
+       case OP(RDMA_WRITE_ONLY):
+       case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */
+rdma_first:
+               if (unlikely(!(qp->qp_access_flags &
+                              IB_ACCESS_REMOTE_WRITE))) {
+                       goto drop;
+               }
+               reth = &ohdr->u.rc.reth;
+               qp->r_len = be32_to_cpu(reth->length);
+               qp->r_rcv_len = 0;
+               qp->r_sge.sg_list = NULL;
+               if (qp->r_len != 0) {
+                       u32 rkey = be32_to_cpu(reth->rkey);
+                       u64 vaddr = be64_to_cpu(reth->vaddr);
+                       int ok;
+
+                       /* Check rkey */
+                       ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len,
+                                        vaddr, rkey, IB_ACCESS_REMOTE_WRITE);
+                       if (unlikely(!ok))
+                               goto drop;
+                       qp->r_sge.num_sge = 1;
+               } else {
+                       qp->r_sge.num_sge = 0;
+                       qp->r_sge.sge.mr = NULL;
+                       qp->r_sge.sge.vaddr = NULL;
+                       qp->r_sge.sge.length = 0;
+                       qp->r_sge.sge.sge_length = 0;
+               }
+               if (opcode == OP(RDMA_WRITE_ONLY)) {
+                       goto rdma_last;
+               } else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) {
+                       wc.ex.imm_data = ohdr->u.rc.imm_data;
+                       goto rdma_last_imm;
+               }
+               /* FALLTHROUGH */
+       case OP(RDMA_WRITE_MIDDLE):
+               /* Check for invalid length PMTU or posted rwqe len. */
+               if (unlikely(tlen != (hdrsize + pmtu + 4)))
+                       goto drop;
+               qp->r_rcv_len += pmtu;
+               if (unlikely(qp->r_rcv_len > qp->r_len))
+                       goto drop;
+               hfi1_copy_sge(&qp->r_sge, data, pmtu, 1, 0);
+               break;
+
+       case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
+               wc.ex.imm_data = ohdr->u.imm_data;
+rdma_last_imm:
+               wc.wc_flags = IB_WC_WITH_IMM;
+
+               /* Get the number of bytes the message was padded by. */
+               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+               /* Check for invalid length. */
+               /* LAST len should be >= 1 */
+               if (unlikely(tlen < (hdrsize + pad + 4)))
+                       goto drop;
+               /* Don't count the CRC. */
+               tlen -= (hdrsize + pad + 4);
+               if (unlikely(tlen + qp->r_rcv_len != qp->r_len))
+                       goto drop;
+               if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) {
+                       rvt_put_ss(&qp->s_rdma_read_sge);
+               } else {
+                       ret = hfi1_rvt_get_rwqe(qp, 1);
+                       if (ret < 0)
+                               goto op_err;
+                       if (!ret)
+                               goto drop;
+               }
+               wc.byte_len = qp->r_len;
+               wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+               hfi1_copy_sge(&qp->r_sge, data, tlen, 1, 0);
+               rvt_put_ss(&qp->r_sge);
+               goto last_imm;
+
+       case OP(RDMA_WRITE_LAST):
+rdma_last:
+               /* Get the number of bytes the message was padded by. */
+               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+               /* Check for invalid length. */
+               /* LAST len should be >= 1 */
+               if (unlikely(tlen < (hdrsize + pad + 4)))
+                       goto drop;
+               /* Don't count the CRC. */
+               tlen -= (hdrsize + pad + 4);
+               if (unlikely(tlen + qp->r_rcv_len != qp->r_len))
+                       goto drop;
+               hfi1_copy_sge(&qp->r_sge, data, tlen, 1, 0);
+               rvt_put_ss(&qp->r_sge);
+               break;
+
+       default:
+               /* Drop packet for unknown opcodes. */
+               goto drop;
+       }
+       qp->r_psn++;
+       qp->r_state = opcode;
+       return;
+
+rewind:
+       set_bit(RVT_R_REWIND_SGE, &qp->r_aflags);
+       qp->r_sge.num_sge = 0;
+drop:
+       ibp->rvp.n_pkt_drops++;
+       return;
+
+op_err:
+       hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+}
diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c
new file mode 100644 (file)
index 0000000..1e503ad
--- /dev/null
@@ -0,0 +1,911 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/net.h>
+#include <rdma/ib_smi.h>
+
+#include "hfi.h"
+#include "mad.h"
+#include "verbs_txreq.h"
+#include "qp.h"
+
+/**
+ * ud_loopback - handle send on loopback QPs
+ * @sqp: the sending QP
+ * @swqe: the send work request
+ *
+ * This is called from hfi1_make_ud_req() to forward a WQE addressed
+ * to the same HFI.
+ * Note that the receive interrupt handler may be calling hfi1_ud_rcv()
+ * while this is being called.
+ *
+ * The destination QP is looked up from the WQE's remote QPN inside an
+ * rcu_read_lock() section.  After the QP type/state, pkey and qkey
+ * checks pass, the payload is copied from the send SGEs directly into
+ * the destination QP's receive SGEs while holding the destination's
+ * r_lock, and a receive completion is entered on its recv CQ.
+ *
+ * The function returns nothing: a packet that is dropped for any reason
+ * is not reported back to the caller.
+ */
+static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
+{
+       struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num);
+       struct hfi1_pportdata *ppd;
+       struct rvt_qp *qp;
+       struct ib_ah_attr *ah_attr;
+       unsigned long flags;
+       struct rvt_sge_state ssge;
+       struct rvt_sge *sge;
+       struct ib_wc wc;
+       u32 length;
+       enum ib_qp_type sqptype, dqptype;
+
+       rcu_read_lock();
+
+       /* Find the destination QP; an unknown QPN is counted as a drop. */
+       qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp,
+                           swqe->ud_wr.remote_qpn);
+       if (!qp) {
+               ibp->rvp.n_pkt_drops++;
+               rcu_read_unlock();
+               return;
+       }
+
+       /* For the type comparison below, a GSI QP is treated as a UD QP. */
+       sqptype = sqp->ibqp.qp_type == IB_QPT_GSI ?
+                       IB_QPT_UD : sqp->ibqp.qp_type;
+       dqptype = qp->ibqp.qp_type == IB_QPT_GSI ?
+                       IB_QPT_UD : qp->ibqp.qp_type;
+
+       /* Types must match and the destination must be able to receive. */
+       if (dqptype != sqptype ||
+           !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+               ibp->rvp.n_pkt_drops++;
+               goto drop;
+       }
+
+       ah_attr = &ibah_to_rvtah(swqe->ud_wr.ah)->attr;
+       ppd = ppd_from_ibp(ibp);
+
+       /* The pkey check is only done for destination QP numbers above 1. */
+       if (qp->ibqp.qp_num > 1) {
+               u16 pkey;
+               u16 slid;
+               u8 sc5 = ibp->sl_to_sc[ah_attr->sl];
+
+               pkey = hfi1_get_pkey(ibp, sqp->s_pkey_index);
+               /* Source LID: port LID with the AH's path bits, limited by LMC. */
+               slid = ppd->lid | (ah_attr->src_path_bits &
+                                  ((1 << ppd->lmc) - 1));
+               if (unlikely(ingress_pkey_check(ppd, pkey, sc5,
+                                               qp->s_pkey_index, slid))) {
+                       hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY, pkey,
+                                      ah_attr->sl,
+                                      sqp->ibqp.qp_num, qp->ibqp.qp_num,
+                                      slid, ah_attr->dlid);
+                       goto drop;
+               }
+       }
+
+       /*
+        * Check that the qkey matches (except for QP0, see 9.6.1.4.1).
+        * Qkeys with the high order bit set mean use the
+        * qkey from the QP context instead of the WR (see 10.2.5).
+        */
+       if (qp->ibqp.qp_num) {
+               u32 qkey;
+
+               /* The (int) cast turns "high order bit set" into "< 0". */
+               qkey = (int)swqe->ud_wr.remote_qkey < 0 ?
+                       sqp->qkey : swqe->ud_wr.remote_qkey;
+               if (unlikely(qkey != qp->qkey)) {
+                       u16 lid;
+
+                       lid = ppd->lid | (ah_attr->src_path_bits &
+                                         ((1 << ppd->lmc) - 1));
+                       hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_Q_KEY, qkey,
+                                      ah_attr->sl,
+                                      sqp->ibqp.qp_num, qp->ibqp.qp_num,
+                                      lid,
+                                      ah_attr->dlid);
+                       goto drop;
+               }
+       }
+
+       /*
+        * A GRH is expected to precede the data even if not
+        * present on the wire.
+        */
+       length = swqe->length;
+       memset(&wc, 0, sizeof(wc));
+       wc.byte_len = length + sizeof(struct ib_grh);
+
+       if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+               wc.wc_flags = IB_WC_WITH_IMM;
+               wc.ex.imm_data = swqe->wr.ex.imm_data;
+       }
+
+       /* Everything from here to bail_unlock runs under the receiver's r_lock. */
+       spin_lock_irqsave(&qp->r_lock, flags);
+
+       /*
+        * Get the next work request entry to find where to put the data.
+        * If RVT_R_REUSE_SGE is set (see the "too big" case below), the
+        * flag is cleared and no new entry is fetched.
+        */
+       if (qp->r_flags & RVT_R_REUSE_SGE) {
+               qp->r_flags &= ~RVT_R_REUSE_SGE;
+       } else {
+               int ret;
+
+               ret = hfi1_rvt_get_rwqe(qp, 0);
+               if (ret < 0) {
+                       hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+                       goto bail_unlock;
+               }
+               if (!ret) {
+                       /* Nothing to receive into; only QP0 counts this. */
+                       if (qp->ibqp.qp_num == 0)
+                               ibp->rvp.n_vl15_dropped++;
+                       goto bail_unlock;
+               }
+       }
+       /* Silently drop packets which are too big. */
+       if (unlikely(wc.byte_len > qp->r_len)) {
+               qp->r_flags |= RVT_R_REUSE_SGE;
+               ibp->rvp.n_pkt_drops++;
+               goto bail_unlock;
+       }
+
+       /*
+        * The first sizeof(struct ib_grh) bytes of the receive buffer hold
+        * the AH's GRH when it has one; otherwise that space is skipped.
+        */
+       if (ah_attr->ah_flags & IB_AH_GRH) {
+               hfi1_copy_sge(&qp->r_sge, &ah_attr->grh,
+                             sizeof(struct ib_grh), 1, 0);
+               wc.wc_flags |= IB_WC_GRH;
+       } else {
+               hfi1_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1);
+       }
+       /*
+        * Walk a local copy of the send SGE state (ssge) so that the WQE's
+        * own sg_list is not modified by the copy loop.
+        */
+       ssge.sg_list = swqe->sg_list + 1;
+       ssge.sge = *swqe->sg_list;
+       ssge.num_sge = swqe->wr.num_sge;
+       sge = &ssge.sge;
+       while (length) {
+               u32 len = sge->length;
+
+               /* Copy no more than what remains of the message or the SGE. */
+               if (len > length)
+                       len = length;
+               if (len > sge->sge_length)
+                       len = sge->sge_length;
+               WARN_ON_ONCE(len == 0);
+               hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, 1, 0);
+               sge->vaddr += len;
+               sge->length -= len;
+               sge->sge_length -= len;
+               if (sge->sge_length == 0) {
+                       /* This SGE is used up: load the next one, if any. */
+                       if (--ssge.num_sge)
+                               *sge = *ssge.sg_list++;
+               } else if (sge->length == 0 && sge->mr->lkey) {
+                       /*
+                        * Current segment is used up but the SGE is not:
+                        * step to the next segment (n) of the MR, moving
+                        * to the next map (m) after RVT_SEGSZ segments.
+                        */
+                       if (++sge->n >= RVT_SEGSZ) {
+                               if (++sge->m >= sge->mr->mapsz)
+                                       break;
+                               sge->n = 0;
+                       }
+                       sge->vaddr =
+                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
+                       sge->length =
+                               sge->mr->map[sge->m]->segs[sge->n].length;
+               }
+               length -= len;
+       }
+       rvt_put_ss(&qp->r_sge);
+       /* No completion is generated unless RVT_R_WRID_VALID was set. */
+       if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
+               goto bail_unlock;
+       wc.wr_id = qp->r_wr_id;
+       wc.status = IB_WC_SUCCESS;
+       wc.opcode = IB_WC_RECV;
+       wc.qp = &qp->ibqp;
+       wc.src_qp = sqp->ibqp.qp_num;
+       /*
+        * pkey_index is only reported to GSI/SMI receivers: taken from the
+        * WR when the sender is GSI/SMI too, else from the sending QP.
+        */
+       if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) {
+               if (sqp->ibqp.qp_type == IB_QPT_GSI ||
+                   sqp->ibqp.qp_type == IB_QPT_SMI)
+                       wc.pkey_index = swqe->ud_wr.pkey_index;
+               else
+                       wc.pkey_index = sqp->s_pkey_index;
+       } else {
+               wc.pkey_index = 0;
+       }
+       wc.slid = ppd->lid | (ah_attr->src_path_bits & ((1 << ppd->lmc) - 1));
+       /* Check for loopback when the port lid is not set */
+       if (wc.slid == 0 && sqp->ibqp.qp_type == IB_QPT_GSI)
+               wc.slid = be16_to_cpu(IB_LID_PERMISSIVE);
+       wc.sl = ah_attr->sl;
+       wc.dlid_path_bits = ah_attr->dlid & ((1 << ppd->lmc) - 1);
+       wc.port_num = qp->port_num;
+       /* Signal completion event if the solicited bit is set. */
+       rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
+                    swqe->wr.send_flags & IB_SEND_SOLICITED);
+       ibp->rvp.n_loop_pkts++;
+bail_unlock:
+       spin_unlock_irqrestore(&qp->r_lock, flags);
+drop:
+       rcu_read_unlock();
+}
+
+/**
+ * hfi1_make_ud_req - construct a UD request packet
+ * @qp: the QP
+ * @ps: packet state; supplies the device for the txreq allocation,
+ *      receives the txreq in ps->s_txreq, and carries the saved IRQ
+ *      flags (ps->flags) used when s_lock is dropped for loopback
+ *
+ * Assume s_lock is held.
+ *
+ * Return 1 if a packet was constructed (ps->s_txreq is set), or if a WQE
+ * was completed without building a packet (flushed in the error state,
+ * or delivered by loopback), in which case ps->s_txreq is NULL.
+ * Otherwise return 0 with RVT_S_BUSY cleared and ps->s_txreq NULL.
+ */
+int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+       struct hfi1_other_headers *ohdr;
+       struct ib_ah_attr *ah_attr;
+       struct hfi1_pportdata *ppd;
+       struct hfi1_ibport *ibp;
+       struct rvt_swqe *wqe;
+       u32 nwords;
+       u32 extra_bytes;
+       u32 bth0;
+       u16 lrh0;
+       u16 lid;
+       int next_cur;
+       u8 sc5;
+
+       ps->s_txreq = get_txreq(ps->dev, qp);
+       if (IS_ERR(ps->s_txreq))
+               goto bail_no_tx;
+
+       if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK)) {
+               if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
+                       goto bail;
+               /* We are in the error state, flush the work request. */
+               smp_read_barrier_depends(); /* see post_one_send */
+               if (qp->s_last == ACCESS_ONCE(qp->s_head))
+                       goto bail;
+               /* If DMAs are in progress, we can't flush immediately. */
+               if (iowait_sdma_pending(&priv->s_iowait)) {
+                       qp->s_flags |= RVT_S_WAIT_DMA;
+                       goto bail;
+               }
+               wqe = rvt_get_swqe_ptr(qp, qp->s_last);
+               hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+               goto done_free_tx;
+       }
+
+       /* see post_one_send() */
+       smp_read_barrier_depends();
+       /* Nothing queued beyond the current position: no work to do. */
+       if (qp->s_cur == ACCESS_ONCE(qp->s_head))
+               goto bail;
+
+       wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
+       /* Index of the following WQE, wrapping at the end of the send queue. */
+       next_cur = qp->s_cur + 1;
+       if (next_cur >= qp->s_size)
+               next_cur = 0;
+
+       /* Construct the header. */
+       ibp = to_iport(qp->ibqp.device, qp->port_num);
+       ppd = ppd_from_ibp(ibp);
+       ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr;
+       /* Only non-multicast or permissive DLIDs are loopback candidates. */
+       if (ah_attr->dlid < be16_to_cpu(IB_MULTICAST_LID_BASE) ||
+           ah_attr->dlid == be16_to_cpu(IB_LID_PERMISSIVE)) {
+               /* Strip the LMC path bits before comparing with our LID. */
+               lid = ah_attr->dlid & ~((1 << ppd->lmc) - 1);
+               /*
+                * NOTE(review): "loopback" is not declared in this function;
+                * presumably a driver-wide setting defined elsewhere.
+                */
+               if (unlikely(!loopback &&
+                            (lid == ppd->lid ||
+                             (lid == be16_to_cpu(IB_LID_PERMISSIVE) &&
+                             qp->ibqp.qp_type == IB_QPT_GSI)))) {
+                       unsigned long tflags = ps->flags;
+                       /*
+                        * If DMAs are in progress, we can't generate
+                        * a completion for the loopback packet since
+                        * it would be out of order.
+                        * Instead of waiting, we could queue a
+                        * zero length descriptor so we get a callback.
+                        */
+                       if (iowait_sdma_pending(&priv->s_iowait)) {
+                               qp->s_flags |= RVT_S_WAIT_DMA;
+                               goto bail;
+                       }
+                       qp->s_cur = next_cur;
+                       /* s_lock is released around the loopback delivery. */
+                       spin_unlock_irqrestore(&qp->s_lock, tflags);
+                       ud_loopback(qp, wqe);
+                       spin_lock_irqsave(&qp->s_lock, tflags);
+                       ps->flags = tflags;
+                       hfi1_send_complete(qp, wqe, IB_WC_SUCCESS);
+                       goto done_free_tx;
+               }
+       }
+
+       qp->s_cur = next_cur;
+       /* Pad bytes needed to round the payload up to a multiple of 4. */
+       extra_bytes = -wqe->length & 3;
+       nwords = (wqe->length + extra_bytes) >> 2;
+
+       /* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */
+       qp->s_hdrwords = 7;
+       qp->s_cur_size = wqe->length;
+       qp->s_cur_sge = &qp->s_sge;
+       qp->s_srate = ah_attr->static_rate;
+       qp->srate_mbps = ib_rate_to_mbps(qp->s_srate);
+       qp->s_wqe = wqe;
+       qp->s_sge.sge = wqe->sg_list[0];
+       qp->s_sge.sg_list = wqe->sg_list + 1;
+       qp->s_sge.num_sge = wqe->wr.num_sge;
+       qp->s_sge.total_len = wqe->length;
+
+       if (ah_attr->ah_flags & IB_AH_GRH) {
+               /* Header size in 32-bit words. */
+               qp->s_hdrwords += hfi1_make_grh(ibp,
+                                               &ps->s_txreq->phdr.hdr.u.l.grh,
+                                               &ah_attr->grh,
+                                               qp->s_hdrwords, nwords);
+               lrh0 = HFI1_LRH_GRH;
+               ohdr = &ps->s_txreq->phdr.hdr.u.l.oth;
+               /*
+                * Don't worry about sending to locally attached multicast
+                * QPs.  It is unspecified by the spec. what happens.
+                */
+       } else {
+               /* No GRH: the header word count is left unchanged. */
+               lrh0 = HFI1_LRH_BTH;
+               ohdr = &ps->s_txreq->phdr.hdr.u.oth;
+       }
+       if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+               /* Immediate data adds one more header word. */
+               qp->s_hdrwords++;
+               ohdr->u.ud.imm_data = wqe->wr.ex.imm_data;
+               bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24;
+       } else {
+               bth0 = IB_OPCODE_UD_SEND_ONLY << 24;
+       }
+       sc5 = ibp->sl_to_sc[ah_attr->sl];
+       lrh0 |= (ah_attr->sl & 0xf) << 4;
+       if (qp->ibqp.qp_type == IB_QPT_SMI) {
+               lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */
+               priv->s_sc = 0xf;
+       } else {
+               lrh0 |= (sc5 & 0xf) << 12;
+               priv->s_sc = sc5;
+       }
+       /* Pick the SDMA engine and send context from the QP and its SC. */
+       priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc);
+       ps->s_txreq->sde = priv->s_sde;
+       priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc);
+       ps->s_txreq->psc = priv->s_sendcontext;
+       ps->s_txreq->phdr.hdr.lrh[0] = cpu_to_be16(lrh0);
+       ps->s_txreq->phdr.hdr.lrh[1] = cpu_to_be16(ah_attr->dlid);
+       ps->s_txreq->phdr.hdr.lrh[2] =
+               cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
+       /*
+        * Source LID: permissive when the destination is permissive or the
+        * port has no LID yet; otherwise the port LID plus the path bits.
+        */
+       if (ah_attr->dlid == be16_to_cpu(IB_LID_PERMISSIVE)) {
+               ps->s_txreq->phdr.hdr.lrh[3] = IB_LID_PERMISSIVE;
+       } else {
+               lid = ppd->lid;
+               if (lid) {
+                       lid |= ah_attr->src_path_bits & ((1 << ppd->lmc) - 1);
+                       ps->s_txreq->phdr.hdr.lrh[3] = cpu_to_be16(lid);
+               } else {
+                       ps->s_txreq->phdr.hdr.lrh[3] = IB_LID_PERMISSIVE;
+               }
+       }
+       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+               bth0 |= IB_BTH_SOLICITED;
+       /* Pad count goes in BTH[0] bits 21:20. */
+       bth0 |= extra_bytes << 20;
+       /* GSI/SMI QPs take the pkey index from the WR, others from the QP. */
+       if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI)
+               bth0 |= hfi1_get_pkey(ibp, wqe->ud_wr.pkey_index);
+       else
+               bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index);
+       ohdr->bth[0] = cpu_to_be32(bth0);
+       ohdr->bth[1] = cpu_to_be32(wqe->ud_wr.remote_qpn);
+       ohdr->bth[2] = cpu_to_be32(mask_psn(wqe->psn));
+       /*
+        * Qkeys with the high order bit set mean use the
+        * qkey from the QP context instead of the WR (see 10.2.5).
+        */
+       ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.remote_qkey < 0 ?
+                                        qp->qkey : wqe->ud_wr.remote_qkey);
+       ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
+       /* disarm any ahg */
+       priv->s_hdr->ahgcount = 0;
+       priv->s_hdr->ahgidx = 0;
+       priv->s_hdr->tx_flags = 0;
+       priv->s_hdr->sde = NULL;
+       /* pbc */
+       ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
+
+       return 1;
+
+       /* A WQE was completed without a packet: release the unused txreq. */
+done_free_tx:
+       hfi1_put_txreq(ps->s_txreq);
+       ps->s_txreq = NULL;
+       return 1;
+
+       /* Nothing to send now: release the txreq and mark the QP not busy. */
+bail:
+       hfi1_put_txreq(ps->s_txreq);
+
+       /* Entered directly when get_txreq() failed, so there is nothing to put. */
+bail_no_tx:
+       ps->s_txreq = NULL;
+       qp->s_flags &= ~RVT_S_BUSY;
+       qp->s_hdrwords = 0;
+       return 0;
+}
+
+/*
+ * hfi1_lookup_pkey_idx - find the pkey table index matching a pkey
+ * @ibp: the port whose pkey table is searched
+ * @pkey: the pkey to look up
+ *
+ * Hardware can't check this so we do it here.
+ *
+ * This is a slightly different algorithm than the standard pkey check.  It
+ * special cases the management keys and allows for 0x7fff and 0xffff to be in
+ * the table at the same time.
+ *
+ * For the two management pkeys an exact match is required, except that a
+ * lookup of FULL_MGMT_P_KEY falls back to the entry holding
+ * LIM_MGMT_P_KEY when no exact match exists.  Any other pkey is compared
+ * with the membership bit (bit 15) ignored on both sides.
+ *
+ * @returns the index found or -1 if not found
+ */
+int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey)
+{
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       unsigned i;
+
+       if (pkey == FULL_MGMT_P_KEY || pkey == LIM_MGMT_P_KEY) {
+               /*
+                * Signed, so that the "not found" value of -1 is returned
+                * as-is rather than through an out-of-range unsigned to
+                * int conversion.
+                */
+               int lim_idx = -1;
+
+               for (i = 0; i < ARRAY_SIZE(ppd->pkeys); ++i) {
+                       /* here we look for an exact match */
+                       if (ppd->pkeys[i] == pkey)
+                               return i;
+                       /* remember the last limited-management entry seen */
+                       if (ppd->pkeys[i] == LIM_MGMT_P_KEY)
+                               lim_idx = i;
+               }
+
+               /* did not find 0xffff return 0x7fff idx if found */
+               if (pkey == FULL_MGMT_P_KEY)
+                       return lim_idx;
+
+               /* no match...  */
+               return -1;
+       }
+
+       pkey &= 0x7fff; /* remove limited/full membership bit */
+
+       /* first entry whose low 15 bits match wins */
+       for (i = 0; i < ARRAY_SIZE(ppd->pkeys); ++i)
+               if ((ppd->pkeys[i] & 0x7fff) == pkey)
+                       return i;
+
+       /*
+        * Should not get here, this means hardware failed to validate pkeys.
+        */
+       return -1;
+}
+
+/**
+ * return_cnp - build a CNP packet on the stack and send it by PIO
+ * @ibp: port whose SC-to-SL table and port data are used
+ * @qp: QP used to select the send context and the rate (srate_mbps)
+ * @remote_qpn: QPN written to BTH[1], together with the BECN bit
+ * @pkey: pkey written to the low bits of BTH[0]
+ * @slid: value written to LRH[3]
+ * @dlid: value written to LRH[1]
+ * @sc5: SC; its low 4 bits go in LRH[0], bit 4 goes in the PBC flags
+ * @old_grh: if non-NULL, a GRH is included whose sgid and dgid are this
+ *           GRH's dgid and sgid (i.e. swapped)
+ *
+ * The header starts at 5 32-bit words (presumably LRH + BTH; TODO
+ * confirm) and grows by the size of a GRH when @old_grh is given.
+ *
+ * Nothing is sent, and nothing is reported to the caller, when no send
+ * context is found or no PIO buffer can be allocated.
+ *
+ * NOTE(review): hdr is not zeroed and grh->next_hdr is never written in
+ * this function, so that GRH byte appears to go out uninitialized;
+ * confirm whether it should be set explicitly.
+ */
+void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn,
+               u32 pkey, u32 slid, u32 dlid, u8 sc5,
+               const struct ib_grh *old_grh)
+{
+       u64 pbc, pbc_flags = 0;
+       u32 bth0, plen, vl, hwords = 5;
+       u16 lrh0;
+       u8 sl = ibp->sc_to_sl[sc5];
+       struct hfi1_ib_header hdr;
+       struct hfi1_other_headers *ohdr;
+       struct pio_buf *pbuf;
+       struct send_context *ctxt = qp_to_send_context(qp, sc5);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+       if (old_grh) {
+               struct ib_grh *grh = &hdr.u.l.grh;
+
+               grh->version_tclass_flow = old_grh->version_tclass_flow;
+               /*
+                * paylen (in bytes) is computed from hwords before the GRH
+                * words are added below, so statement order matters here.
+                */
+               grh->paylen = cpu_to_be16((hwords - 2 + SIZE_OF_CRC) << 2);
+               grh->hop_limit = 0xff;
+               /* Reply goes back the way it came: swap the GIDs. */
+               grh->sgid = old_grh->dgid;
+               grh->dgid = old_grh->sgid;
+               ohdr = &hdr.u.l.oth;
+               lrh0 = HFI1_LRH_GRH;
+               hwords += sizeof(struct ib_grh) / sizeof(u32);
+       } else {
+               ohdr = &hdr.u.oth;
+               lrh0 = HFI1_LRH_BTH;
+       }
+
+       /* LRH[0]: low 4 bits of the SC in bits 15:12, SL from bit 4 up. */
+       lrh0 |= (sc5 & 0xf) << 12 | sl << 4;
+
+       bth0 = pkey | (IB_OPCODE_CNP << 24);
+       ohdr->bth[0] = cpu_to_be32(bth0);
+
+       ohdr->bth[1] = cpu_to_be32(remote_qpn | (1 << HFI1_BECN_SHIFT));
+       ohdr->bth[2] = 0; /* PSN 0 */
+
+       hdr.lrh[0] = cpu_to_be16(lrh0);
+       hdr.lrh[1] = cpu_to_be16(dlid);
+       hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
+       hdr.lrh[3] = cpu_to_be16(slid);
+
+       plen = 2 /* PBC */ + hwords;
+       /* Bit 4 of the SC is carried in the PBC rather than in the LRH. */
+       pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
+       vl = sc_to_vlt(ppd->dd, sc5);
+       pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
+       /* Best effort: skip the send if there is no context or no buffer. */
+       if (ctxt) {
+               pbuf = sc_buffer_alloc(ctxt, plen, NULL, NULL);
+               if (pbuf)
+                       ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc,
+                                                &hdr, hwords);
+       }
+}
+
+/*
+ * opa_smp_check() - validate the pkey of an SMP that arrived from
+ * another node, applying the regular pkey check plus the extra SMP rules
+ * of OPAv1 rev 0.90, section 9.10.26 ("SMA Packet Checks").
+ *
+ * Things to keep in mind:
+ *   - The pkey examined is the one taken straight from the packet's BTH,
+ *     _not_ the pkey attached to the completion (the two may differ).
+ *   - Only "non-local" SMPs are handled here.  SMPs that both originate
+ *     from and target this node go through opa_local_smp_check().
+ *   - The caller has already established that the destination is QP0.
+ *   - @qp is not used by the checks below.
+ *
+ * Returns 0 when the SMP may be processed, 1 when it must be dropped.
+ */
+static int opa_smp_check(struct hfi1_ibport *ibp, u16 pkey, u8 sc5,
+                        struct rvt_qp *qp, u16 slid, struct opa_smp *smp)
+{
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       bool subn_class;
+
+       /*
+        * An SC other than 0xf is not expected to reach this point, but
+        * reject it explicitly rather than rely on that.
+        */
+       if (sc5 != 0xf)
+               return 1;
+
+       /* Regular receive pkey check; a failure is not counted here. */
+       if (rcv_pkey_check(ppd, pkey, sc5, slid))
+               return 1;
+
+       /*
+        * From here on the pkey is known to be one of LIM_MGMT_P_KEY or
+        * FULL_MGMT_P_KEY (see ingress_pkey_check), so that is not
+        * re-verified.  Only the two subnet management classes are
+        * acceptable.
+        */
+       subn_class = smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE ||
+                    smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED;
+       if (!subn_class)
+               goto constraint_err;
+
+       /*
+        * Every SMP is exactly one of: SMA request, SMA response, trap,
+        * or trap repress, and the rule applied depends on which.
+        *
+        * SMA request or trap repress:
+        *   - only FULL_MGMT_P_KEY is accepted; anything else is a
+        *     port recv constraint error and the MAD is dropped.
+        * Everything else listed below (responses, sends, traps):
+        *   - accepted outright when this port is running an SM
+        *   - FULL_MGMT_P_KEY: accepted, but the status field is marked
+        *     "unsupported method" so normal processing replies with it
+        *   - LIM_MGMT_P_KEY: accepted
+        *   - any other pkey: port recv constraint error, MAD dropped
+        * Methods not listed are accepted without further checks.
+        */
+       switch (smp->method) {
+       case IB_MGMT_METHOD_GET:
+       case IB_MGMT_METHOD_SET:
+       case IB_MGMT_METHOD_REPORT:
+       case IB_MGMT_METHOD_TRAP_REPRESS:
+               if (pkey == FULL_MGMT_P_KEY)
+                       return 0;
+               goto constraint_err;
+       case IB_MGMT_METHOD_SEND:
+       case IB_MGMT_METHOD_TRAP:
+       case IB_MGMT_METHOD_GET_RESP:
+       case IB_MGMT_METHOD_REPORT_RESP:
+               if (ibp->rvp.port_cap_flags & IB_PORT_SM)
+                       return 0;
+               if (pkey == FULL_MGMT_P_KEY) {
+                       smp->status |= IB_SMP_UNSUP_METHOD;
+                       return 0;
+               }
+               if (pkey == LIM_MGMT_P_KEY)
+                       return 0;
+               goto constraint_err;
+       default:
+               return 0;
+       }
+
+constraint_err:
+       /* Record the pkey violation, then tell the caller to drop. */
+       ingress_pkey_table_fail(ppd, pkey, slid);
+       return 1;
+}
+
+/**
+ * hfi1_ud_rcv - receive an incoming UD packet
+ * @ibp: the port the packet came in on
+ * @hdr: the packet header
+ * @rcv_flags: flags relevant to rcv processing
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP the packet came on
+ *
+ * This is called from qp_rcv() to process an incoming UD packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void hfi1_ud_rcv(struct hfi1_packet *packet)
+{
+       struct hfi1_other_headers *ohdr = packet->ohdr;
+       int opcode;
+       u32 hdrsize = packet->hlen;
+       u32 pad;
+       struct ib_wc wc;
+       u32 qkey;
+       u32 src_qp;
+       u16 dlid, pkey;
+       int mgmt_pkey_idx = -1;
+       struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
+       struct hfi1_ib_header *hdr = packet->hdr;
+       u32 rcv_flags = packet->rcv_flags;
+       void *data = packet->ebuf;
+       u32 tlen = packet->tlen;
+       struct rvt_qp *qp = packet->qp;
+       bool has_grh = rcv_flags & HFI1_HAS_GRH;
+       bool sc4_bit = has_sc4_bit(packet);
+       u8 sc;
+       u32 bth1;
+       int is_mcast;
+       struct ib_grh *grh = NULL;
+
+       qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
+       src_qp = be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK;
+       dlid = be16_to_cpu(hdr->lrh[1]);
+       is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) &&
+                       (dlid != be16_to_cpu(IB_LID_PERMISSIVE));
+       bth1 = be32_to_cpu(ohdr->bth[1]);
+       if (unlikely(bth1 & HFI1_BECN_SMASK)) {
+               /*
+                * In pre-B0 h/w the CNP_OPCODE is handled via an
+                * error path.
+                */
+               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+               u32 lqpn =  be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
+               u8 sl, sc5;
+
+               sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+               sc5 |= sc4_bit;
+               sl = ibp->sc_to_sl[sc5];
+
+               process_becn(ppd, sl, 0, lqpn, 0, IB_CC_SVCTYPE_UD);
+       }
+
+       /*
+        * The opcode is in the low byte when its in network order
+        * (top byte when in host order).
+        */
+       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+       opcode &= 0xff;
+
+       pkey = (u16)be32_to_cpu(ohdr->bth[0]);
+
+       if (!is_mcast && (opcode != IB_OPCODE_CNP) && bth1 & HFI1_FECN_SMASK) {
+               u16 slid = be16_to_cpu(hdr->lrh[3]);
+               u8 sc5;
+
+               sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+               sc5 |= sc4_bit;
+
+               return_cnp(ibp, qp, src_qp, pkey, dlid, slid, sc5, grh);
+       }
+       /*
+        * Get the number of bytes the message was padded by
+        * and drop incomplete packets.
+        */
+       pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+       if (unlikely(tlen < (hdrsize + pad + 4)))
+               goto drop;
+
+       tlen -= hdrsize + pad + 4;
+
+       /*
+        * Check that the permissive LID is only used on QP0
+        * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1).
+        */
+       if (qp->ibqp.qp_num) {
+               if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE ||
+                            hdr->lrh[3] == IB_LID_PERMISSIVE))
+                       goto drop;
+               if (qp->ibqp.qp_num > 1) {
+                       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+                       u16 slid;
+                       u8 sc5;
+
+                       sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+                       sc5 |= sc4_bit;
+
+                       slid = be16_to_cpu(hdr->lrh[3]);
+                       if (unlikely(rcv_pkey_check(ppd, pkey, sc5, slid))) {
+                               /*
+                                * Traps will not be sent for packets dropped
+                                * by the HW. This is fine, as sending trap
+                                * for invalid pkeys is optional according to
+                                * IB spec (release 1.3, section 10.9.4)
+                                */
+                               hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY,
+                                              pkey,
+                                              (be16_to_cpu(hdr->lrh[0]) >> 4) &
+                                               0xF,
+                                              src_qp, qp->ibqp.qp_num,
+                                              be16_to_cpu(hdr->lrh[3]),
+                                              be16_to_cpu(hdr->lrh[1]));
+                               return;
+                       }
+               } else {
+                       /* GSI packet */
+                       mgmt_pkey_idx = hfi1_lookup_pkey_idx(ibp, pkey);
+                       if (mgmt_pkey_idx < 0)
+                               goto drop;
+               }
+               if (unlikely(qkey != qp->qkey)) {
+                       hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_Q_KEY, qkey,
+                                      (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
+                                      src_qp, qp->ibqp.qp_num,
+                                      be16_to_cpu(hdr->lrh[3]),
+                                      be16_to_cpu(hdr->lrh[1]));
+                       return;
+               }
+               /* Drop invalid MAD packets (see 13.5.3.1). */
+               if (unlikely(qp->ibqp.qp_num == 1 &&
+                            (tlen > 2048 ||
+                             (be16_to_cpu(hdr->lrh[0]) >> 12) == 15)))
+                       goto drop;
+       } else {
+               /* Received on QP0, and so by definition, this is an SMP */
+               struct opa_smp *smp = (struct opa_smp *)data;
+               u16 slid = be16_to_cpu(hdr->lrh[3]);
+               u8 sc5;
+
+               sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+               sc5 |= sc4_bit;
+
+               if (opa_smp_check(ibp, pkey, sc5, qp, slid, smp))
+                       goto drop;
+
+               if (tlen > 2048)
+                       goto drop;
+               if ((hdr->lrh[1] == IB_LID_PERMISSIVE ||
+                    hdr->lrh[3] == IB_LID_PERMISSIVE) &&
+                   smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+                       goto drop;
+
+               /* look up SMI pkey */
+               mgmt_pkey_idx = hfi1_lookup_pkey_idx(ibp, pkey);
+               if (mgmt_pkey_idx < 0)
+                       goto drop;
+       }
+
+       if (qp->ibqp.qp_num > 1 &&
+           opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
+               wc.ex.imm_data = ohdr->u.ud.imm_data;
+               wc.wc_flags = IB_WC_WITH_IMM;
+               tlen -= sizeof(u32);
+       } else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
+               wc.ex.imm_data = 0;
+               wc.wc_flags = 0;
+       } else {
+               goto drop;
+       }
+
+       /*
+        * A GRH is expected to precede the data even if not
+        * present on the wire.
+        */
+       wc.byte_len = tlen + sizeof(struct ib_grh);
+
+       /*
+        * Get the next work request entry to find where to put the data.
+        */
+       if (qp->r_flags & RVT_R_REUSE_SGE) {
+               qp->r_flags &= ~RVT_R_REUSE_SGE;
+       } else {
+               int ret;
+
+               ret = hfi1_rvt_get_rwqe(qp, 0);
+               if (ret < 0) {
+                       hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+                       return;
+               }
+               if (!ret) {
+                       if (qp->ibqp.qp_num == 0)
+                               ibp->rvp.n_vl15_dropped++;
+                       return;
+               }
+       }
+       /* Silently drop packets which are too big. */
+       if (unlikely(wc.byte_len > qp->r_len)) {
+               qp->r_flags |= RVT_R_REUSE_SGE;
+               goto drop;
+       }
+       if (has_grh) {
+               hfi1_copy_sge(&qp->r_sge, &hdr->u.l.grh,
+                             sizeof(struct ib_grh), 1, 0);
+               wc.wc_flags |= IB_WC_GRH;
+       } else {
+               hfi1_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1);
+       }
+       hfi1_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh),
+                     1, 0);
+       rvt_put_ss(&qp->r_sge);
+       if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
+               return;
+       wc.wr_id = qp->r_wr_id;
+       wc.status = IB_WC_SUCCESS;
+       wc.opcode = IB_WC_RECV;
+       wc.vendor_err = 0;
+       wc.qp = &qp->ibqp;
+       wc.src_qp = src_qp;
+
+       if (qp->ibqp.qp_type == IB_QPT_GSI ||
+           qp->ibqp.qp_type == IB_QPT_SMI) {
+               if (mgmt_pkey_idx < 0) {
+                       if (net_ratelimit()) {
+                               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+                               struct hfi1_devdata *dd = ppd->dd;
+
+                               dd_dev_err(dd, "QP type %d mgmt_pkey_idx < 0 and packet not dropped???\n",
+                                          qp->ibqp.qp_type);
+                               mgmt_pkey_idx = 0;
+                       }
+               }
+               wc.pkey_index = (unsigned)mgmt_pkey_idx;
+       } else {
+               wc.pkey_index = 0;
+       }
+
+       wc.slid = be16_to_cpu(hdr->lrh[3]);
+       sc = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+       sc |= sc4_bit;
+       wc.sl = ibp->sc_to_sl[sc];
+
+       /*
+        * Save the LMC lower bits if the destination LID is a unicast LID.
+        */
+       wc.dlid_path_bits = dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE) ? 0 :
+               dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1);
+       wc.port_num = qp->port_num;
+       /* Signal completion event if the solicited bit is set. */
+       rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
+                    (ohdr->bth[0] &
+                     cpu_to_be32(IB_BTH_SOLICITED)) != 0);
+       return;
+
+drop:
+       ibp->rvp.n_pkt_drops++;
+}
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
new file mode 100644 (file)
index 0000000..1b640a3
--- /dev/null
@@ -0,0 +1,1050 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <asm/page.h>
+
+#include "user_exp_rcv.h"
+#include "trace.h"
+#include "mmu_rb.h"
+
+struct tid_group {
+       struct list_head list;  /* links the group into an exp_tid_set list */
+       unsigned base;          /* index of the group's first RcvArray entry */
+       u8 size;                /* number of RcvArray entries in the group */
+       u8 used;                /* entries in use; group is full at == size */
+       u8 map;                 /* presumably an in-use bitmap -- TODO confirm */
+};
+
+/*
+ * Per-RcvArray-entry tracking node. The node embeds the generic MMU
+ * rb-tree node and carries a variable number of page pointers at its end.
+ * NOTE(review): the individual fields are filled in outside this part of
+ * the file; the comments below marked "presumably" need confirming there.
+ */
+struct tid_rb_node {
+       struct mmu_rb_node mmu;         /* embedded MMU rb-tree node */
+       unsigned long phys;
+       struct tid_group *grp;          /* presumably the owning group */
+       u32 rcventry;
+       dma_addr_t dma_addr;
+       bool freed;
+       unsigned npages;                /* presumably the length of pages[] */
+       /*
+        * C99 flexible array member instead of the GNU zero-length array
+        * extension; it has to remain the last member of the structure.
+        */
+       struct page *pages[];
+};
+
+struct tid_pageset {
+       u16 idx;        /* starting index of this set in the struct page * array */
+       u16 count;      /* number of pages in this set */
+};
+
+/*
+ * True when a struct exp_tid_set holds no groups. The argument is the
+ * structure itself (not a pointer); it is parenthesized so that any
+ * lvalue expression can be passed safely.
+ */
+#define EXP_TID_SET_EMPTY(set) ((set).count == 0 && list_empty(&(set).list))
+
+/*
+ * Number of pages touched by the len-byte buffer that starts at vaddr:
+ * one plus the distance, in pages, between the page holding the first
+ * byte and the page holding the last byte.
+ * Note that vaddr is evaluated twice, so it must be free of side effects.
+ */
+#define num_user_pages(vaddr, len)                                    \
+       (1 + (((((unsigned long)(vaddr) +                              \
+                (unsigned long)(len) - 1) & PAGE_MASK) -              \
+              ((unsigned long)(vaddr) & PAGE_MASK)) >> PAGE_SHIFT))
+
+static void unlock_exp_tids(struct hfi1_ctxtdata *, struct exp_tid_set *,
+                           struct rb_root *);
+static u32 find_phys_blocks(struct page **, unsigned, struct tid_pageset *);
+static int set_rcvarray_entry(struct file *, unsigned long, u32,
+                             struct tid_group *, struct page **, unsigned);
+static int mmu_rb_insert(struct rb_root *, struct mmu_rb_node *);
+static void mmu_rb_remove(struct rb_root *, struct mmu_rb_node *,
+                         struct mm_struct *);
+static int mmu_rb_invalidate(struct rb_root *, struct mmu_rb_node *);
+static int program_rcvarray(struct file *, unsigned long, struct tid_group *,
+                           struct tid_pageset *, unsigned, u16, struct page **,
+                           u32 *, unsigned *, unsigned *);
+static int unprogram_rcvarray(struct file *, u32, struct tid_group **);
+static void clear_tid_node(struct hfi1_filedata *, u16, struct tid_rb_node *);
+
+/* Callbacks passed to hfi1_mmu_rb_register() for the per-file TID tree. */
+static struct mmu_rb_ops tid_rb_ops = {
+       .insert     = mmu_rb_insert,
+       .remove     = mmu_rb_remove,
+       .invalidate = mmu_rb_invalidate,
+};
+
+/*
+ * Build the TID info word for a RcvArray entry. Entries are handled in
+ * pairs: the IDX field receives the pair number (rcventry / 2) and the
+ * CTRL field receives 1 for the even entry of the pair, 2 for the odd one.
+ */
+static inline u32 rcventry2tidinfo(u32 rcventry)
+{
+       u32 odd = rcventry & 0x1;
+
+       return EXP_TID_SET(IDX, rcventry >> 1) |
+               EXP_TID_SET(CTRL, 1 << odd);
+}
+
+/* Put a TID group set into its empty state: zero count, empty list. */
+static inline void exp_tid_group_init(struct exp_tid_set *set)
+{
+       set->count = 0;
+       INIT_LIST_HEAD(&set->list);
+}
+
+/* Unlink grp from set and account for the removal in the set's count. */
+static inline void tid_group_remove(struct tid_group *grp,
+                                   struct exp_tid_set *set)
+{
+       set->count--;
+       list_del_init(&grp->list);
+}
+
+/* Append grp to the end of set and account for it in the set's count. */
+static inline void tid_group_add_tail(struct tid_group *grp,
+                                     struct exp_tid_set *set)
+{
+       set->count++;
+       list_add_tail(&grp->list, &set->list);
+}
+
+/*
+ * Detach and return the first group of set. There is no emptiness check
+ * here, so callers have to make sure the set holds at least one group.
+ */
+static inline struct tid_group *tid_group_pop(struct exp_tid_set *set)
+{
+       struct tid_group *first;
+
+       first = list_first_entry(&set->list, struct tid_group, list);
+       tid_group_remove(first, set);
+       return first;
+}
+
+/* Move group from set s1 to the tail of set s2, updating both counts. */
+static inline void tid_group_move(struct tid_group *group,
+                                 struct exp_tid_set *s1,
+                                 struct exp_tid_set *s2)
+{
+       /* Leave the source set ... */
+       list_del_init(&group->list);
+       s1->count--;
+
+       /* ... and join the destination set at its end. */
+       list_add_tail(&group->list, &s2->list);
+       s2->count++;
+}
+
+/*
+ * Initialize context and file private data needed for Expected
+ * receive caching. This needs to be done after the context has
+ * been configured with the eager/expected RcvEntry counts.
+ *
+ * Returns 0 on success or -ENOMEM when an allocation fails. Nothing is
+ * unwound here on failure; per the comment in the group loop, what was
+ * already allocated is released by the close path.
+ */
+int hfi1_user_exp_rcv_init(struct file *fp)
+{
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct hfi1_devdata *dd = uctxt->dd;
+       unsigned tidbase;
+       int i, ret = 0;
+
+       spin_lock_init(&fd->tid_lock);
+       spin_lock_init(&fd->invalid_lock);
+       fd->tid_rb_root = RB_ROOT;
+
+       /*
+        * The group lists live in the shared context, so only a context
+        * without sub-contexts, or sub-context 0 of a shared one, builds
+        * them.
+        */
+       if (!uctxt->subctxt_cnt || !fd->subctxt) {
+               exp_tid_group_init(&uctxt->tid_group_list);
+               exp_tid_group_init(&uctxt->tid_used_list);
+               exp_tid_group_init(&uctxt->tid_full_list);
+
+               tidbase = uctxt->expected_base;
+               for (i = 0; i < uctxt->expected_count /
+                            dd->rcv_entries.group_size; i++) {
+                       struct tid_group *grp;
+
+                       grp = kzalloc(sizeof(*grp), GFP_KERNEL);
+                       if (!grp) {
+                               /*
+                                * If we fail here, the groups already
+                                * allocated will be freed by the close
+                                * call.
+                                */
+                               ret = -ENOMEM;
+                               goto done;
+                       }
+                       grp->size = dd->rcv_entries.group_size;
+                       grp->base = tidbase;
+                       tid_group_add_tail(grp, &uctxt->tid_group_list);
+                       tidbase += dd->rcv_entries.group_size;
+               }
+       }
+
+       fd->entry_to_rb = kcalloc(uctxt->expected_count,
+                                 sizeof(struct rb_node *),
+                                 GFP_KERNEL);
+       if (!fd->entry_to_rb)
+               return -ENOMEM;
+
+       if (!HFI1_CAP_IS_USET(TID_UNMAP)) {
+               fd->invalid_tid_idx = 0;
+               /*
+                * kcalloc() rather than an open-coded count * size
+                * multiplication: the array size is overflow-checked and
+                * the memory is still zeroed.
+                */
+               fd->invalid_tids = kcalloc(uctxt->expected_count,
+                                          sizeof(u32), GFP_KERNEL);
+               if (!fd->invalid_tids) {
+                       ret = -ENOMEM;
+                       goto done;
+               }
+
+               /*
+                * Register MMU notifier callbacks. If the registration
+                * fails, continue but turn off the TID caching for
+                * all user contexts.
+                */
+               ret = hfi1_mmu_rb_register(&fd->tid_rb_root, &tid_rb_ops);
+               if (ret) {
+                       dd_dev_info(dd,
+                                   "Failed MMU notifier registration %d\n",
+                                   ret);
+                       HFI1_CAP_USET(TID_UNMAP);
+                       ret = 0;
+               }
+       }
+
+       /*
+        * PSM does not have a good way to separate, count, and
+        * effectively enforce a limit on RcvArray entries used by
+        * subctxts (when context sharing is used) when TID caching
+        * is enabled. To help with that, we calculate a per-process
+        * RcvArray entry share and enforce that.
+        * If TID caching is not in use, PSM deals with usage on its
+        * own. In that case, we allow any subctxt to take all of the
+        * entries.
+        *
+        * Make sure that we set the tid counts only after successful
+        * init.
+        */
+       spin_lock(&fd->tid_lock);
+       if (uctxt->subctxt_cnt && !HFI1_CAP_IS_USET(TID_UNMAP)) {
+               u16 remainder;
+
+               fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
+               /* The first "remainder" sub-contexts get one extra entry. */
+               remainder = uctxt->expected_count % uctxt->subctxt_cnt;
+               if (remainder && fd->subctxt < remainder)
+                       fd->tid_limit++;
+       } else {
+               fd->tid_limit = uctxt->expected_count;
+       }
+       spin_unlock(&fd->tid_lock);
+done:
+       return ret;
+}
+
+/*
+ * Release the Expected receive state attached to a file descriptor.
+ * Nothing is done unless the context finished its setup. The group lists
+ * of the shared context are torn down only when uctxt->cnt is zero.
+ * Always returns 0.
+ */
+int hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
+{
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct rb_root *root = &fd->tid_rb_root;
+       struct tid_group *cur, *next;
+
+       if (!test_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags))
+               return 0;
+
+       /*
+        * The notifier would have been removed when the process's mm
+        * was freed.
+        */
+       if (!HFI1_CAP_IS_USET(TID_UNMAP))
+               hfi1_mmu_rb_unregister(root);
+
+       kfree(fd->invalid_tids);
+
+       /* Shared context state is left alone while uctxt->cnt is non-zero. */
+       if (uctxt->cnt)
+               goto free_map;
+
+       if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
+               unlock_exp_tids(uctxt, &uctxt->tid_full_list, root);
+       if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
+               unlock_exp_tids(uctxt, &uctxt->tid_used_list, root);
+
+       /* Free every group that sits on the empty-group list. */
+       list_for_each_entry_safe(cur, next, &uctxt->tid_group_list.list,
+                                list) {
+               list_del_init(&cur->list);
+               kfree(cur);
+       }
+       hfi1_clear_tids(uctxt);
+
+free_map:
+       kfree(fd->entry_to_rb);
+       return 0;
+}
+
+/*
+ * Write an "empty" RcvArray entry.
+ * The TID registration code uses this to write to unused/unneeded
+ * entries so that it can still take advantage of the WC performance
+ * improvements. The HFI will ignore this write to the RcvArray entry.
+ */
+static inline void rcv_array_wc_fill(struct hfi1_devdata *dd, u32 index)
+{
+       /*
+        * The fill write is pointless unless the device is present and
+        * the RcvArray has been mapped as WC memory.
+        */
+       if (!(dd->flags & HFI1_PRESENT) || !dd->rcvarray_wc)
+               return;
+
+       /* Each RcvArray entry is 8 bytes wide. */
+       writeq(0, dd->rcvarray_wc + (index * 8));
+}
+
+/*
+ * Undo the programming of the first count entries of tidlist, a kernel
+ * array of TID info words produced by hfi1_user_exp_rcv_setup(). This is
+ * best effort: an entry that fails to unprogram is logged and skipped, and
+ * fd->tid_used is lowered only by the number of entries actually undone.
+ */
+static void undo_programmed_tids(struct file *fp, const u32 *tidlist,
+                                unsigned count)
+{
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       unsigned i, undone = 0;
+
+       mutex_lock(&uctxt->exp_lock);
+       for (i = 0; i < count; i++) {
+               int ret = unprogram_rcvarray(fp, tidlist[i], NULL);
+
+               if (ret)
+                       hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
+                                 ret);
+               else
+                       undone++;
+       }
+       spin_lock(&fd->tid_lock);
+       fd->tid_used -= undone;
+       spin_unlock(&fd->tid_lock);
+       mutex_unlock(&uctxt->exp_lock);
+}
+
+/*
+ * RcvArray entry allocation for Expected Receives is done by the
+ * following algorithm:
+ *
+ * The context keeps 3 lists of groups of RcvArray entries:
+ *   1. List of empty groups - tid_group_list
+ *      This list is created during user context creation and
+ *      contains elements which describe sets (of 8) of empty
+ *      RcvArray entries.
+ *   2. List of partially used groups - tid_used_list
+ *      This list contains sets of RcvArray entries which are
+ *      not completely used up. Another mapping request could
+ *      use some or all of the remaining entries.
+ *   3. List of full groups - tid_full_list
+ *      This is the list where sets that are completely used
+ *      up go.
+ *
+ * An attempt to optimize the usage of RcvArray entries is
+ * made by finding all sets of physically contiguous pages in a
+ * user's buffer.
+ * These physically contiguous sets are further split into
+ * sizes supported by the receive engine of the HFI. The
+ * resulting sets of pages are stored in struct tid_pageset,
+ * which describes the sets as:
+ *    * .count - number of pages in this set
+ *    * .idx - starting index into struct page ** array
+ *                    of this set
+ *
+ * From this point on, the algorithm deals with the page sets
+ * described above. The number of pagesets is divided by the
+ * RcvArray group size to produce the number of full groups
+ * needed.
+ *
+ * Groups from the 3 lists are manipulated using the following
+ * rules:
+ *   1. For each set of 8 pagesets, a complete group from
+ *      tid_group_list is taken, programmed, and moved to
+ *      the tid_full_list list.
+ *   2. For all remaining pagesets:
+ *      2.1 If the tid_used_list is empty and the tid_group_list
+ *          is empty, stop processing pagesets and return only
+ *          what has been programmed up to this point.
+ *      2.2 If the tid_used_list is empty and the tid_group_list
+ *          is not empty, move a group from tid_group_list to
+ *          tid_used_list.
+ *      2.3 For each group in tid_used_list, program as much as
+ *          can fit into the group. If the group becomes fully
+ *          used, move it to tid_full_list.
+ *
+ * On success 0 is returned and, when at least one entry was programmed,
+ * tinfo->tidcnt and tinfo->length describe what was programmed and the
+ * TID list has been copied to the user buffer at tinfo->tidlist.
+ * A negative errno is returned on failure.
+ */
+int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo)
+{
+       int ret = 0, need_group = 0, pinned;
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct hfi1_devdata *dd = uctxt->dd;
+       unsigned npages, ngroups, pageidx = 0, pageset_count, npagesets,
+               tididx = 0, mapped, mapped_pages = 0;
+       unsigned long vaddr = tinfo->vaddr;
+       struct page **pages = NULL;
+       u32 *tidlist = NULL;
+       struct tid_pageset *pagesets = NULL;
+
+       /* Get the number of pages the user buffer spans */
+       npages = num_user_pages(vaddr, tinfo->length);
+       if (!npages)
+               return -EINVAL;
+
+       if (npages > uctxt->expected_count) {
+               dd_dev_err(dd, "Expected buffer too big\n");
+               return -EINVAL;
+       }
+
+       /* Verify that access is OK for the user buffer */
+       if (!access_ok(VERIFY_WRITE, (void __user *)vaddr,
+                      npages * PAGE_SIZE)) {
+               dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n",
+                          (void *)vaddr, npages);
+               return -EFAULT;
+       }
+
+       pagesets = kcalloc(uctxt->expected_count, sizeof(*pagesets),
+                          GFP_KERNEL);
+       if (!pagesets)
+               return -ENOMEM;
+
+       /* Allocate the array of struct page pointers needed for pinning */
+       pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
+       if (!pages) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+
+       /*
+        * Pin all the pages of the user buffer. If we can't pin all the
+        * pages, accept the amount pinned so far and program only that.
+        * User space knows how to deal with partially programmed buffers.
+        */
+       if (!hfi1_can_pin_pages(dd, fd->tid_n_pinned, npages)) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+
+       pinned = hfi1_acquire_user_pages(vaddr, npages, true, pages);
+       if (pinned <= 0) {
+               ret = pinned;
+               goto bail;
+       }
+       /*
+        * Charge only the pages that were really pinned. The release at
+        * the end of this function subtracts "pinned - mapped_pages", so
+        * charging npages would leave the counter too high whenever fewer
+        * than npages pages could be pinned.
+        */
+       fd->tid_n_pinned += pinned;
+
+       /* Find sets of physically contiguous pages */
+       npagesets = find_phys_blocks(pages, pinned, pagesets);
+
+       /*
+        * Clamp the request to what is left of this process's RcvArray
+        * entry budget. tid_used and tid_limit are read under tid_lock,
+        * the same lock under which they are updated.
+        */
+       spin_lock(&fd->tid_lock);
+       if (fd->tid_used + npagesets > fd->tid_limit)
+               pageset_count = fd->tid_limit - fd->tid_used;
+       else
+               pageset_count = npagesets;
+       spin_unlock(&fd->tid_lock);
+
+       /*
+        * Nothing can be programmed. The pages are pinned already, so
+        * leave through the path that releases them instead of jumping
+        * straight to the memory cleanup.
+        */
+       if (!pageset_count)
+               goto finish;
+
+       ngroups = pageset_count / dd->rcv_entries.group_size;
+       tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
+       if (!tidlist) {
+               ret = -ENOMEM;
+               goto finish;
+       }
+
+       tididx = 0;
+
+       /*
+        * From this point on, we are going to be using shared (between master
+        * and subcontexts) context resources. We need to take the lock.
+        */
+       mutex_lock(&uctxt->exp_lock);
+       /*
+        * The first step is to program the RcvArray entries which are complete
+        * groups.
+        */
+       while (ngroups && uctxt->tid_group_list.count) {
+               struct tid_group *grp =
+                       tid_group_pop(&uctxt->tid_group_list);
+
+               ret = program_rcvarray(fp, vaddr, grp, pagesets,
+                                      pageidx, dd->rcv_entries.group_size,
+                                      pages, tidlist, &tididx, &mapped);
+               /*
+                * If there was a failure to program the RcvArray
+                * entries for the entire group, reset the grp fields
+                * and add the grp back to the free group list.
+                */
+               if (ret <= 0) {
+                       tid_group_add_tail(grp, &uctxt->tid_group_list);
+                       hfi1_cdbg(TID,
+                                 "Failed to program RcvArray group %d", ret);
+                       goto unlock;
+               }
+
+               tid_group_add_tail(grp, &uctxt->tid_full_list);
+               ngroups--;
+               pageidx += ret;
+               mapped_pages += mapped;
+       }
+
+       while (pageidx < pageset_count) {
+               struct tid_group *grp, *ptr;
+               /*
+                * If we don't have any partially used tid groups, check
+                * if we have empty groups. If so, take one from there and
+                * put in the partially used list.
+                */
+               if (!uctxt->tid_used_list.count || need_group) {
+                       if (!uctxt->tid_group_list.count)
+                               goto unlock;
+
+                       grp = tid_group_pop(&uctxt->tid_group_list);
+                       tid_group_add_tail(grp, &uctxt->tid_used_list);
+                       need_group = 0;
+               }
+               /*
+                * There is an optimization opportunity here - instead of
+                * fitting as many page sets as we can, check for a group
+                * later on in the list that could fit all of them.
+                */
+               list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
+                                        list) {
+                       unsigned use = min_t(unsigned, pageset_count - pageidx,
+                                            grp->size - grp->used);
+
+                       ret = program_rcvarray(fp, vaddr, grp, pagesets,
+                                              pageidx, use, pages, tidlist,
+                                              &tididx, &mapped);
+                       if (ret < 0) {
+                               hfi1_cdbg(TID,
+                                         "Failed to program RcvArray entries %d",
+                                         ret);
+                               ret = -EFAULT;
+                               goto unlock;
+                       } else if (ret > 0) {
+                               if (grp->used == grp->size)
+                                       tid_group_move(grp,
+                                                      &uctxt->tid_used_list,
+                                                      &uctxt->tid_full_list);
+                               pageidx += ret;
+                               mapped_pages += mapped;
+                               need_group = 0;
+                               /* Check if we are done so we break out early */
+                               if (pageidx >= pageset_count)
+                                       break;
+                       } else if (WARN_ON(ret == 0)) {
+                               /*
+                                * If ret is 0, we did not program any entries
+                                * into this group, which can only happen if
+                                * we've screwed up the accounting somewhere.
+                                * Warn and try to continue.
+                                */
+                               need_group = 1;
+                       }
+               }
+       }
+unlock:
+       mutex_unlock(&uctxt->exp_lock);
+finish:
+       hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
+                 mapped_pages, ret);
+       if (tididx) {
+               spin_lock(&fd->tid_lock);
+               fd->tid_used += tididx;
+               spin_unlock(&fd->tid_lock);
+               tinfo->tidcnt = tididx;
+               tinfo->length = mapped_pages * PAGE_SIZE;
+
+               if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist,
+                                tidlist, sizeof(tidlist[0]) * tididx)) {
+                       /*
+                        * On failure to copy to the user level, we need to undo
+                        * everything done so far so we don't leak resources.
+                        * The list lives in kernel memory, so it is undone
+                        * directly here; hfi1_user_exp_rcv_clear() reads its
+                        * list with copy_from_user() and cannot be given a
+                        * kernel address.
+                        * NOTE(review): this relies on unprogram_rcvarray()
+                        * also releasing the pages of the entry -- confirm.
+                        */
+                       undo_programmed_tids(fp, tidlist, tididx);
+                       tinfo->tidlist = 0;
+                       ret = -EFAULT;
+                       /*
+                        * Do not skip the release below: the pages that
+                        * were pinned but never mapped still have to go.
+                        */
+               }
+       }
+
+       /*
+        * If not everything was mapped (due to insufficient RcvArray entries,
+        * for example), unpin all unmapped pages so we can pin them next time.
+        */
+       if (mapped_pages != pinned) {
+               hfi1_release_user_pages(current->mm, &pages[mapped_pages],
+                                       pinned - mapped_pages,
+                                       false);
+               fd->tid_n_pinned -= pinned - mapped_pages;
+       }
+bail:
+       kfree(pagesets);
+       kfree(pages);
+       kfree(tidlist);
+       return ret > 0 ? 0 : ret;
+}
+
+/*
+ * Unprogram the RcvArray entries named by the user's TID list.
+ *
+ * tinfo->tidlist is a user-space address of tinfo->tidcnt TID info words.
+ * Entries are unprogrammed in order until one fails; on return
+ * tinfo->tidcnt holds the number of entries that were unprogrammed and
+ * fd->tid_used has been lowered by the same amount.
+ *
+ * Returns 0 on success, -EINVAL if more entries are named than this file
+ * descriptor has in use, -ENOMEM or -EFAULT if the list cannot be copied
+ * in, or the error reported by unprogram_rcvarray().
+ */
+int hfi1_user_exp_rcv_clear(struct file *fp, struct hfi1_tid_info *tinfo)
+{
+       int ret = 0;
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       u32 *tidinfo;
+       unsigned tididx;
+       bool too_many;
+
+       /*
+        * tidcnt comes straight from user space and sizes the allocation
+        * below. A process cannot clear more entries than it has in use,
+        * so reject such a request before allocating anything.
+        */
+       spin_lock(&fd->tid_lock);
+       too_many = tinfo->tidcnt > fd->tid_used;
+       spin_unlock(&fd->tid_lock);
+       if (too_many)
+               return -EINVAL;
+
+       tidinfo = kcalloc(tinfo->tidcnt, sizeof(*tidinfo), GFP_KERNEL);
+       if (!tidinfo)
+               return -ENOMEM;
+
+       if (copy_from_user(tidinfo, (void __user *)(unsigned long)
+                          tinfo->tidlist, sizeof(tidinfo[0]) *
+                          tinfo->tidcnt)) {
+               ret = -EFAULT;
+               goto done;
+       }
+
+       mutex_lock(&uctxt->exp_lock);
+       for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
+               ret = unprogram_rcvarray(fp, tidinfo[tididx], NULL);
+               if (ret) {
+                       hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
+                                 ret);
+                       break;
+               }
+       }
+       /* tididx is now the number of entries that were unprogrammed. */
+       spin_lock(&fd->tid_lock);
+       fd->tid_used -= tididx;
+       spin_unlock(&fd->tid_lock);
+       tinfo->tidcnt = tididx;
+       mutex_unlock(&uctxt->exp_lock);
+done:
+       kfree(tidinfo);
+       return ret;
+}
+
+/*
+ * Hand the accumulated list of invalidated TIDs to user space.
+ *
+ * The pending entries in fd->invalid_tids are copied to the user buffer
+ * at tinfo->tidlist, the pending list is reset, and tinfo->tidcnt is set
+ * to the number of entries returned (0 if none were pending).
+ *
+ * Returns 0 on success, -EINVAL if this file descriptor has no
+ * invalid-TID array, -ENOMEM if the bounce buffer cannot be allocated,
+ * or -EFAULT if the copy to user space fails.
+ */
+int hfi1_user_exp_rcv_invalid(struct file *fp, struct hfi1_tid_info *tinfo)
+{
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       /* Event word of this (context, sub-context) pair. */
+       unsigned long *ev = uctxt->dd->events +
+               (((uctxt->ctxt - uctxt->dd->first_user_ctxt) *
+                 HFI1_MAX_SHARED_CTXTS) + fd->subctxt);
+       u32 *array;
+       int ret = 0;
+
+       if (!fd->invalid_tids)
+               return -EINVAL;
+
+       /*
+        * copy_to_user() can sleep, which will leave the invalid_lock
+        * locked and cause the MMU notifier to be blocked on the lock
+        * for a long time.
+        * Copy the data to a local buffer so we can release the lock.
+        */
+       array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL);
+       if (!array)
+               return -ENOMEM; /* allocation failure, not a bad address */
+
+       spin_lock(&fd->invalid_lock);
+       if (fd->invalid_tid_idx) {
+               memcpy(array, fd->invalid_tids, sizeof(*array) *
+                      fd->invalid_tid_idx);
+               memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) *
+                      fd->invalid_tid_idx);
+               tinfo->tidcnt = fd->invalid_tid_idx;
+               fd->invalid_tid_idx = 0;
+               /*
+                * Reset the user flag while still holding the lock.
+                * Otherwise, PSM can miss events.
+                */
+               clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
+       } else {
+               tinfo->tidcnt = 0;
+       }
+       spin_unlock(&fd->invalid_lock);
+
+       if (tinfo->tidcnt) {
+               /*
+                * tidlist is an integer field; go through unsigned long,
+                * as the other copy sites in this file do, before turning
+                * it into a user pointer.
+                */
+               if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist,
+                                array, sizeof(*array) * tinfo->tidcnt))
+                       ret = -EFAULT;
+       }
+       kfree(array);
+
+       return ret;
+}
+
+/*
+ * Split an array of pages into sets of physically contiguous pages.
+ * @pages: array of struct page pointers describing the user buffer
+ * @npages: number of entries in @pages
+ * @list: output array; each entry receives the starting index into @pages
+ *        and the page count of one set
+ *
+ * Every set written is at most MAX_EXPECTED_BUFFER bytes long and its size
+ * in bytes is a power of two.
+ * NOTE(review): no bound on @list is checked here; presumably the caller
+ * sizes it for the worst case of one set per page - confirm.
+ *
+ * Return: number of entries written to @list (0 if @npages is 0).
+ */
+static u32 find_phys_blocks(struct page **pages, unsigned npages,
+                           struct tid_pageset *list)
+{
+       unsigned pagecount, pageidx, setcount = 0, i;
+       unsigned long pfn, this_pfn;
+
+       if (!npages)
+               return 0;
+
+       /*
+        * Look for sets of physically contiguous pages in the user buffer.
+        * This will allow us to optimize Expected RcvArray entry usage by
+        * using the bigger supported sizes.
+        */
+       pfn = page_to_pfn(pages[0]);
+       for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
+               /*
+                * The loop runs one step past the last page; this_pfn is 0
+                * on that step so that the final run gets flushed below.
+                */
+               this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;
+
+               /*
+                * If the pfn's are not sequential, pages are not physically
+                * contiguous.
+                */
+               if (this_pfn != ++pfn) {
+                       /*
+                        * At this point we have to loop over the set of
+                        * physically contiguous pages and break them down
+                        * into sizes supported by the HW.
+                        * There are two main constraints:
+                        *     1. The max buffer size is MAX_EXPECTED_BUFFER.
+                        *        If the total set size is bigger than that
+                        *        program only a MAX_EXPECTED_BUFFER chunk.
+                        *     2. The buffer size has to be a power of two. If
+                        *        it is not, round down to the closest power
+                        *        of 2 and program that size.
+                        */
+                       while (pagecount) {
+                               int maxpages = pagecount;
+                               u32 bufsize = pagecount * PAGE_SIZE;
+
+                               if (bufsize > MAX_EXPECTED_BUFFER)
+                                       maxpages =
+                                               MAX_EXPECTED_BUFFER >>
+                                               PAGE_SHIFT;
+                               else if (!is_power_of_2(bufsize))
+                                       maxpages =
+                                               rounddown_pow_of_two(bufsize) >>
+                                               PAGE_SHIFT;
+
+                               list[setcount].idx = pageidx;
+                               list[setcount].count = maxpages;
+                               pagecount -= maxpages;
+                               pageidx += maxpages;
+                               setcount++;
+                       }
+                       /* Start a new run at the current page. */
+                       pageidx = i;
+                       pagecount = 1;
+                       pfn = this_pfn;
+               } else {
+                       pagecount++;
+               }
+       }
+       return setcount;
+}
+
+/**
+ * program_rcvarray() - program an RcvArray group with receive buffers
+ * @fp: file pointer
+ * @vaddr: starting user virtual address
+ * @grp: RcvArray group
+ * @sets: array of struct tid_pageset holding information on physically
+ *        contiguous chunks from the user buffer
+ * @start: starting index into sets array
+ * @count: number of struct tid_pageset's to program
+ * @pages: an array of struct page * for the user buffer
+ * @tidlist: the array of u32 elements where the information about the
+ *           programmed RcvArray entries is to be encoded.
+ * @tididx: starting offset into tidlist; advanced by one for every entry
+ *          programmed
+ * @pmapped: (output parameter) number of pages programmed into the RcvArray
+ *           entries. Only written when the function does not return an
+ *           error.
+ *
+ * This function will program up to 'count' number of RcvArray entries from the
+ * group 'grp'. To make best use of write-combining writes, the function will
+ * perform writes to the unused RcvArray entries which will be ignored by the
+ * HW. Each RcvArray entry will be programmed with a physically contiguous
+ * buffer chunk from the user's virtual buffer.
+ *
+ * If set_rcvarray_entry() fails, the function returns immediately; entries
+ * programmed earlier in the same call stay programmed and accounted for in
+ * 'grp'.
+ *
+ * Return:
+ * -EINVAL if the requested count is larger than the size of the group,
+ * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
+ * number of RcvArray entries programmed (which is less than 'count' if the
+ * group runs out of free entries first).
+ */
+static int program_rcvarray(struct file *fp, unsigned long vaddr,
+                           struct tid_group *grp,
+                           struct tid_pageset *sets,
+                           unsigned start, u16 count, struct page **pages,
+                           u32 *tidlist, unsigned *tididx, unsigned *pmapped)
+{
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct hfi1_devdata *dd = uctxt->dd;
+       u16 idx;
+       u32 tidinfo = 0, rcventry, useidx = 0;
+       int mapped = 0;
+
+       /* Count should never be larger than the group size */
+       if (count > grp->size)
+               return -EINVAL;
+
+       /* Find the first unused entry in the group */
+       for (idx = 0; idx < grp->size; idx++) {
+               if (!(grp->map & (1 << idx))) {
+                       useidx = idx;
+                       break;
+               }
+               /* Entry is in use: issue a fill write for it. */
+               rcv_array_wc_fill(dd, grp->base + idx);
+       }
+
+       /* From here on idx counts the sets programmed so far. */
+       idx = 0;
+       while (idx < count) {
+               u16 npages, pageidx, setidx = start + idx;
+               int ret = 0;
+
+               /*
+                * If this entry in the group is used, move to the next one.
+                * If we go past the end of the group, exit the loop.
+                */
+               if (useidx >= grp->size) {
+                       break;
+               } else if (grp->map & (1 << useidx)) {
+                       rcv_array_wc_fill(dd, grp->base + useidx);
+                       useidx++;
+                       continue;
+               }
+
+               rcventry = grp->base + useidx;
+               npages = sets[setidx].count;
+               pageidx = sets[setidx].idx;
+
+               ret = set_rcvarray_entry(fp, vaddr + (pageidx * PAGE_SIZE),
+                                        rcventry, grp, pages + pageidx,
+                                        npages);
+               if (ret)
+                       return ret;
+               mapped += npages;
+
+               /* Encode the context-relative entry and its length. */
+               tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) |
+                       EXP_TID_SET(LEN, npages);
+               tidlist[(*tididx)++] = tidinfo;
+               grp->used++;
+               grp->map |= 1 << useidx++;
+               idx++;
+       }
+
+       /* Fill the rest of the group with "blank" writes */
+       for (; useidx < grp->size; useidx++)
+               rcv_array_wc_fill(dd, grp->base + useidx);
+       *pmapped = mapped;
+       return idx;
+}
+
+/*
+ * Program a single RcvArray entry with a physically contiguous buffer.
+ *
+ * A tid_rb_node describing the buffer is allocated, the pages are DMA
+ * mapped, and the node is inserted through mmu_rb_insert() or
+ * hfi1_mmu_rb_insert() depending on the TID_UNMAP capability. Only after
+ * all of that has succeeded is the entry written with hfi1_put_tid().
+ *
+ * Return: 0 on success, -ENOMEM if the node cannot be allocated, or -EFAULT
+ * if the DMA mapping or the node insertion fails.
+ */
+static int set_rcvarray_entry(struct file *fp, unsigned long vaddr,
+                             u32 rcventry, struct tid_group *grp,
+                             struct page **pages, unsigned npages)
+{
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct hfi1_devdata *dd = uctxt->dd;
+       struct rb_root *root = &fd->tid_rb_root;
+       struct tid_rb_node *node;
+       dma_addr_t phys;
+       int ret;
+
+       /*
+        * The node is allocated up front so that an allocation failure is
+        * handled before anything has been mapped or programmed.
+        */
+       node = kzalloc(sizeof(*node) + (sizeof(struct page *) * npages),
+                      GFP_KERNEL);
+       if (!node)
+               return -ENOMEM;
+
+       phys = pci_map_single(dd->pcidev,
+                             __va(page_to_phys(pages[0])),
+                             npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
+       if (dma_mapping_error(&dd->pcidev->dev, phys)) {
+               dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
+                          phys);
+               goto free_node;
+       }
+
+       node->mmu.addr = vaddr;
+       node->mmu.len = npages * PAGE_SIZE;
+       node->phys = page_to_phys(pages[0]);
+       node->npages = npages;
+       node->rcventry = rcventry;
+       node->dma_addr = phys;
+       node->grp = grp;
+       node->freed = false;
+       memcpy(node->pages, pages, sizeof(struct page *) * npages);
+
+       ret = HFI1_CAP_IS_USET(TID_UNMAP) ?
+               mmu_rb_insert(root, &node->mmu) :
+               hfi1_mmu_rb_insert(root, &node->mmu);
+       if (ret) {
+               hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
+                         node->rcventry, node->mmu.addr, node->phys, ret);
+               pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
+                                PCI_DMA_FROMDEVICE);
+               goto free_node;
+       }
+
+       hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
+       trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
+                              node->mmu.addr, node->phys, phys);
+       return 0;
+
+free_node:
+       /* Both failure paths after the allocation report -EFAULT. */
+       kfree(node);
+       return -EFAULT;
+}
+
+/*
+ * Unprogram the RcvArray entry described by a user-supplied TID info word.
+ * @fp: file pointer
+ * @tidinfo: encoded TID; IDX selects a pair of entries and CTRL selects the
+ *           entry within the pair (0x1 for the first, 0x2 for the second)
+ * @grp: if not NULL, receives the group the entry belonged to
+ *
+ * tidinfo comes from user space, so every field is validated before it is
+ * used to index fd->entry_to_rb.
+ *
+ * Return: 0 on success, -EINVAL if the IDX or CTRL field is out of range,
+ * or -EBADF if no matching node is registered for the entry.
+ */
+static int unprogram_rcvarray(struct file *fp, u32 tidinfo,
+                             struct tid_group **grp)
+{
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct hfi1_devdata *dd = uctxt->dd;
+       struct tid_rb_node *node;
+       u8 tidctrl = EXP_TID_GET(tidinfo, CTRL);
+       u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;
+
+       if (tididx >= uctxt->expected_count) {
+               dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
+                          tididx, uctxt->ctxt);
+               return -EINVAL;
+       }
+
+       /*
+        * Only 0x1 and 0x2 name a single entry. A value of 0x0 would make
+        * (tidctrl - 1) negative and wrap the index computed below.
+        */
+       if (tidctrl == 0x0 || tidctrl == 0x3)
+               return -EINVAL;
+
+       rcventry = tididx + (tidctrl - 1);
+
+       /* The second entry of the last pair may still be out of range. */
+       if (rcventry >= uctxt->expected_count) {
+               dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
+                          rcventry, uctxt->ctxt);
+               return -EINVAL;
+       }
+
+       node = fd->entry_to_rb[rcventry];
+       if (!node || node->rcventry != (uctxt->expected_base + rcventry))
+               return -EBADF;
+       if (HFI1_CAP_IS_USET(TID_UNMAP))
+               mmu_rb_remove(&fd->tid_rb_root, &node->mmu, NULL);
+       else
+               hfi1_mmu_rb_remove(&fd->tid_rb_root, &node->mmu);
+
+       /* Read the group before clear_tid_node() frees the node. */
+       if (grp)
+               *grp = node->grp;
+       clear_tid_node(fd, fd->subctxt, node);
+       return 0;
+}
+
+/*
+ * Tear down one programmed RcvArray entry and free its tracking node.
+ *
+ * The entry is invalidated in the device, the buffer is DMA unmapped and
+ * its pages released, the owning group's usage count and bitmap are
+ * updated, and the group is moved between the context's full/used/free
+ * lists as its occupancy changes. @node is freed on return.
+ * The @subctxt argument is not used by this function.
+ */
+static void clear_tid_node(struct hfi1_filedata *fd, u16 subctxt,
+                          struct tid_rb_node *node)
+{
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct hfi1_devdata *dd = uctxt->dd;
+       struct tid_group *grp = node->grp;
+
+       trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
+                                node->npages, node->mmu.addr, node->phys,
+                                node->dma_addr);
+
+       hfi1_put_tid(dd, node->rcventry, PT_INVALID, 0, 0);
+       /*
+        * The invalidating write has to reach the device before the pages
+        * are unpinned.
+        */
+       flush_wc();
+
+       pci_unmap_single(dd->pcidev, node->dma_addr, node->mmu.len,
+                        PCI_DMA_FROMDEVICE);
+       hfi1_release_user_pages(current->mm, node->pages, node->npages, true);
+       fd->tid_n_pinned -= node->npages;
+
+       grp->used--;
+       grp->map &= ~(1 << (node->rcventry - grp->base));
+
+       if (grp->used == grp->size - 1) {
+               /* The group was full until now. */
+               tid_group_move(grp, &uctxt->tid_full_list,
+                              &uctxt->tid_used_list);
+       } else if (!grp->used) {
+               /* The last entry of the group has been released. */
+               tid_group_move(grp, &uctxt->tid_used_list,
+                              &uctxt->tid_group_list);
+       }
+       kfree(node);
+}
+
+/*
+ * Release every programmed RcvArray entry of every group on @set.
+ *
+ * Each group is unlinked from the set's list, and every entry marked in the
+ * group's bitmap that has a matching node in the file's entry_to_rb table is
+ * removed from the tracking structure and cleared with clear_tid_node().
+ * @root must be the tid_rb_root embedded in a struct hfi1_filedata.
+ */
+static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
+                           struct exp_tid_set *set, struct rb_root *root)
+{
+       struct hfi1_filedata *fd = container_of(root, struct hfi1_filedata,
+                                               tid_rb_root);
+       struct tid_group *grp, *next;
+       int i;
+
+       list_for_each_entry_safe(grp, next, &set->list, list) {
+               list_del_init(&grp->list);
+
+               for (i = 0; i < grp->size; i++) {
+                       struct tid_rb_node *node;
+                       u16 rcventry;
+
+                       /* Skip entries that are not programmed. */
+                       if (!(grp->map & (1 << i)))
+                               continue;
+
+                       rcventry = grp->base + i;
+                       node = fd->entry_to_rb[rcventry -
+                                                 uctxt->expected_base];
+                       if (!node || node->rcventry != rcventry)
+                               continue;
+
+                       if (HFI1_CAP_IS_USET(TID_UNMAP))
+                               mmu_rb_remove(&fd->tid_rb_root,
+                                             &node->mmu, NULL);
+                       else
+                               hfi1_mmu_rb_remove(&fd->tid_rb_root,
+                                                  &node->mmu);
+                       clear_tid_node(fd, -1, node);
+               }
+       }
+}
+
+/*
+ * Record that the buffer behind @mnode has been invalidated.
+ *
+ * The node is marked freed and, if there is room, its encoded TID is
+ * appended to the file's invalid_tids list. When the list goes from empty
+ * to non-empty, the _HFI1_EVENT_TID_MMU_NOTIFY_BIT event is raised for this
+ * sub-context only. A node that is already marked freed is ignored.
+ *
+ * Return: always 0.
+ */
+static int mmu_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode)
+{
+       struct tid_rb_node *node =
+               container_of(mnode, struct tid_rb_node, mmu);
+       struct hfi1_filedata *fdata =
+               container_of(root, struct hfi1_filedata, tid_rb_root);
+       struct hfi1_ctxtdata *uctxt = fdata->uctxt;
+
+       if (node->freed)
+               return 0;
+
+       trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt, node->mmu.addr,
+                                node->rcventry, node->npages, node->dma_addr);
+       node->freed = true;
+
+       spin_lock(&fdata->invalid_lock);
+       /* Nothing is recorded once the list is full. */
+       if (fdata->invalid_tid_idx >= uctxt->expected_count)
+               goto unlock;
+
+       fdata->invalid_tids[fdata->invalid_tid_idx] =
+               rcventry2tidinfo(node->rcventry - uctxt->expected_base) |
+               EXP_TID_SET(LEN, node->npages);
+       if (!fdata->invalid_tid_idx) {
+               /*
+                * hfi1_set_uevent_bits() would raise the flag for all
+                * processes. TID cache invalidations are handled per
+                * process and calling into the driver to process them is
+                * expensive, so set the flag only for the process in
+                * question.
+                */
+               unsigned long *ev = uctxt->dd->events +
+                       (((uctxt->ctxt - uctxt->dd->first_user_ctxt) *
+                         HFI1_MAX_SHARED_CTXTS) + fdata->subctxt);
+
+               set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
+       }
+       fdata->invalid_tid_idx++;
+unlock:
+       spin_unlock(&fdata->invalid_lock);
+       return 0;
+}
+
+/*
+ * Register @node in the owning file's entry_to_rb lookup table, indexed by
+ * its RcvArray entry relative to the context's expected_base.
+ *
+ * Return: always 0.
+ */
+static int mmu_rb_insert(struct rb_root *root, struct mmu_rb_node *node)
+{
+       struct tid_rb_node *tnode =
+               container_of(node, struct tid_rb_node, mmu);
+       struct hfi1_filedata *fdata =
+               container_of(root, struct hfi1_filedata, tid_rb_root);
+       u32 first_entry = fdata->uctxt->expected_base;
+       u32 slot = tnode->rcventry - first_entry;
+
+       fdata->entry_to_rb[slot] = tnode;
+       return 0;
+}
+
+/*
+ * Drop @node from the owning file's entry_to_rb lookup table by clearing
+ * the slot for its RcvArray entry. The @mm argument is not used.
+ */
+static void mmu_rb_remove(struct rb_root *root, struct mmu_rb_node *node,
+                         struct mm_struct *mm)
+{
+       struct tid_rb_node *tnode =
+               container_of(node, struct tid_rb_node, mmu);
+       struct hfi1_filedata *fdata =
+               container_of(root, struct hfi1_filedata, tid_rb_root);
+       u32 first_entry = fdata->uctxt->expected_base;
+       u32 slot = tnode->rcventry - first_entry;
+
+       fdata->entry_to_rb[slot] = NULL;
+}
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.h b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
new file mode 100644 (file)
index 0000000..9bc8d9f
--- /dev/null
@@ -0,0 +1,79 @@
+#ifndef _HFI1_USER_EXP_RCV_H
+#define _HFI1_USER_EXP_RCV_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "hfi.h"
+
+/*
+ * Field layout of an encoded TID info word, as given by the masks and
+ * shifts below:
+ *   LEN  - bits 0..10  (11 bits)
+ *   CTRL - bits 20..21 (2 bits)
+ *   IDX  - bits 22..31 (10 bits)
+ */
+#define EXP_TID_TIDLEN_MASK   0x7FFULL
+#define EXP_TID_TIDLEN_SHIFT  0
+#define EXP_TID_TIDCTRL_MASK  0x3ULL
+#define EXP_TID_TIDCTRL_SHIFT 20
+#define EXP_TID_TIDIDX_MASK   0x3FFULL
+#define EXP_TID_TIDIDX_SHIFT  22
+/* Extract 'field' (LEN, CTRL or IDX) from the encoded value 'tid'. */
+#define EXP_TID_GET(tid, field)        \
+       (((tid) >> EXP_TID_TID##field##_SHIFT) & EXP_TID_TID##field##_MASK)
+
+/* Build the bits for 'field' from 'value'; the result can be OR-ed in. */
+#define EXP_TID_SET(field, value)                      \
+       (((value) & EXP_TID_TID##field##_MASK) <<       \
+        EXP_TID_TID##field##_SHIFT)
+/* Zero 'field' in 'tid' in place; 'tid' must be an lvalue. */
+#define EXP_TID_CLEAR(tid, field) ({                                   \
+               (tid) &= ~(EXP_TID_TID##field##_MASK <<                 \
+                          EXP_TID_TID##field##_SHIFT);                 \
+               })
+/* Replace 'field' in 'tid' with 'value' in place. */
+#define EXP_TID_RESET(tid, field, value) do {                          \
+               EXP_TID_CLEAR(tid, field);                              \
+               (tid) |= EXP_TID_SET(field, (value));                   \
+       } while (0)
+
+/* Entry points of the user expected receive (TID) interface. */
+int hfi1_user_exp_rcv_init(struct file *);
+int hfi1_user_exp_rcv_free(struct hfi1_filedata *);
+int hfi1_user_exp_rcv_setup(struct file *, struct hfi1_tid_info *);
+int hfi1_user_exp_rcv_clear(struct file *, struct hfi1_tid_info *);
+int hfi1_user_exp_rcv_invalid(struct file *, struct hfi1_tid_info *);
+
+#endif /* _HFI1_USER_EXP_RCV_H */
diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c
new file mode 100644 (file)
index 0000000..88e10b5
--- /dev/null
@@ -0,0 +1,135 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/device.h>
+#include <linux/module.h>
+
+#include "hfi.h"
+
+static unsigned long cache_size = 256;
+module_param(cache_size, ulong, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(cache_size, "Send and receive side cache size limit (in MB)");
+
+/*
+ * Determine whether the caller can pin pages.
+ * @dd: device data, used to obtain the number of user contexts
+ * @nlocked: number of pages the calling cache already has pinned
+ * @npages: number of additional pages the caller wants to pin
+ *
+ * This function should be used in the implementation of buffer caches.
+ * The cache implementation should call this function prior to attempting
+ * to pin buffer pages in order to determine whether they should do so.
+ * The function computes cache limits based on the configured ulimit and
+ * cache size. Use of this function is especially important for caches
+ * which are not limited in any other way (e.g. by HW resources) and, thus,
+ * could keep caching buffers.
+ *
+ * Return: true if the pages may be pinned, false otherwise. A caller with
+ * CAP_IPC_LOCK is always allowed.
+ */
+bool hfi1_can_pin_pages(struct hfi1_devdata *dd, u32 nlocked, u32 npages)
+{
+       unsigned long ulimit = rlimit(RLIMIT_MEMLOCK), pinned, cache_limit,
+               size = (cache_size * (1UL << 20)); /* convert to bytes */
+       unsigned usr_ctxts = dd->num_rcv_contexts - dd->first_user_ctxt;
+       bool can_lock = capable(CAP_IPC_LOCK);
+
+       /*
+        * Calculate per-cache size. The calculation below uses only a quarter
+        * of the available per-context limit. This leaves space for other
+        * pinning. Should we worry about shared ctxts?
+        */
+       cache_limit = (ulimit / usr_ctxts) / 4;
+
+       /* If ulimit isn't set to "unlimited" and is smaller than cache_size. */
+       if (ulimit != (-1UL) && size > cache_limit)
+               size = cache_limit;
+
+       /* Convert to number of pages */
+       size = DIV_ROUND_UP(size, PAGE_SIZE);
+
+       down_read(&current->mm->mmap_sem);
+       pinned = current->mm->pinned_vm;
+       up_read(&current->mm->mmap_sem);
+
+       /*
+        * First, check the absolute limit against all pinned pages.
+        * RLIMIT_MEMLOCK is expressed in bytes while pinned_vm and npages
+        * count pages, so convert the limit to pages before comparing.
+        */
+       if (pinned + npages >= (ulimit >> PAGE_SHIFT) && !can_lock)
+               return false;
+
+       return ((nlocked + npages) <= size) || can_lock;
+}
+
+/*
+ * Pin user pages starting at @vaddr and charge them to the current mm.
+ *
+ * Return: the value returned by get_user_pages_fast(). When it is not
+ * negative, current->mm->pinned_vm has been increased by that amount.
+ */
+int hfi1_acquire_user_pages(unsigned long vaddr, size_t npages, bool writable,
+                           struct page **pages)
+{
+       int pinned = get_user_pages_fast(vaddr, npages, writable, pages);
+
+       if (pinned >= 0) {
+               down_write(&current->mm->mmap_sem);
+               current->mm->pinned_vm += pinned;
+               up_write(&current->mm->mmap_sem);
+       }
+
+       return pinned;
+}
+
+/*
+ * Drop the references on @npages pages in @p, marking each page dirty
+ * first when @dirty is set, and subtract them from @mm's pinned_vm count
+ * when @mm is not NULL.
+ */
+void hfi1_release_user_pages(struct mm_struct *mm, struct page **p,
+                            size_t npages, bool dirty)
+{
+       struct page **cur, **end = p + npages;
+
+       for (cur = p; cur < end; cur++) {
+               if (dirty)
+                       set_page_dirty_lock(*cur);
+               put_page(*cur);
+       }
+
+       /* During close after signal, mm can be NULL. */
+       if (!mm)
+               return;
+
+       down_write(&mm->mmap_sem);
+       mm->pinned_vm -= npages;
+       up_write(&mm->mmap_sem);
+}
diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c
new file mode 100644 (file)
index 0000000..29f4795
--- /dev/null
@@ -0,0 +1,1625 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/dmapool.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/uio.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/mmu_context.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+
+#include "hfi.h"
+#include "sdma.h"
+#include "user_sdma.h"
+#include "verbs.h"  /* for the headers */
+#include "common.h" /* for struct hfi1_tid_info */
+#include "trace.h"
+#include "mmu_rb.h"
+
+/*
+ * Number of entries in each user SDMA completion ring. Exposed read-only
+ * (S_IRUGO) as the "sdma_comp_size" module parameter. The same value sizes
+ * the per-queue request array in hfi1_user_sdma_alloc_queues().
+ */
+static uint hfi1_sdma_comp_ring_size = 128;
+module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
+MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");
+
+/* The maximum number of Data io vectors per message/request */
+#define MAX_VECTORS_PER_REQ 8
+/*
+ * Maximum number of packets to send from each message/request
+ * before moving to the next one.
+ */
+#define MAX_PKTS_PER_QUEUE 16
+
+/*
+ * One plus the page index of byte (x - 1), i.e. the page count for a
+ * length of x bytes. The expression is only meaningful for non-zero x.
+ */
+#define num_pages(x) (1 + ((((x) - 1) & PAGE_MASK) >> PAGE_SHIFT))
+
+/*
+ * Field extractors for the control word of struct sdma_req_info
+ * (callers in this file pass info.ctrl).
+ *
+ * NOTE(review): req_version() masks with HFI1_SDMA_REQ_OPCODE_MASK rather
+ * than a VERSION mask. This looks like a copy/paste slip and is harmless
+ * only if both masks have the same value - confirm against the header
+ * that defines the HFI1_SDMA_REQ_* constants.
+ */
+#define req_opcode(x) \
+       (((x) >> HFI1_SDMA_REQ_OPCODE_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK)
+#define req_version(x) \
+       (((x) >> HFI1_SDMA_REQ_VERSION_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK)
+#define req_iovcnt(x) \
+       (((x) >> HFI1_SDMA_REQ_IOVCNT_SHIFT) & HFI1_SDMA_REQ_IOVCNT_MASK)
+
+/* Number of BTH.PSN bits used for sequence number in expected rcvs */
+#define BTH_SEQ_MASK 0x7ffull
+
+/*
+ * Define fields in the KDETH header so we can update the header
+ * template.
+ */
+#define KDETH_OFFSET_SHIFT        0
+#define KDETH_OFFSET_MASK         0x7fff
+#define KDETH_OM_SHIFT            15
+#define KDETH_OM_MASK             0x1
+#define KDETH_TID_SHIFT           16
+#define KDETH_TID_MASK            0x3ff
+#define KDETH_TIDCTRL_SHIFT       26
+#define KDETH_TIDCTRL_MASK        0x3
+#define KDETH_INTR_SHIFT          28
+#define KDETH_INTR_MASK           0x1
+#define KDETH_SH_SHIFT            29
+#define KDETH_SH_MASK             0x1
+#define KDETH_HCRC_UPPER_SHIFT    16
+#define KDETH_HCRC_UPPER_MASK     0xff
+#define KDETH_HCRC_LOWER_SHIFT    24
+#define KDETH_HCRC_LOWER_MASK     0xff
+
+#define PBC2LRH(x) ((((x) & 0xfff) << 2) - 4)
+#define LRH2PBC(x) ((((x) >> 2) + 1) & 0xfff)
+
+/*
+ * KDETH_GET() extracts the named field (OFFSET, OM, TID, ...) from a
+ * little-endian 32-bit header word and returns it in CPU byte order.
+ *
+ * KDETH_SET() rewrites only that field inside the little-endian lvalue
+ * @dw: the word is converted to CPU order, the field bits are cleared,
+ * the masked @val is inserted, and the result is stored back as
+ * little-endian. @dw is evaluated more than once.
+ */
+#define KDETH_GET(val, field)                                          \
+       (((le32_to_cpu((val))) >> KDETH_##field##_SHIFT) & KDETH_##field##_MASK)
+#define KDETH_SET(dw, field, val) do {                                 \
+               u32 dwval = le32_to_cpu(dw);                            \
+               dwval &= ~(KDETH_##field##_MASK << KDETH_##field##_SHIFT); \
+               dwval |= (((val) & KDETH_##field##_MASK) << \
+                         KDETH_##field##_SHIFT);                       \
+               dw = cpu_to_le32(dwval);                                \
+       } while (0)
+
+/*
+ * Append one AHG descriptor, built by sdma_build_ahg_descriptor() from
+ * (value, dw, bit, width), at arr[idx] and post-increment idx.
+ *
+ * WARNING: hidden control flow - when idx is already past the end of
+ * arr, the macro executes "return -ERANGE" in the *calling* function.
+ * It may therefore only be used inside functions returning int, and
+ * arr must be a true array (ARRAY_SIZE() is applied to it).
+ */
+#define AHG_HEADER_SET(arr, idx, dw, bit, width, value)                        \
+       do {                                                            \
+               if ((idx) < ARRAY_SIZE((arr)))                          \
+                       (arr)[(idx++)] = sdma_build_ahg_descriptor(     \
+                               (__force u16)(value), (dw), (bit),      \
+                                                       (width));       \
+               else                                                    \
+                       return -ERANGE;                                 \
+       } while (0)
+
+/* KDETH OM multipliers and switch over point */
+#define KDETH_OM_SMALL     4
+#define KDETH_OM_LARGE     64
+#define KDETH_OM_MAX_SIZE  (1 << ((KDETH_OM_LARGE / KDETH_OM_SMALL) + 1))
+
+/* Last packet in the request */
+#define TXREQ_FLAGS_REQ_LAST_PKT BIT(0)
+
+/*
+ * Bit numbers (not masks) for user_sdma_request.flags; they are used
+ * with set_bit()/test_bit() in this file.
+ */
+#define SDMA_REQ_IN_USE     0
+#define SDMA_REQ_FOR_THREAD 1
+#define SDMA_REQ_SEND_DONE  2
+#define SDMA_REQ_HAVE_AHG   3
+#define SDMA_REQ_HAS_ERROR  4
+#define SDMA_REQ_DONE_ERROR 5
+
+/* Values stored in hfi1_user_sdma_pkt_q.state */
+#define SDMA_PKT_Q_INACTIVE BIT(0)
+#define SDMA_PKT_Q_ACTIVE   BIT(1)
+#define SDMA_PKT_Q_DEFERRED BIT(2)
+
+/*
+ * Maximum retry attempts to submit a TX request
+ * before putting the process to sleep.
+ */
+#define MAX_DEFER_RETRY_COUNT 1
+
+/*
+ * Number of packets hfi1_user_sdma_process_request() asks
+ * user_sdma_send_pkts() to submit per call (capped at the request's
+ * npkts).
+ */
+static unsigned initial_pkt_count = 8;
+
+#define SDMA_IOWAIT_TIMEOUT 1000 /* in milliseconds */
+
+struct sdma_mmu_node;
+
+/*
+ * Per-request bookkeeping for one user data vector. The iovec is copied
+ * from the caller's array in hfi1_user_sdma_process_request() and then
+ * handed to pin_vector_pages().
+ */
+struct user_sdma_iovec {
+       struct list_head list;
+       struct iovec iov;
+       /* number of pages in this vector */
+       unsigned npages;
+       /* array of pinned pages for this vector */
+       struct page **pages;
+       /*
+        * offset into the virtual address space of the vector at
+        * which we last left off.
+        */
+       u64 offset;
+       /* NOTE(review): presumably the cache node backing @pages - confirm */
+       struct sdma_mmu_node *node;
+};
+
+/* Flag bit for sdma_mmu_node.flags */
+#define SDMA_CACHE_NODE_EVICT BIT(0)
+
+/*
+ * Node embedding a struct mmu_rb_node together with a page array and a
+ * reference count.
+ * NOTE(review): presumably one cached, pinned user buffer tracked in the
+ * packet queue's sdma_rb_root and linked on its evict list - the users of
+ * this struct are outside this excerpt, confirm there.
+ */
+struct sdma_mmu_node {
+       struct mmu_rb_node rb;
+       struct list_head list;
+       struct hfi1_user_sdma_pkt_q *pq;
+       atomic_t refcount;
+       struct page **pages;
+       unsigned npages;
+       unsigned long flags;
+};
+
+/*
+ * State of one user SDMA request. Requests live in the pq->reqs array
+ * and are selected by the completion index supplied in the request info
+ * (info.comp_idx).
+ */
+struct user_sdma_request {
+       struct sdma_req_info info;
+       struct hfi1_user_sdma_pkt_q *pq;
+       struct hfi1_user_sdma_comp_q *cq;
+       /* This is the original header from user space */
+       struct hfi1_pkt_header hdr;
+       /*
+        * Pointer to the SDMA engine for this request.
+        * Since different requests could be on different VLs,
+        * each request will need its own engine pointer.
+        */
+       struct sdma_engine *sde;
+       u8 ahg_idx;
+       u32 ahg[9];
+       /*
+        * KDETH.Offset (Eager) field
+        * We need to remember the initial value so the headers
+        * can be updated properly.
+        */
+       u32 koffset;
+       /*
+        * KDETH.OFFSET (TID) field
+        * The offset can cover multiple packets, depending on the
+        * size of the TID entry.
+        */
+       u32 tidoffset;
+       /*
+        * KDETH.OM
+        * Remember this because the header template always sets it
+        * to 0.
+        */
+       u8 omfactor;
+       /*
+        * We copy the iovs for this request (based on
+        * info.iovcnt). These are only the data vectors
+        */
+       unsigned data_iovs;
+       /* total length of the data in the request */
+       u32 data_len;
+       /* progress index moving along the iovs array */
+       unsigned iov_idx;
+       struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ];
+       /* number of elements copied to the tids array */
+       u16 n_tids;
+       /* TID array values copied from the tid_iov vector */
+       u32 *tids;
+       u16 tididx;
+       u32 sent;
+       u64 seqnum;
+       u64 seqcomp;
+       u64 seqsubmitted;
+       struct list_head txps;
+       /* SDMA_REQ_* bit numbers, manipulated with set_bit()/test_bit() */
+       unsigned long flags;
+       /* status of the last txreq completed */
+       int status;
+};
+
+/*
+ * A single txreq could span up to 3 physical pages when the MTU
+ * is sufficiently large (> 4K). Each of the IOV pointers also
+ * needs its own set of flags so that the vectors can be handled
+ * independently of each other.
+ *
+ * Objects of this type are allocated from the per-queue txreq_cache.
+ */
+struct user_sdma_txreq {
+       /* Packet header for the txreq */
+       struct hfi1_pkt_header hdr;
+       struct sdma_txreq txreq;
+       struct list_head list;
+       struct user_sdma_request *req;
+       /* TXREQ_FLAGS_* */
+       u16 flags;
+       /* retry counter consulted by defer_packet_queue() */
+       unsigned busycount;
+       u64 seqnum;
+};
+
+/*
+ * Debug helpers built on hfi1_cdbg(SDMA, ...).
+ * SDMA_DBG() prefixes the message with [unit:ctxt:subctxt:comp_idx] taken
+ * from a request; it dereferences (req)->pq, so req->pq must already be
+ * set. SDMA_Q_DBG() prefixes with [unit:ctxt:subctxt] taken from a packet
+ * queue.
+ */
+#define SDMA_DBG(req, fmt, ...)                                     \
+       hfi1_cdbg(SDMA, "[%u:%u:%u:%u] " fmt, (req)->pq->dd->unit, \
+                (req)->pq->ctxt, (req)->pq->subctxt, (req)->info.comp_idx, \
+                ##__VA_ARGS__)
+#define SDMA_Q_DBG(pq, fmt, ...)                        \
+       hfi1_cdbg(SDMA, "[%u:%u:%u] " fmt, (pq)->dd->unit, (pq)->ctxt, \
+                (pq)->subctxt, ##__VA_ARGS__)
+
+/* Forward declarations of the file-local helpers defined further down. */
+static int user_sdma_send_pkts(struct user_sdma_request *, unsigned);
+static int num_user_pages(const struct iovec *);
+static void user_sdma_txreq_cb(struct sdma_txreq *, int);
+static inline void pq_update(struct hfi1_user_sdma_pkt_q *);
+static void user_sdma_free_request(struct user_sdma_request *, bool);
+static int pin_vector_pages(struct user_sdma_request *,
+                           struct user_sdma_iovec *);
+static void unpin_vector_pages(struct mm_struct *, struct page **, unsigned,
+                              unsigned);
+static int check_header_template(struct user_sdma_request *,
+                                struct hfi1_pkt_header *, u32, u32);
+static int set_txreq_header(struct user_sdma_request *,
+                           struct user_sdma_txreq *, u32);
+static int set_txreq_header_ahg(struct user_sdma_request *,
+                               struct user_sdma_txreq *, u32);
+static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *,
+                                 struct hfi1_user_sdma_comp_q *,
+                                 u16, enum hfi1_sdma_comp_state, int);
+static inline u32 set_pkt_bth_psn(__be32, u8, u32);
+static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);
+
+/* iowait callbacks, installed with iowait_init() when a queue is set up */
+static int defer_packet_queue(
+       struct sdma_engine *,
+       struct iowait *,
+       struct sdma_txreq *,
+       unsigned seq);
+static void activate_packet_queue(struct iowait *, int);
+/* MMU rb-tree callbacks collected in sdma_rb_ops below */
+static bool sdma_rb_filter(struct mmu_rb_node *, unsigned long, unsigned long);
+static int sdma_rb_insert(struct rb_root *, struct mmu_rb_node *);
+static void sdma_rb_remove(struct rb_root *, struct mmu_rb_node *,
+                          struct mm_struct *);
+static int sdma_rb_invalidate(struct rb_root *, struct mmu_rb_node *);
+
+/*
+ * Callback table passed to hfi1_mmu_rb_register() together with each
+ * packet queue's sdma_rb_root.
+ */
+static struct mmu_rb_ops sdma_rb_ops = {
+       .filter = sdma_rb_filter,
+       .insert = sdma_rb_insert,
+       .remove = sdma_rb_remove,
+       .invalidate = sdma_rb_invalidate
+};
+
+/*
+ * iowait "sleep" callback for a user SDMA packet queue.
+ *
+ * If sdma_progress() reports true for @sde and the txreq has not yet used
+ * up its MAX_DEFER_RETRY_COUNT retries, ask the caller to retry right away
+ * (-EAGAIN). Otherwise mark the queue DEFERRED, park it on the engine's
+ * dmawait list (unless it is already on a list) and return -EBUSY.
+ */
+static int defer_packet_queue(
+       struct sdma_engine *sde,
+       struct iowait *wait,
+       struct sdma_txreq *txreq,
+       unsigned seq)
+{
+       struct user_sdma_txreq *utx =
+               container_of(txreq, struct user_sdma_txreq, txreq);
+       struct hfi1_user_sdma_pkt_q *queue =
+               container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
+       struct hfi1_ibdev *ibdev = &queue->dd->verbs_dev;
+
+       /* The retry counter is only consumed when progress was reported. */
+       if (sdma_progress(sde, seq, txreq) &&
+           utx->busycount++ < MAX_DEFER_RETRY_COUNT)
+               return -EAGAIN;
+
+       /*
+        * A queue that is already linked somewhere can only be on the
+        * dmawait list, as that is the sole place it gets enqueued, so
+        * it is added only when its list head is empty.
+        */
+       xchg(&queue->state, SDMA_PKT_Q_DEFERRED);
+       write_seqlock(&ibdev->iowait_lock);
+       if (list_empty(&queue->busy.list))
+               list_add_tail(&queue->busy.list, &sde->dmawait);
+       write_sequnlock(&ibdev->iowait_lock);
+
+       return -EBUSY;
+}
+
+/*
+ * iowait "wakeup" callback for a user SDMA packet queue: flip the queue
+ * state to ACTIVE and wake anyone sleeping on the iowait's wait_dma
+ * queue. @reason is not used.
+ */
+static void activate_packet_queue(struct iowait *wait, int reason)
+{
+       struct hfi1_user_sdma_pkt_q *pq =
+               container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
+
+       xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
+       wake_up(&wait->wait_dma);
+}
+
+/* Slab constructor for the txreq cache: hand out zero-filled objects. */
+static void sdma_kmem_cache_ctor(void *obj)
+{
+       memset(obj, 0, sizeof(struct user_sdma_txreq));
+}
+
+/*
+ * hfi1_user_sdma_alloc_queues() - set up the user SDMA queues of a file
+ * @uctxt: receive context the file is attached to
+ * @fp:    open file; fp->private_data is the struct hfi1_filedata that
+ *         receives the new queues in ->pq and ->cq
+ *
+ * Allocates the packet queue with its request array and txreq slab
+ * cache, the completion queue with its vmalloc_user() entry array,
+ * registers the queue's rb-tree root with the MMU helper and finally
+ * links the packet queue on uctxt->sdma_queues.
+ *
+ * On any failure everything allocated so far is released again and
+ * fd->pq / fd->cq are left NULL.
+ *
+ * Return: 0 on success, -EBADF for a NULL argument, -EINVAL when the
+ * completion ring size is 0, -ENOMEM on allocation failure, or the error
+ * returned by hfi1_mmu_rb_register().
+ */
+int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp)
+{
+       struct hfi1_filedata *fd;
+       int ret = 0;
+       unsigned memsize;
+       char buf[64];
+       struct hfi1_devdata *dd;
+       struct hfi1_user_sdma_comp_q *cq;
+       struct hfi1_user_sdma_pkt_q *pq;
+       unsigned long flags;
+
+       if (!uctxt || !fp) {
+               ret = -EBADF;
+               goto done;
+       }
+
+       fd = fp->private_data;
+
+       if (!hfi1_sdma_comp_ring_size) {
+               ret = -EINVAL;
+               goto done;
+       }
+
+       dd = uctxt->dd;
+
+       /* Every failure from here up to the MMU registration is -ENOMEM. */
+       ret = -ENOMEM;
+
+       pq = kzalloc(sizeof(*pq), GFP_KERNEL);
+       if (!pq)
+               goto done;
+
+       /* kcalloc() checks the count * size multiplication for overflow */
+       pq->reqs = kcalloc(hfi1_sdma_comp_ring_size, sizeof(*pq->reqs),
+                          GFP_KERNEL);
+       if (!pq->reqs)
+               goto pq_reqs_nomem;
+
+       INIT_LIST_HEAD(&pq->list);
+       pq->dd = dd;
+       pq->ctxt = uctxt->ctxt;
+       pq->subctxt = fd->subctxt;
+       pq->n_max_reqs = hfi1_sdma_comp_ring_size;
+       pq->state = SDMA_PKT_Q_INACTIVE;
+       atomic_set(&pq->n_reqs, 0);
+       init_waitqueue_head(&pq->wait);
+       pq->sdma_rb_root = RB_ROOT;
+       INIT_LIST_HEAD(&pq->evict);
+       spin_lock_init(&pq->evict_lock);
+
+       iowait_init(&pq->busy, 0, NULL, defer_packet_queue,
+                   activate_packet_queue, NULL);
+       pq->reqidx = 0;
+       snprintf(buf, sizeof(buf), "txreq-kmem-cache-%u-%u-%u", dd->unit,
+                uctxt->ctxt, fd->subctxt);
+       pq->txreq_cache = kmem_cache_create(buf,
+                              sizeof(struct user_sdma_txreq),
+                                           L1_CACHE_BYTES,
+                                           SLAB_HWCACHE_ALIGN,
+                                           sdma_kmem_cache_ctor);
+       if (!pq->txreq_cache) {
+               dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
+                          uctxt->ctxt);
+               goto pq_txreq_nomem;
+       }
+       fd->pq = pq;
+       cq = kzalloc(sizeof(*cq), GFP_KERNEL);
+       if (!cq)
+               goto cq_nomem;
+
+       memsize = PAGE_ALIGN(sizeof(*cq->comps) * hfi1_sdma_comp_ring_size);
+       cq->comps = vmalloc_user(memsize);
+       if (!cq->comps)
+               goto cq_comps_nomem;
+
+       cq->nentries = hfi1_sdma_comp_ring_size;
+       fd->cq = cq;
+
+       ret = hfi1_mmu_rb_register(&pq->sdma_rb_root, &sdma_rb_ops);
+       if (ret) {
+               dd_dev_err(dd, "Failed to register with MMU %d\n", ret);
+               /* unwind everything; ret keeps the registration error */
+               goto mmu_register_failed;
+       }
+
+       spin_lock_irqsave(&uctxt->sdma_qlock, flags);
+       list_add(&pq->list, &uctxt->sdma_queues);
+       spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
+       goto done;
+
+mmu_register_failed:
+       vfree(cq->comps);
+       fd->cq = NULL;
+cq_comps_nomem:
+       kfree(cq);
+cq_nomem:
+       kmem_cache_destroy(pq->txreq_cache);
+pq_txreq_nomem:
+       kfree(pq->reqs);
+pq_reqs_nomem:
+       kfree(pq);
+       fd->pq = NULL;
+done:
+       return ret;
+}
+
+/*
+ * hfi1_user_sdma_free_queues() - tear down the user SDMA queues of a file
+ * @fd: per-file data whose ->pq and ->cq are released (either may be NULL)
+ *
+ * For the packet queue: unregister its rb-tree root from the MMU helper,
+ * unlink it from the context's queue list, drain the iowait, wait for the
+ * queue to become INACTIVE and free it. Then free the completion queue.
+ *
+ * NOTE(review): the wait is interruptible and its result is ignored, so a
+ * signal lets the function free the queue while it is not yet INACTIVE -
+ * confirm whether that is acceptable.
+ *
+ * Return: always 0.
+ */
+int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd)
+{
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct hfi1_user_sdma_pkt_q *pq;
+       unsigned long flags;
+
+       hfi1_cdbg(SDMA, "[%u:%u:%u] Freeing user SDMA queues", uctxt->dd->unit,
+                 uctxt->ctxt, fd->subctxt);
+       pq = fd->pq;
+       if (pq) {
+               /*
+                * Only touch the rb root once pq is known to be valid;
+                * fd->pq is NULL when queue allocation failed.
+                */
+               hfi1_mmu_rb_unregister(&pq->sdma_rb_root);
+               spin_lock_irqsave(&uctxt->sdma_qlock, flags);
+               if (!list_empty(&pq->list))
+                       list_del_init(&pq->list);
+               spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
+               iowait_sdma_drain(&pq->busy);
+               /* Wait until all requests have been freed. */
+               wait_event_interruptible(
+                       pq->wait,
+                       (ACCESS_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE));
+               kfree(pq->reqs);
+               kmem_cache_destroy(pq->txreq_cache);
+               kfree(pq);
+               fd->pq = NULL;
+       }
+       if (fd->cq) {
+               vfree(fd->cq->comps);
+               kfree(fd->cq);
+               fd->cq = NULL;
+       }
+       return 0;
+}
+
+/*
+ * hfi1_user_sdma_process_request() - validate and submit a user SDMA request
+ * @fp:    open file; fp->private_data is the struct hfi1_filedata holding
+ *         the packet (pq) and completion (cq) queues
+ * @iovec: vectors describing the request: element 0 holds the
+ *         struct sdma_req_info followed by the packet header template,
+ *         then come the data vectors and, for EXPECTED requests, one
+ *         last vector with the TID array
+ * @dim:   number of elements in @iovec
+ * @count: incremented by the number of vectors consumed on success
+ *
+ * The request slot is chosen by the user supplied info.comp_idx. All
+ * user controlled values (vector count, slot index, opcode, VL/SC,
+ * P_KEY, LNH, TID count) are validated before use. The call blocks
+ * until every packet of the request has been handed to the SDMA engine,
+ * but does not wait for the send completions.
+ *
+ * Return: 0 on success or a negative errno.
+ */
+int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
+                                  unsigned long dim, unsigned long *count)
+{
+       int ret = 0, i = 0;
+       struct hfi1_filedata *fd = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct hfi1_user_sdma_pkt_q *pq = fd->pq;
+       struct hfi1_user_sdma_comp_q *cq = fd->cq;
+       struct hfi1_devdata *dd = pq->dd;
+       unsigned long idx = 0;
+       u8 pcount = initial_pkt_count;
+       struct sdma_req_info info;
+       struct user_sdma_request *req;
+       u8 opcode, sc, vl;
+       int req_queued = 0;
+
+       if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
+               hfi1_cdbg(
+                  SDMA,
+                  "[%u:%u:%u] First vector not big enough for header %lu/%lu",
+                  dd->unit, uctxt->ctxt, fd->subctxt,
+                  iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
+               return -EINVAL;
+       }
+       ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
+       if (ret) {
+               hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
+                         dd->unit, uctxt->ctxt, fd->subctxt, ret);
+               return -EFAULT;
+       }
+
+       trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
+                                    (u16 *)&info);
+
+       /*
+        * The vector count in the request comes from user space and
+        * drives every iovec[] access below, so it must not exceed the
+        * number of vectors that were actually passed in.
+        */
+       if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
+               hfi1_cdbg(SDMA,
+                         "[%u:%u:%u:%u] Invalid iov count %u, dim %lu",
+                         dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
+                         (unsigned)req_iovcnt(info.ctrl), dim);
+               return -EINVAL;
+       }
+
+       /*
+        * comp_idx is user controlled as well and indexes both the
+        * completion ring and the request array, which each have
+        * hfi1_sdma_comp_ring_size entries.
+        */
+       if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
+               hfi1_cdbg(SDMA,
+                         "[%u:%u:%u:%u] Invalid comp index",
+                         dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
+               return -EINVAL;
+       }
+
+       if (cq->comps[info.comp_idx].status == QUEUED ||
+           test_bit(SDMA_REQ_IN_USE, &pq->reqs[info.comp_idx].flags)) {
+               hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in QUEUED state",
+                         dd->unit, uctxt->ctxt, fd->subctxt,
+                         info.comp_idx);
+               return -EBADSLT;
+       }
+       if (!info.fragsize) {
+               hfi1_cdbg(SDMA,
+                         "[%u:%u:%u:%u] Request does not specify fragsize",
+                         dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
+               return -EINVAL;
+       }
+       /*
+        * We've done all the safety checks that we can up to this point,
+        * "allocate" the request entry.
+        */
+       hfi1_cdbg(SDMA, "[%u:%u:%u] Using req/comp entry %u\n", dd->unit,
+                 uctxt->ctxt, fd->subctxt, info.comp_idx);
+       req = pq->reqs + info.comp_idx;
+       memset(req, 0, sizeof(*req));
+       /* Mark the request as IN_USE before we start filling it in. */
+       set_bit(SDMA_REQ_IN_USE, &req->flags);
+       req->data_iovs = req_iovcnt(info.ctrl) - 1;
+       req->pq = pq;
+       req->cq = cq;
+       req->status = -1;
+       INIT_LIST_HEAD(&req->txps);
+
+       memcpy(&req->info, &info, sizeof(info));
+
+       if (req_opcode(info.ctrl) == EXPECTED)
+               req->data_iovs--;
+
+       if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
+               SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
+                        MAX_VECTORS_PER_REQ);
+               /*
+                * Hand the slot back, otherwise it stays IN_USE forever
+                * and every later request for it fails with -EBADSLT.
+                * The common free_req path is not used here because
+                * data_iovs is out of range for the iovs[] array.
+                */
+               clear_bit(SDMA_REQ_IN_USE, &req->flags);
+               return -EINVAL;
+       }
+       /* Copy the header from the user buffer */
+       ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
+                            sizeof(req->hdr));
+       if (ret) {
+               SDMA_DBG(req, "Failed to copy header template (%d)", ret);
+               ret = -EFAULT;
+               goto free_req;
+       }
+
+       /* If Static rate control is not enabled, sanitize the header. */
+       if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
+               req->hdr.pbc[2] = 0;
+
+       /* Validate the opcode. Do not trust packets from user space blindly. */
+       opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
+       if ((opcode & USER_OPCODE_CHECK_MASK) !=
+            USER_OPCODE_CHECK_VAL) {
+               SDMA_DBG(req, "Invalid opcode (%d)", opcode);
+               ret = -EINVAL;
+               goto free_req;
+       }
+       /*
+        * Validate the vl. Do not trust packets from user space blindly.
+        * VL comes from PBC, SC comes from LRH, and the VL needs to
+        * match the SC look up.
+        */
+       vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
+       sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
+             (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
+       if (vl >= dd->pport->vls_operational ||
+           vl != sc_to_vlt(dd, sc)) {
+               SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
+               ret = -EINVAL;
+               goto free_req;
+       }
+
+       /* Checking P_KEY for requests from user-space */
+       if (egress_pkey_check(dd->pport, req->hdr.lrh, req->hdr.bth, sc,
+                             PKEY_CHECK_INVALID)) {
+               ret = -EINVAL;
+               goto free_req;
+       }
+
+       /*
+        * Also should check the BTH.lnh. If it says the next header is GRH then
+        * the RXE parsing will be off and will land in the middle of the KDETH
+        * or miss it entirely.
+        */
+       if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
+               SDMA_DBG(req, "User tried to pass in a GRH");
+               ret = -EINVAL;
+               goto free_req;
+       }
+
+       req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
+       /*
+        * Calculate the initial TID offset based on the values of
+        * KDETH.OFFSET and KDETH.OM that are passed in.
+        */
+       req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
+               (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
+                KDETH_OM_LARGE : KDETH_OM_SMALL);
+       SDMA_DBG(req, "Initial TID offset %u", req->tidoffset);
+       idx++;
+
+       /* Save all the IO vector structures */
+       while (i < req->data_iovs) {
+               INIT_LIST_HEAD(&req->iovs[i].list);
+               memcpy(&req->iovs[i].iov, iovec + idx++, sizeof(struct iovec));
+               ret = pin_vector_pages(req, &req->iovs[i]);
+               if (ret) {
+                       req->status = ret;
+                       goto free_req;
+               }
+               req->data_len += req->iovs[i++].iov.iov_len;
+       }
+       SDMA_DBG(req, "total data length %u", req->data_len);
+
+       if (pcount > req->info.npkts)
+               pcount = req->info.npkts;
+       /*
+        * Copy any TID info
+        * User space will provide the TID info only when the
+        * request type is EXPECTED. This is true even if there is
+        * only one packet in the request and the header is already
+        * setup. The reason for the singular TID case is that the
+        * driver needs to perform safety checks.
+        */
+       if (req_opcode(req->info.ctrl) == EXPECTED) {
+               u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
+
+               if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
+                       ret = -EINVAL;
+                       goto free_req;
+               }
+               req->tids = kcalloc(ntids, sizeof(*req->tids), GFP_KERNEL);
+               if (!req->tids) {
+                       ret = -ENOMEM;
+                       goto free_req;
+               }
+               /*
+                * We have to copy all of the tids because they may vary
+                * in size and, therefore, the TID count might not be
+                * equal to the pkt count. However, there is no way to
+                * tell at this point.
+                */
+               ret = copy_from_user(req->tids, iovec[idx].iov_base,
+                                    ntids * sizeof(*req->tids));
+               if (ret) {
+                       SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
+                                ntids, ret);
+                       ret = -EFAULT;
+                       goto free_req;
+               }
+               req->n_tids = ntids;
+               idx++;
+       }
+
+       /* Have to select the engine */
+       req->sde = sdma_select_engine_vl(dd,
+                                        (u32)(uctxt->ctxt + fd->subctxt),
+                                        vl);
+       if (!req->sde || !sdma_running(req->sde)) {
+               ret = -ECOMM;
+               goto free_req;
+       }
+
+       /* We don't need an AHG entry if the request contains only one packet */
+       if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG)) {
+               int ahg = sdma_ahg_alloc(req->sde);
+
+               if (likely(ahg >= 0)) {
+                       req->ahg_idx = (u8)ahg;
+                       set_bit(SDMA_REQ_HAVE_AHG, &req->flags);
+               }
+       }
+
+       set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
+       atomic_inc(&pq->n_reqs);
+       req_queued = 1;
+       /* Send the first N packets in the request to buy us some time */
+       ret = user_sdma_send_pkts(req, pcount);
+       if (unlikely(ret < 0 && ret != -EBUSY)) {
+               req->status = ret;
+               goto free_req;
+       }
+
+       /*
+        * It is possible that the SDMA engine would have processed all the
+        * submitted packets by the time we get here. Therefore, only set
+        * packet queue state to ACTIVE if there are still uncompleted
+        * requests.
+        */
+       if (atomic_read(&pq->n_reqs))
+               xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
+
+       /*
+        * This is a somewhat blocking send implementation.
+        * The driver will block the caller until all packets of the
+        * request have been submitted to the SDMA engine. However, it
+        * will not wait for send completions.
+        */
+       while (!test_bit(SDMA_REQ_SEND_DONE, &req->flags)) {
+               ret = user_sdma_send_pkts(req, pcount);
+               if (ret < 0) {
+                       if (ret != -EBUSY) {
+                               req->status = ret;
+                               set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
+                               if (ACCESS_ONCE(req->seqcomp) ==
+                                   req->seqsubmitted - 1)
+                                       goto free_req;
+                               return ret;
+                       }
+                       wait_event_interruptible_timeout(
+                               pq->busy.wait_dma,
+                               (pq->state == SDMA_PKT_Q_ACTIVE),
+                               msecs_to_jiffies(
+                                       SDMA_IOWAIT_TIMEOUT));
+               }
+       }
+       *count += idx;
+       return 0;
+free_req:
+       user_sdma_free_request(req, true);
+       if (req_queued)
+               pq_update(pq);
+       set_comp_state(pq, cq, info.comp_idx, ERROR, req->status);
+       return ret;
+}
+
+/*
+ * Work out how many payload bytes the next packet of @req carries.
+ *
+ *  - First packet (seqnum == 0): the length comes from the LRH length
+ *    field of the header template, which also counts the header and the
+ *    ICRC, so those are subtracted.
+ *  - Later packets of an EXPECTED request: limited by the room left in
+ *    the current TID pair, the fragment size and the data still unsent.
+ *    When the current pair is exhausted, req->tididx is advanced and
+ *    req->tidoffset reset.
+ *  - Later packets otherwise: the smaller of the fragment size and the
+ *    data still unsent.
+ */
+static inline u32 compute_data_length(struct user_sdma_request *req,
+                                     struct user_sdma_txreq *tx)
+{
+       u32 fragsize = req->info.fragsize;
+       u32 len;
+
+       if (!req->seqnum) {
+               len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
+                      (sizeof(tx->hdr) - 4));
+       } else if (req_opcode(req->info.ctrl) != EXPECTED) {
+               len = min(req->data_len - req->sent, fragsize);
+       } else {
+               u32 pair_bytes = EXP_TID_GET(req->tids[req->tididx], LEN) *
+                       PAGE_SIZE;
+
+               /* Room left in the current TID pair, capped at one fragment */
+               len = min(pair_bytes - req->tidoffset, fragsize);
+               if (unlikely(!len)) {
+                       /*
+                        * Current pair is full: step to the next one. The
+                        * index moves forward even when no usable pair
+                        * follows, in which case len stays 0.
+                        */
+                       req->tididx++;
+                       if (req->tididx < req->n_tids &&
+                           req->tids[req->tididx]) {
+                               pair_bytes =
+                                       EXP_TID_GET(req->tids[req->tididx],
+                                                   LEN) * PAGE_SIZE;
+                               req->tidoffset = 0;
+                               len = min(pair_bytes, fragsize);
+                       }
+               }
+               /*
+                * TID pairs map whole pages; never go past the data that
+                * is actually left in the request.
+                */
+               len = min(len, req->data_len - req->sent);
+       }
+       SDMA_DBG(req, "Data Length = %u", len);
+       return len;
+}
+
+/*
+ * LRH length in bytes for a packet with @len bytes of payload: the
+ * header without its PBC, plus the 4-byte ICRC, plus the payload.
+ * @hdr is only used for its size.
+ */
+static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
+{
+       u32 hdr_bytes = sizeof(hdr) - sizeof(hdr.pbc);
+
+       return hdr_bytes + 4 + len;
+}
+
+/*
+ * Build and submit packets for a user SDMA request.
+ *
+ * @req:     request to make progress on
+ * @maxpkts: upper bound on the number of packets built by this call;
+ *           0 means "all packets remaining in the request"
+ *
+ * Every packet gets a user_sdma_txreq from pq->txreq_cache, a header
+ * (sent in full, or as AHG updates when SDMA_REQ_HAVE_AHG is set) and up
+ * to one fragment worth of payload pages, and is appended to req->txps.
+ * The list is then handed to sdma_send_txlist().
+ *
+ * Return: -EINVAL if the request has no packet queue, -EFAULT if a
+ * completion has reported an error or the payload cannot be mapped,
+ * -ENOMEM on allocation failure, the error of a failing header/SDMA
+ * helper, or otherwise the result of sdma_send_txlist() (a positive
+ * count of a partial submission is folded into req->seqsubmitted and
+ * reported as 0).
+ */
+static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
+{
+       int ret = 0;
+       unsigned npkts = 0;
+       struct user_sdma_txreq *tx = NULL;
+       struct hfi1_user_sdma_pkt_q *pq = NULL;
+       struct user_sdma_iovec *iovec = NULL;
+
+       if (!req->pq)
+               return -EINVAL;
+
+       pq = req->pq;
+
+       /* If tx completion has reported an error, we are done. */
+       if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) {
+               set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
+               return -EFAULT;
+       }
+
+       /*
+        * Check if we might have sent the entire request already
+        */
+       if (unlikely(req->seqnum == req->info.npkts)) {
+               if (!list_empty(&req->txps))
+                       goto dosend;
+               return ret;
+       }
+
+       if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
+               maxpkts = req->info.npkts - req->seqnum;
+
+       while (npkts < maxpkts) {
+               u32 datalen = 0, queued = 0, data_sent = 0;
+               u64 iov_offset = 0;
+
+               /*
+                * Check whether any of the completions have come back
+                * with errors. If so, we are not going to process any
+                * more packets from this request.
+                */
+               if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) {
+                       set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
+                       return -EFAULT;
+               }
+
+               tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
+               if (!tx)
+                       return -ENOMEM;
+
+               tx->flags = 0;
+               tx->req = req;
+               tx->busycount = 0;
+               INIT_LIST_HEAD(&tx->list);
+
+               if (req->seqnum == req->info.npkts - 1)
+                       tx->flags |= TXREQ_FLAGS_REQ_LAST_PKT;
+
+               /*
+                * Calculate the payload size - this is min of the fragment
+                * (MTU) size or the remaining bytes in the request but only
+                * if we have payload data.
+                */
+               if (req->data_len) {
+                       iovec = &req->iovs[req->iov_idx];
+                       if (ACCESS_ONCE(iovec->offset) == iovec->iov.iov_len) {
+                               if (++req->iov_idx == req->data_iovs) {
+                                       /*
+                                        * tx->txreq has not been through
+                                        * sdma_txinit*() yet, so there is
+                                        * nothing for sdma_txclean() to
+                                        * undo: only free the allocation.
+                                        */
+                                       ret = -EFAULT;
+                                       goto free_tx;
+                               }
+                               iovec = &req->iovs[req->iov_idx];
+                               WARN_ON(iovec->offset);
+                       }
+
+                       datalen = compute_data_length(req, tx);
+                       if (!datalen) {
+                               SDMA_DBG(req,
+                                        "Request has data but pkt len is 0");
+                               ret = -EFAULT;
+                               goto free_tx;
+                       }
+               }
+
+               if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags)) {
+                       if (!req->seqnum) {
+                               u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
+                               u32 lrhlen = get_lrh_len(req->hdr, datalen);
+                               /*
+                                * Copy the request header into the tx header
+                                * because the HW needs a cacheline-aligned
+                                * address.
+                                * This copy can be optimized out if the hdr
+                                * member of user_sdma_request were also
+                                * cacheline aligned.
+                                */
+                               memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
+                               if (PBC2LRH(pbclen) != lrhlen) {
+                                       pbclen = (pbclen & 0xf000) |
+                                               LRH2PBC(lrhlen);
+                                       tx->hdr.pbc[0] = cpu_to_le16(pbclen);
+                               }
+                               ret = sdma_txinit_ahg(&tx->txreq,
+                                                     SDMA_TXREQ_F_AHG_COPY,
+                                                     sizeof(tx->hdr) + datalen,
+                                                     req->ahg_idx, 0, NULL, 0,
+                                                     user_sdma_txreq_cb);
+                               if (ret)
+                                       goto free_tx;
+                               ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq,
+                                                       &tx->hdr,
+                                                       sizeof(tx->hdr));
+                               if (ret)
+                                       goto free_txreq;
+                       } else {
+                               int changes;
+
+                               changes = set_txreq_header_ahg(req, tx,
+                                                              datalen);
+                               if (changes < 0) {
+                                       /*
+                                        * Propagate the failure; ret would
+                                        * otherwise still be 0 here and the
+                                        * caller would see success.
+                                        */
+                                       ret = changes;
+                                       goto free_tx;
+                               }
+                               /*
+                                * Check the init result just like the
+                                * first-packet branch does; on failure the
+                                * txreq is not set up and must not be used.
+                                */
+                               ret = sdma_txinit_ahg(&tx->txreq,
+                                                     SDMA_TXREQ_F_USE_AHG,
+                                                     datalen, req->ahg_idx,
+                                                     changes, req->ahg,
+                                                     sizeof(req->hdr),
+                                                     user_sdma_txreq_cb);
+                               if (ret)
+                                       goto free_tx;
+                       }
+               } else {
+                       ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
+                                         datalen, user_sdma_txreq_cb);
+                       if (ret)
+                               goto free_tx;
+                       /*
+                        * Modify the header for this packet. This only needs
+                        * to be done if we are not going to use AHG. Otherwise,
+                        * the HW will do it based on the changes we gave it
+                        * during sdma_txinit_ahg().
+                        */
+                       ret = set_txreq_header(req, tx, datalen);
+                       if (ret)
+                               goto free_txreq;
+               }
+
+               /*
+                * If the request contains any data vectors, add up to
+                * fragsize bytes to the descriptor.
+                */
+               while (queued < datalen &&
+                      (req->sent + data_sent) < req->data_len) {
+                       unsigned long base, offset;
+                       unsigned pageidx, len;
+
+                       base = (unsigned long)iovec->iov.iov_base;
+                       offset = offset_in_page(base + iovec->offset +
+                                               iov_offset);
+                       pageidx = (((iovec->offset + iov_offset +
+                                    base) - (base & PAGE_MASK)) >> PAGE_SHIFT);
+                       /* Never let a single descriptor cross a page. */
+                       len = offset + req->info.fragsize > PAGE_SIZE ?
+                               PAGE_SIZE - offset : req->info.fragsize;
+                       len = min((datalen - queued), len);
+                       ret = sdma_txadd_page(pq->dd, &tx->txreq,
+                                             iovec->pages[pageidx],
+                                             offset, len);
+                       if (ret) {
+                               SDMA_DBG(req, "SDMA txreq add page failed %d\n",
+                                        ret);
+                               goto free_txreq;
+                       }
+                       iov_offset += len;
+                       queued += len;
+                       data_sent += len;
+                       if (unlikely(queued < datalen &&
+                                    pageidx == iovec->npages &&
+                                    req->iov_idx < req->data_iovs - 1)) {
+                               iovec->offset += iov_offset;
+                               iovec = &req->iovs[++req->iov_idx];
+                               iov_offset = 0;
+                       }
+               }
+               /*
+                * The txreq was submitted successfully so we can update
+                * the counters.
+                */
+               req->koffset += datalen;
+               if (req_opcode(req->info.ctrl) == EXPECTED)
+                       req->tidoffset += datalen;
+               req->sent += data_sent;
+               if (req->data_len)
+                       iovec->offset += iov_offset;
+               list_add_tail(&tx->txreq.list, &req->txps);
+               /*
+                * It is important to increment this here as it is used to
+                * generate the BTH.PSN and, therefore, can't be bulk-updated
+                * outside of the loop.
+                */
+               tx->seqnum = req->seqnum++;
+               npkts++;
+       }
+dosend:
+       ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps);
+       if (list_empty(&req->txps)) {
+               req->seqsubmitted = req->seqnum;
+               if (req->seqnum == req->info.npkts) {
+                       set_bit(SDMA_REQ_SEND_DONE, &req->flags);
+                       /*
+                        * The txreq has already been submitted to the HW queue
+                        * so we can free the AHG entry now. Corruption will not
+                        * happen due to the sequential manner in which
+                        * descriptors are processed.
+                        */
+                       if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags))
+                               sdma_ahg_free(req->sde, req->ahg_idx);
+               }
+       } else if (ret > 0) {
+               /* Partial submission: account for what was accepted. */
+               req->seqsubmitted += ret;
+               ret = 0;
+       }
+       return ret;
+
+free_txreq:
+       /* The txreq was initialised: release its descriptors first. */
+       sdma_txclean(pq->dd, &tx->txreq);
+free_tx:
+       kmem_cache_free(pq->txreq_cache, tx);
+       return ret;
+}
+
+/*
+ * Number of pages touched by the byte range described by @iov, counting
+ * the partially covered first and last pages.
+ */
+static inline int num_user_pages(const struct iovec *iov)
+{
+       const unsigned long first = (unsigned long)iov->iov_base;
+       const unsigned long last = first + iov->iov_len - 1;
+       const unsigned long first_page = first & PAGE_MASK;
+       const unsigned long last_page = last & PAGE_MASK;
+
+       return 1 + ((last_page - first_page) >> PAGE_SHIFT);
+}
+
+/*
+ * Evict unused nodes from the packet queue's eviction list, walking it
+ * from the tail, until at least @npages pages have been collected or the
+ * list is exhausted. Returns the number of pages held by the evicted nodes.
+ */
+static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
+{
+       struct sdma_mmu_node *cur, *tmp;
+       LIST_HEAD(to_evict);
+       u32 cleared = 0;
+
+       spin_lock(&pq->evict_lock);
+       list_for_each_entry_safe_reverse(cur, tmp, &pq->evict, list) {
+               /* Nodes with a non-zero refcount are still in use: skip. */
+               if (atomic_read(&cur->refcount))
+                       continue;
+
+               set_bit(SDMA_CACHE_NODE_EVICT, &cur->flags);
+               list_del_init(&cur->list);
+               list_add(&cur->list, &to_evict);
+               cleared += cur->npages;
+               if (cleared >= npages)
+                       break;
+       }
+       spin_unlock(&pq->evict_lock);
+
+       /* The actual removal is done after evict_lock has been dropped. */
+       list_for_each_entry_safe(cur, tmp, &to_evict, list)
+               hfi1_mmu_rb_remove(&pq->sdma_rb_root, &cur->rb);
+
+       return cleared;
+}
+
+/*
+ * Pin the user pages backing @iovec and record them in a cache node.
+ *
+ * A node for iov_base is first requested from hfi1_mmu_rb_extract(); if
+ * none is returned a new, empty node is allocated. When the node holds
+ * fewer pages than the iovec spans, only the missing trailing pages are
+ * pinned and stored in a new, larger page array that replaces the node's
+ * old one. On success iovec->pages, iovec->npages and iovec->node are
+ * set and the node is handed to hfi1_mmu_rb_insert().
+ *
+ * Return: 0 on success, -ENOMEM on allocation failure, -EFAULT when fewer
+ * pages than requested could be pinned, or the error returned by
+ * hfi1_acquire_user_pages() or hfi1_mmu_rb_insert().
+ */
+static int pin_vector_pages(struct user_sdma_request *req,
+                           struct user_sdma_iovec *iovec) {
+       int ret = 0, pinned, npages, cleared;
+       struct page **pages;
+       struct hfi1_user_sdma_pkt_q *pq = req->pq;
+       struct sdma_mmu_node *node = NULL;
+       struct mmu_rb_node *rb_node;
+
+       rb_node = hfi1_mmu_rb_extract(&pq->sdma_rb_root,
+                                     (unsigned long)iovec->iov.iov_base,
+                                     iovec->iov.iov_len);
+       /* Both NULL and an ERR_PTR value are treated as "no cached node". */
+       if (rb_node && !IS_ERR(rb_node))
+               node = container_of(rb_node, struct sdma_mmu_node, rb);
+       else
+               rb_node = NULL;
+
+       if (!node) {
+               node = kzalloc(sizeof(*node), GFP_KERNEL);
+               if (!node)
+                       return -ENOMEM;
+
+               node->rb.addr = (unsigned long)iovec->iov.iov_base;
+               node->pq = pq;
+               atomic_set(&node->refcount, 0);
+               INIT_LIST_HEAD(&node->list);
+       }
+
+       npages = num_user_pages(&iovec->iov);
+       if (node->npages < npages) {
+               /* New array sized for the whole iovec; old entries come first. */
+               pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
+               if (!pages) {
+                       SDMA_DBG(req, "Failed page array alloc");
+                       ret = -ENOMEM;
+                       goto bail;
+               }
+               memcpy(pages, node->pages, node->npages * sizeof(*pages));
+
+               /* From here on npages is the number of pages still to pin. */
+               npages -= node->npages;
+
+               /*
+                * If rb_node is NULL, it means that this is brand new node
+                * and, therefore not on the eviction list.
+                * If, however, the rb_node is non-NULL, it means that the
+                * node is already in RB tree and, therefore on the eviction
+                * list (nodes are unconditionally inserted in the eviction
+                * list). In that case, we have to remove the node prior to
+                * calling the eviction function in order to prevent it from
+                * freeing this node.
+                */
+               if (rb_node) {
+                       spin_lock(&pq->evict_lock);
+                       list_del_init(&node->list);
+                       spin_unlock(&pq->evict_lock);
+               }
+retry:
+               /*
+                * If the pin limit check fails, evict cached nodes and check
+                * again as long as eviction freed at least npages pages;
+                * otherwise fall through and attempt the pin anyway.
+                * NOTE(review): termination of this loop relies on
+                * hfi1_can_pin_pages() eventually succeeding or eviction
+                * running dry - confirm.
+                */
+               if (!hfi1_can_pin_pages(pq->dd, pq->n_locked, npages)) {
+                       cleared = sdma_cache_evict(pq, npages);
+                       if (cleared >= npages)
+                               goto retry;
+               }
+               /* Pin only the pages that follow the ones the node holds. */
+               pinned = hfi1_acquire_user_pages(
+                       ((unsigned long)iovec->iov.iov_base +
+                        (node->npages * PAGE_SIZE)), npages, 0,
+                       pages + node->npages);
+               if (pinned < 0) {
+                       kfree(pages);
+                       ret = pinned;
+                       goto bail;
+               }
+               if (pinned != npages) {
+                       /*
+                        * Partial pin: release the newly pinned pages;
+                        * unpin_vector_pages() also frees the new array.
+                        */
+                       unpin_vector_pages(current->mm, pages, node->npages,
+                                          pinned);
+                       ret = -EFAULT;
+                       goto bail;
+               }
+               /* Switch the node over to the new, larger page array. */
+               kfree(node->pages);
+               node->rb.len = iovec->iov.iov_len;
+               node->pages = pages;
+               node->npages += pinned;
+               npages = node->npages;
+               spin_lock(&pq->evict_lock);
+               list_add(&node->list, &pq->evict);
+               pq->n_locked += pinned;
+               spin_unlock(&pq->evict_lock);
+       }
+       /*
+        * npages is the node's page count if pages were pinned above,
+        * otherwise the number of pages spanned by this iovec.
+        */
+       iovec->pages = node->pages;
+       iovec->npages = npages;
+       iovec->node = node;
+
+       ret = hfi1_mmu_rb_insert(&req->pq->sdma_rb_root, &node->rb);
+       if (ret) {
+               /* Undo the eviction-list and locked-page accounting. */
+               spin_lock(&pq->evict_lock);
+               if (!list_empty(&node->list))
+                       list_del(&node->list);
+               pq->n_locked -= node->npages;
+               spin_unlock(&pq->evict_lock);
+               goto bail;
+       }
+       return 0;
+bail:
+       /*
+        * NOTE(review): pages are only released here when the node came
+        * from the cache (rb_node != NULL). A freshly allocated node that
+        * reaches this label after its pages were pinned (insert failure)
+        * appears to keep node->pages pinned and allocated - confirm
+        * whether this leaks.
+        */
+       if (rb_node)
+               unpin_vector_pages(current->mm, node->pages, 0, node->npages);
+       kfree(node);
+       return ret;
+}
+
+/*
+ * Release @npages pages of the @pages array, beginning at index @start,
+ * and then free the array itself.
+ */
+static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
+                              unsigned start, unsigned npages)
+{
+       struct page **first = pages + start;
+
+       hfi1_release_user_pages(mm, first, npages, 0);
+       /* The whole array is freed, not just the released part. */
+       kfree(pages);
+}
+
+/*
+ * Sanity-check the header template supplied by user space.
+ *
+ * Checks applied to every request:
+ *    - fragment size is a multiple of PIO_BLOCK_SIZE
+ *    - packet length is a multiple of 4 bytes
+ *    - entire request length is a multiple of 4 bytes
+ *    - packet length does not exceed that of a full fragment
+ *
+ * These checks are only done for the first packet of the transfer since
+ * the header is "given" to us by user space. For the remainder of the
+ * packets we compute the values.
+ *
+ * Returns 0 if the template is acceptable, -EINVAL otherwise.
+ */
+static int check_header_template(struct user_sdma_request *req,
+                                struct hfi1_pkt_header *hdr, u32 lrhlen,
+                                u32 datalen)
+{
+       u32 tidval, tidlen, tididx, tidctrl, tidoff;
+       __le32 kval;
+
+       if (req->info.fragsize % PIO_BLOCK_SIZE)
+               return -EINVAL;
+       if ((lrhlen & 0x3) || (req->data_len & 0x3))
+               return -EINVAL;
+       if (lrhlen > get_lrh_len(*hdr, req->info.fragsize))
+               return -EINVAL;
+
+       /* Only expected receive packets need the TID checks below. */
+       if (req_opcode(req->info.ctrl) != EXPECTED)
+               return 0;
+
+       /*
+        * The header is checked only on the first packet. Furthermore,
+        * we ensure that at least one TID entry is copied when the
+        * request is submitted. Therefore, we don't have to verify that
+        * tididx points to something sane.
+        */
+       tidval = req->tids[req->tididx];
+       tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE;
+       tididx = EXP_TID_GET(tidval, IDX);
+       tidctrl = EXP_TID_GET(tidval, CTRL);
+       kval = hdr->kdeth.ver_tid_offset;
+
+       /* KDETH.OFFSET is expressed in units selected by KDETH.OM. */
+       tidoff = KDETH_GET(kval, OFFSET) *
+                 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
+                  KDETH_OM_LARGE : KDETH_OM_SMALL);
+
+       /*
+        * Additional checks for expected receive packets:
+        *     - offset plus payload stays within the TID size
+        *     - TIDCtrl values match between header and TID array
+        *     - TID indexes match between header and TID array
+        */
+       if (tidoff + datalen > tidlen)
+               return -EINVAL;
+       if (KDETH_GET(kval, TIDCTRL) != tidctrl)
+               return -EINVAL;
+       if (KDETH_GET(kval, TID) != tididx)
+               return -EINVAL;
+
+       return 0;
+}
+
+/*
+ * Compute the BTH.PSN value of a packet that is @frags packets past the
+ * one carrying @bthpsn. Eager packets simply add @frags to the PSN.
+ * Expected packets (@expct non-zero) encode generation and sequence in
+ * the PSN, so only the bits under BTH_SEQ_MASK are advanced and the
+ * remaining bits are kept. The result is in CPU byte order, limited to
+ * the PSN width in use.
+ */
+static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
+{
+       const u32 mask = HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
+                                                         0xffffffull;
+       u32 psn = be32_to_cpu(bthpsn) & mask;
+
+       if (!expct) {
+               psn += frags;
+       } else {
+               const u32 seq = (psn + frags) & BTH_SEQ_MASK;
+
+               psn = (psn & ~BTH_SEQ_MASK) | seq;
+       }
+       return psn & mask;
+}
+
+/*
+ * Build the complete header of one packet in tx->hdr and add it to the
+ * tx descriptor list (path used when AHG is not in use).
+ *
+ * The request's header template is copied and its PBC/LRH lengths are
+ * adjusted to @datalen. The first packet of a request (seqnum 0) is
+ * validated with check_header_template() and otherwise sent as given;
+ * later packets get their PSN, ACK bit, offset and, for expected
+ * receives, TID fields recomputed. Note that req->tidoffset, req->tididx
+ * and req->omfactor may be updated as a side effect.
+ *
+ * Return: 0 on success, -EINVAL if the template is rejected or the TID
+ * list is exhausted, or the error returned by sdma_txadd_kvaddr().
+ */
+static int set_txreq_header(struct user_sdma_request *req,
+                           struct user_sdma_txreq *tx, u32 datalen)
+{
+       struct hfi1_user_sdma_pkt_q *pq = req->pq;
+       struct hfi1_pkt_header *hdr = &tx->hdr;
+       u16 pbclen;
+       int ret;
+       u32 tidval = 0, lrhlen = get_lrh_len(*hdr, datalen);
+
+       /* Copy the request's header template into this tx before modification */
+       memcpy(hdr, &req->hdr, sizeof(*hdr));
+
+       /*
+        * Check if the PBC and LRH length are mismatched. If so
+        * adjust both in the header.
+        */
+       pbclen = le16_to_cpu(hdr->pbc[0]);
+       if (PBC2LRH(pbclen) != lrhlen) {
+               /* Keep the top four bits of PBC word 0, replace the length. */
+               pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
+               hdr->pbc[0] = cpu_to_le16(pbclen);
+               hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
+               /*
+                * Third packet
+                * This is the first packet in the sequence that has
+                * a "static" size that can be used for the rest of
+                * the packets (besides the last one).
+                */
+               if (unlikely(req->seqnum == 2)) {
+                       /*
+                        * From this point on the lengths in both the
+                        * PBC and LRH are the same until the last
+                        * packet.
+                        * Adjust the template so we don't have to update
+                        * every packet
+                        */
+                       req->hdr.pbc[0] = hdr->pbc[0];
+                       req->hdr.lrh[2] = hdr->lrh[2];
+               }
+       }
+       /*
+        * We only have to modify the header if this is not the
+        * first packet in the request. Otherwise, we use the
+        * header given to us.
+        */
+       if (unlikely(!req->seqnum)) {
+               ret = check_header_template(req, hdr, lrhlen, datalen);
+               if (ret)
+                       return ret;
+               goto done;
+       }
+
+       /* PSN of this packet: template PSN advanced by the sequence number. */
+       hdr->bth[2] = cpu_to_be32(
+               set_pkt_bth_psn(hdr->bth[2],
+                               (req_opcode(req->info.ctrl) == EXPECTED),
+                               req->seqnum));
+
+       /* Set ACK request on last packet */
+       if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
+               hdr->bth[2] |= cpu_to_be32(1UL << 31);
+
+       /* Set the new offset */
+       hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
+       /* Expected packets have to fill in the new TID information */
+       if (req_opcode(req->info.ctrl) == EXPECTED) {
+               tidval = req->tids[req->tididx];
+               /*
+                * If the offset puts us at the end of the current TID,
+                * advance everything.
+                */
+               if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
+                                        PAGE_SIZE)) {
+                       req->tidoffset = 0;
+                       /*
+                        * Since we don't copy all the TIDs, all at once,
+                        * we have to check again.
+                        */
+                       if (++req->tididx > req->n_tids - 1 ||
+                           !req->tids[req->tididx]) {
+                               return -EINVAL;
+                       }
+                       tidval = req->tids[req->tididx];
+               }
+               /* Large offset units once the TID reaches KDETH_OM_MAX_SIZE. */
+               req->omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
+                       KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE : KDETH_OM_SMALL;
+               /* Set KDETH.TIDCtrl based on value for this TID. */
+               KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
+                         EXP_TID_GET(tidval, CTRL));
+               /* Set KDETH.TID based on value for this TID */
+               KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
+                         EXP_TID_GET(tidval, IDX));
+               /* Clear KDETH.SH only on the last packet */
+               if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
+                       KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
+               /*
+                * Set the KDETH.OFFSET and KDETH.OM based on size of
+                * transfer.
+                */
+               SDMA_DBG(req, "TID offset %ubytes %uunits om%u",
+                        req->tidoffset, req->tidoffset / req->omfactor,
+                        !!(req->omfactor - KDETH_OM_SMALL));
+               KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
+                         req->tidoffset / req->omfactor);
+               KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
+                         !!(req->omfactor - KDETH_OM_SMALL));
+       }
+done:
+       trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
+                                   req->info.comp_idx, hdr, tidval);
+       /* The header becomes part of this tx's descriptor list. */
+       return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
+}
+
+/*
+ * Record in req->ahg the header field updates needed for one packet on
+ * the AHG path, instead of building a full header.
+ *
+ * Updates are recorded with AHG_HEADER_SET() for the PBC/LRH lengths
+ * (when they differ from the template), BTH.PSN/BTH.A, the KDETH offset
+ * and, for expected receives, the KDETH TID fields. req->tidoffset,
+ * req->tididx and req->omfactor may be updated as a side effect.
+ *
+ * Return: the value of the local update counter 'diff' that is passed to
+ * every AHG_HEADER_SET() (presumably the number of recorded updates -
+ * the macro is defined elsewhere, confirm), or -EINVAL when the TID list
+ * is exhausted.
+ */
+static int set_txreq_header_ahg(struct user_sdma_request *req,
+                               struct user_sdma_txreq *tx, u32 len)
+{
+       int diff = 0;
+       struct hfi1_user_sdma_pkt_q *pq = req->pq;
+       struct hfi1_pkt_header *hdr = &req->hdr;
+       u16 pbclen = le16_to_cpu(hdr->pbc[0]);
+       u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, len);
+
+       if (PBC2LRH(pbclen) != lrhlen) {
+               /* PBC.PbcLengthDWs */
+               AHG_HEADER_SET(req->ahg, diff, 0, 0, 12,
+                              cpu_to_le16(LRH2PBC(lrhlen)));
+               /* LRH.PktLen (we need the full 16 bits due to byte swap) */
+               AHG_HEADER_SET(req->ahg, diff, 3, 0, 16,
+                              cpu_to_be16(lrhlen >> 2));
+       }
+
+       /*
+        * Do the common updates
+        */
+       /* BTH.PSN and BTH.A */
+       val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
+               (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
+       /* Bit 31 requests an ACK; it is only set on the last packet. */
+       if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
+               val32 |= 1UL << 31;
+       /* The 32-bit value is written as two 16-bit halves. */
+       AHG_HEADER_SET(req->ahg, diff, 6, 0, 16, cpu_to_be16(val32 >> 16));
+       AHG_HEADER_SET(req->ahg, diff, 6, 16, 16, cpu_to_be16(val32 & 0xffff));
+       /* KDETH.Offset */
+       AHG_HEADER_SET(req->ahg, diff, 15, 0, 16,
+                      cpu_to_le16(req->koffset & 0xffff));
+       AHG_HEADER_SET(req->ahg, diff, 15, 16, 16,
+                      cpu_to_le16(req->koffset >> 16));
+       if (req_opcode(req->info.ctrl) == EXPECTED) {
+               __le16 val;
+
+               tidval = req->tids[req->tididx];
+
+               /*
+                * If the offset puts us at the end of the current TID,
+                * advance everything.
+                */
+               if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
+                                        PAGE_SIZE)) {
+                       req->tidoffset = 0;
+                       /*
+                        * Since we don't copy all the TIDs, all at once,
+                        * we have to check again.
+                        */
+                       if (++req->tididx > req->n_tids - 1 ||
+                           !req->tids[req->tididx]) {
+                               return -EINVAL;
+                       }
+                       tidval = req->tids[req->tididx];
+               }
+               /* Large offset units once the TID reaches KDETH_OM_MAX_SIZE. */
+               req->omfactor = ((EXP_TID_GET(tidval, LEN) *
+                                 PAGE_SIZE) >=
+                                KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE :
+                       KDETH_OM_SMALL;
+               /* KDETH.OM and KDETH.OFFSET (TID) */
+               AHG_HEADER_SET(req->ahg, diff, 7, 0, 16,
+                              ((!!(req->omfactor - KDETH_OM_SMALL)) << 15 |
+                               ((req->tidoffset / req->omfactor) & 0x7fff)));
+               /* KDETH.TIDCtrl, KDETH.TID */
+               val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
+                                       (EXP_TID_GET(tidval, IDX) & 0x3ff));
+               /* Clear KDETH.SH on last packet */
+               if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT)) {
+                       val |= cpu_to_le16(KDETH_GET(hdr->kdeth.ver_tid_offset,
+                                                               INTR) >> 16);
+                       val &= cpu_to_le16(~(1U << 13));
+                       /* 14 bits wide so that the cleared bit 13 is written. */
+                       AHG_HEADER_SET(req->ahg, diff, 7, 16, 14, val);
+               } else {
+                       /* Only the 12 TIDCtrl/TID bits are updated. */
+                       AHG_HEADER_SET(req->ahg, diff, 7, 16, 12, val);
+               }
+       }
+
+       trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
+                                       req->info.comp_idx, req->sde->this_idx,
+                                       req->ahg_idx, req->ahg, diff, tidval);
+       return diff;
+}
+
+/*
+ * SDMA tx request completion callback. Called when the SDMA progress
+ * state machine gets notification that the SDMA descriptors for this
+ * tx request have been processed by the DMA engine. Called in
+ * interrupt context.
+ *
+ * Frees the tx, records the completed sequence number in the request
+ * and, once the request is finished, releases it and publishes either a
+ * COMPLETE or an ERROR state in the completion queue entry
+ * req->info.comp_idx.
+ */
+static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
+{
+       struct user_sdma_txreq *tx =
+               container_of(txreq, struct user_sdma_txreq, txreq);
+       struct user_sdma_request *req;
+       struct hfi1_user_sdma_pkt_q *pq;
+       struct hfi1_user_sdma_comp_q *cq;
+       u16 idx;
+
+       if (!tx->req)
+               return;
+
+       req = tx->req;
+       pq = req->pq;
+       cq = req->cq;
+
+       if (status != SDMA_TXREQ_S_OK) {
+               SDMA_DBG(req, "SDMA completion with error %d",
+                        status);
+               /* Makes user_sdma_send_pkts() stop building packets. */
+               set_bit(SDMA_REQ_HAS_ERROR, &req->flags);
+       }
+
+       /* Take what is needed from the tx before it is freed. */
+       req->seqcomp = tx->seqnum;
+       kmem_cache_free(pq->txreq_cache, tx);
+       tx = NULL;
+
+       idx = req->info.comp_idx;
+       if (req->status == -1 && status == SDMA_TXREQ_S_OK) {
+               /* No error so far: done once the last packet has completed. */
+               if (req->seqcomp == req->info.npkts - 1) {
+                       req->status = 0;
+                       user_sdma_free_request(req, false);
+                       pq_update(pq);
+                       set_comp_state(pq, cq, idx, COMPLETE, 0);
+               }
+       } else {
+               if (status != SDMA_TXREQ_S_OK)
+                       req->status = status;
+               /*
+                * Error path: report only after the last submitted packet
+                * has completed and the sender has stopped (send done or
+                * done with error).
+                * NOTE(review): req->status is read below after
+                * user_sdma_free_request() has cleared SDMA_REQ_IN_USE -
+                * confirm the request cannot be reused concurrently.
+                */
+               if (req->seqcomp == (ACCESS_ONCE(req->seqsubmitted) - 1) &&
+                   (test_bit(SDMA_REQ_SEND_DONE, &req->flags) ||
+                    test_bit(SDMA_REQ_DONE_ERROR, &req->flags))) {
+                       user_sdma_free_request(req, false);
+                       pq_update(pq);
+                       set_comp_state(pq, cq, idx, ERROR, req->status);
+               }
+       }
+}
+
+/*
+ * Drop one outstanding request from the packet queue. When the count
+ * reaches zero the queue is marked inactive and waiters on pq->wait are
+ * woken up.
+ */
+static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
+{
+       /* Other requests are still outstanding: nothing more to do. */
+       if (!atomic_dec_and_test(&pq->n_reqs))
+               return;
+
+       xchg(&pq->state, SDMA_PKT_Q_INACTIVE);
+       wake_up(&pq->wait);
+}
+
+/*
+ * Release the resources held by a request: any tx requests still queued
+ * on req->txps, the references on (or, when @unpin is true, the cache
+ * entries of) the nodes backing its data vectors, and the TID array.
+ * Finally the request slot is marked as no longer in use.
+ */
+static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
+{
+       struct sdma_txreq *cur, *next;
+       int i;
+
+       /* Walking an empty list is a no-op, so no emptiness check is needed. */
+       list_for_each_entry_safe(cur, next, &req->txps, list) {
+               struct user_sdma_txreq *tx =
+                       container_of(cur, struct user_sdma_txreq, txreq);
+
+               list_del_init(&cur->list);
+               sdma_txclean(req->pq->dd, cur);
+               kmem_cache_free(req->pq->txreq_cache, tx);
+       }
+
+       for (i = 0; i < req->data_iovs; i++) {
+               struct sdma_mmu_node *node = req->iovs[i].node;
+
+               /* Vectors without a node have nothing to release. */
+               if (!node)
+                       continue;
+
+               if (!unpin) {
+                       atomic_dec(&node->refcount);
+                       continue;
+               }
+               hfi1_mmu_rb_remove(&req->pq->sdma_rb_root, &node->rb);
+       }
+
+       kfree(req->tids);
+       clear_bit(SDMA_REQ_IN_USE, &req->flags);
+}
+
+/*
+ * Publish the completion state of a request in entry @idx of the
+ * completion queue @cq. For the ERROR state the error code is stored as
+ * well, negated (so a negative @ret is stored as a positive value).
+ * @pq is only used for the debug and trace output.
+ */
+static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
+                                 struct hfi1_user_sdma_comp_q *cq,
+                                 u16 idx, enum hfi1_sdma_comp_state state,
+                                 int ret)
+{
+       hfi1_cdbg(SDMA, "[%u:%u:%u:%u] Setting completion status %u %d",
+                 pq->dd->unit, pq->ctxt, pq->subctxt, idx, state, ret);
+       cq->comps[idx].status = state;
+       /* errcode is left untouched for non-error states. */
+       if (state == ERROR)
+               cq->comps[idx].errcode = -ret;
+       trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
+                                       idx, state, ret);
+}
+
+/*
+ * Node filter callback: a node matches only when it starts exactly at
+ * @addr. @len does not take part in the comparison.
+ */
+static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
+                          unsigned long len)
+{
+       const bool same_start = node->addr == addr;
+
+       return same_start;
+}
+
+/*
+ * Insert callback: takes one reference on the SDMA cache node embedding
+ * @mnode. @root is unused. Always returns 0.
+ */
+static int sdma_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode)
+{
+       atomic_inc(&container_of(mnode, struct sdma_mmu_node, rb)->refcount);
+       return 0;
+}
+
+/*
+ * Remove callback: take the node off the eviction list, drop its pages
+ * from the locked-page accounting, unpin them and free the node.
+ * A non-NULL @mm means the call comes from the MMU notifier.
+ */
+static void sdma_rb_remove(struct rb_root *root, struct mmu_rb_node *mnode,
+                          struct mm_struct *mm)
+{
+       struct sdma_mmu_node *node =
+               container_of(mnode, struct sdma_mmu_node, rb);
+       struct hfi1_user_sdma_pkt_q *pq = node->pq;
+
+       spin_lock(&pq->evict_lock);
+       /*
+        * Called by the MMU notifier for a node that is already scheduled
+        * for eviction: the eviction function will free it, so back off.
+        * The lock has to be held for this test because the eviction
+        * function sets the bit while holding it.
+        */
+       if (mm && test_bit(SDMA_CACHE_NODE_EVICT, &node->flags)) {
+               spin_unlock(&pq->evict_lock);
+               return;
+       }
+
+       if (!list_empty(&node->list))
+               list_del(&node->list);
+       pq->n_locked -= node->npages;
+       spin_unlock(&pq->evict_lock);
+
+       if (mm) {
+               /*
+                * The MMU notifier already holds the mmap_sem, so no
+                * mm_struct may be passed down or
+                * hfi1_release_user_pages() would deadlock trying to take
+                * it. The pinned page count is adjusted here instead.
+                */
+               unpin_vector_pages(NULL, node->pages, 0, node->npages);
+               mm->pinned_vm -= node->npages;
+       } else {
+               unpin_vector_pages(current->mm, node->pages, 0,
+                                  node->npages);
+       }
+       kfree(node);
+}
+
+/*
+ * Invalidate callback: returns 1 when the node embedding @mnode has no
+ * references left, 0 while it is still referenced. @root is unused.
+ */
+static int sdma_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode)
+{
+       struct sdma_mmu_node *node =
+               container_of(mnode, struct sdma_mmu_node, rb);
+
+       return atomic_read(&node->refcount) ? 0 : 1;
+}
diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h
new file mode 100644 (file)
index 0000000..b9240e3
--- /dev/null
@@ -0,0 +1,84 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/device.h>
+#include <linux/wait.h>
+
+#include "common.h"
+#include "iowait.h"
+#include "user_exp_rcv.h"
+
+extern uint extended_psn;
+
+/*
+ * Per-queue state for user SDMA packet submission.
+ * NOTE(review): apart from n_locked/evict/evict_lock, the field roles are
+ * not visible from this header; confirm against user_sdma.c before relying
+ * on any of them.
+ */
+struct hfi1_user_sdma_pkt_q {
+       struct list_head list;
+       unsigned ctxt;
+       unsigned subctxt;
+       u16 n_max_reqs;
+       atomic_t n_reqs;
+       u16 reqidx;
+       struct hfi1_devdata *dd;
+       struct kmem_cache *txreq_cache;
+       struct user_sdma_request *reqs;
+       struct iowait busy;
+       unsigned state;
+       wait_queue_head_t wait;
+       unsigned long unpinned;
+       struct rb_root sdma_rb_root;
+       u32 n_locked; /* reduced by a node's npages when the node is removed */
+       struct list_head evict; /* presumably nodes eligible for eviction - confirm */
+       spinlock_t evict_lock; /* protect evict and n_locked */
+};
+
+/* User SDMA completion queue: an array of completion entries and its size. */
+struct hfi1_user_sdma_comp_q {
+       u16 nentries; /* presumably the number of elements in comps - confirm */
+       struct hfi1_sdma_comp_entry *comps;
+};
+
+/*
+ * User SDMA entry points; the definitions are not in this header.
+ * NOTE(review): the int results are presumably 0 or a negative errno, and
+ * the two unsigned long arguments of the request call presumably carry the
+ * iovec count in and a processed count out - confirm against user_sdma.c.
+ */
+int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *, struct file *);
+int hfi1_user_sdma_free_queues(struct hfi1_filedata *);
+int hfi1_user_sdma_process_request(struct file *, struct iovec *, unsigned long,
+                                  unsigned long *);
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
new file mode 100644 (file)
index 0000000..849c4b9
--- /dev/null
@@ -0,0 +1,1764 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_user_verbs.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/utsname.h>
+#include <linux/rculist.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+
+#include "hfi.h"
+#include "common.h"
+#include "device.h"
+#include "trace.h"
+#include "qp.h"
+#include "verbs_txreq.h"
+
+/*
+ * Module parameters.  Every one is registered with S_IRUGO, i.e. it is
+ * readable but not writable through sysfs, so a value can only be chosen
+ * when the module is loaded.  The variables that are not static have
+ * external linkage and can be referenced from other files of the driver.
+ */
+static unsigned int hfi1_lkey_table_size = 16;
+module_param_named(lkey_table_size, hfi1_lkey_table_size, uint,
+                  S_IRUGO);
+MODULE_PARM_DESC(lkey_table_size,
+                "LKEY table size in bits (2^n, 1 <= n <= 23)");
+
+static unsigned int hfi1_max_pds = 0xFFFF;
+module_param_named(max_pds, hfi1_max_pds, uint, S_IRUGO);
+MODULE_PARM_DESC(max_pds,
+                "Maximum number of protection domains to support");
+
+static unsigned int hfi1_max_ahs = 0xFFFF;
+module_param_named(max_ahs, hfi1_max_ahs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support");
+
+unsigned int hfi1_max_cqes = 0x2FFFF;
+module_param_named(max_cqes, hfi1_max_cqes, uint, S_IRUGO);
+MODULE_PARM_DESC(max_cqes,
+                "Maximum number of completion queue entries to support");
+
+unsigned int hfi1_max_cqs = 0x1FFFF;
+module_param_named(max_cqs, hfi1_max_cqs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support");
+
+unsigned int hfi1_max_qp_wrs = 0x3FFF;
+module_param_named(max_qp_wrs, hfi1_max_qp_wrs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support");
+
+unsigned int hfi1_max_qps = 16384;
+module_param_named(max_qps, hfi1_max_qps, uint, S_IRUGO);
+MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support");
+
+unsigned int hfi1_max_sges = 0x60;
+module_param_named(max_sges, hfi1_max_sges, uint, S_IRUGO);
+MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support");
+
+unsigned int hfi1_max_mcast_grps = 16384;
+module_param_named(max_mcast_grps, hfi1_max_mcast_grps, uint, S_IRUGO);
+MODULE_PARM_DESC(max_mcast_grps,
+                "Maximum number of multicast groups to support");
+
+unsigned int hfi1_max_mcast_qp_attached = 16;
+module_param_named(max_mcast_qp_attached, hfi1_max_mcast_qp_attached,
+                  uint, S_IRUGO);
+MODULE_PARM_DESC(max_mcast_qp_attached,
+                "Maximum number of attached QPs to support");
+
+unsigned int hfi1_max_srqs = 1024;
+module_param_named(max_srqs, hfi1_max_srqs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support");
+
+unsigned int hfi1_max_srq_sges = 128;
+module_param_named(max_srq_sges, hfi1_max_srq_sges, uint, S_IRUGO);
+MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support");
+
+unsigned int hfi1_max_srq_wrs = 0x1FFFF;
+module_param_named(max_srq_wrs, hfi1_max_srq_wrs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support");
+
+/* NOTE(review): unit is presumably bytes of packet size - confirm at the use site */
+unsigned short piothreshold = 256;
+module_param(piothreshold, ushort, S_IRUGO);
+MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio");
+
+/*
+ * Values of sge_copy_mode tested by hfi1_copy_sge().  The variable has no
+ * initializer, so the default is 0, which selects the plain memcpy() path.
+ */
+#define COPY_CACHELESS 1
+#define COPY_ADAPTIVE  2
+static unsigned int sge_copy_mode;
+module_param(sge_copy_mode, uint, S_IRUGO);
+MODULE_PARM_DESC(sge_copy_mode,
+                "Verbs copy mode: 0 use memcpy, 1 use cacheless copy, 2 adapt based on WSS");
+
+/* forward declarations; verbs_sdma_complete() is defined further down */
+static void verbs_sdma_complete(
+       struct sdma_txreq *cookie,
+       int status);
+
+static int pio_wait(struct rvt_qp *qp,
+                   struct send_context *sc,
+                   struct hfi1_pkt_state *ps,
+                   u32 flag);
+
+/* Length of buffer to create verbs txreq cache name */
+#define TXREQ_NAME_LEN 24
+
+/*
+ * Working set size (WSS) tunables.  hfi1_wss_init() replaces a
+ * wss_threshold outside 1..100 (including the default of 0) with 80, a
+ * wss_clean_period above 1000000 with 256, and a period of 0 with 1.
+ */
+static uint wss_threshold;
+module_param(wss_threshold, uint, S_IRUGO);
+MODULE_PARM_DESC(wss_threshold, "Percentage (1-100) of LLC to use as a threshold for a cacheless copy");
+static uint wss_clean_period = 256;
+module_param(wss_clean_period, uint, S_IRUGO);
+MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the page copy table is cleaned");
+
+/*
+ * memory working set size
+ *
+ * A bitmap with one bit per page slot plus the counters used to age it.
+ * wss_insert() sets bits, wss_advance_clean_counter() periodically clears
+ * one whole word, and wss_exceeds_threshold() compares the number of set
+ * bits with the threshold.
+ */
+struct hfi1_wss {
+       unsigned long *entries; /* the bitmap, num_entries words long */
+       atomic_t total_count; /* number of bits currently set in entries */
+       atomic_t clean_counter; /* counts down from wss_clean_period to 0 */
+       atomic_t clean_entry; /* ever-increasing; low bits pick the next word to clear */
+
+       int threshold; /* set-bit count at which the working set is "too big" */
+       int num_entries; /* number of words in entries */
+       long pages_mask; /* masks a page frame number down to a bitmap slot */
+};
+
+/* the single, file-wide working set tracker */
+static struct hfi1_wss wss;
+
+/**
+ * hfi1_wss_init - set up the working set size (WSS) page bitmap
+ *
+ * Sanitizes the wss_threshold and wss_clean_period module parameters,
+ * derives the bitmap geometry from the LLC size reported by
+ * wss_llc_size() and allocates the bitmap.
+ *
+ * The table is never made smaller than one bitmap word: a smaller table
+ * would leave wss.num_entries at 0, and the cleaner's
+ * (wss.num_entries - 1) index mask would then be -1.
+ *
+ * Return: 0 on success, -ENOMEM if the bitmap cannot be allocated.
+ */
+int hfi1_wss_init(void)
+{
+       /* smallest usable table: one bitmap word, one bit per page */
+       const long min_table_size = BITS_PER_LONG * PAGE_SIZE;
+       long llc_size;
+       long llc_bits;
+       long table_size;
+       long table_bits;
+
+       /* check for a valid percent range - default to 80 if none or invalid */
+       if (wss_threshold < 1 || wss_threshold > 100)
+               wss_threshold = 80;
+       /* reject a wildly large period */
+       if (wss_clean_period > 1000000)
+               wss_clean_period = 256;
+       /* reject a zero period */
+       if (wss_clean_period == 0)
+               wss_clean_period = 1;
+
+       /*
+        * Calculate the table size - the next power of 2 larger than the
+        * LLC size.  LLC size is in KiB.
+        *
+        * roundup_pow_of_two() is undefined for 0, and any result below
+        * min_table_size would yield a zero-entry table, so a missing or
+        * tiny LLC size falls back to the one-word minimum.  Sizes above
+        * the minimum are rounded exactly as before.
+        */
+       llc_size = wss_llc_size() * 1024;
+       if (llc_size > min_table_size)
+               table_size = roundup_pow_of_two(llc_size);
+       else
+               table_size = min_table_size;
+
+       /* one bit per page in rounded up table */
+       llc_bits = llc_size / PAGE_SIZE;
+       table_bits = table_size / PAGE_SIZE;
+       wss.pages_mask = table_bits - 1;
+       wss.num_entries = table_bits / BITS_PER_LONG;
+
+       /* never let the threshold drop below a single page */
+       wss.threshold = (llc_bits * wss_threshold) / 100;
+       if (wss.threshold < 1)
+               wss.threshold = 1;
+
+       atomic_set(&wss.clean_counter, wss_clean_period);
+
+       wss.entries = kcalloc(wss.num_entries, sizeof(*wss.entries),
+                             GFP_KERNEL);
+       if (!wss.entries) {
+               hfi1_wss_exit();
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+void hfi1_wss_exit(void)
+{
+       unsigned long *table = wss.entries;
+
+       /*
+        * Safe after a failed or partial init and when called more than
+        * once: the pointer is cleared, and kfree(NULL) is a no-op.
+        */
+       wss.entries = NULL;
+       kfree(table);
+}
+
+/*
+ * Advance the clean counter.  When the clean period has expired,
+ * clean an entry.
+ *
+ * This is implemented in atomics to avoid locking.  Because multiple
+ * variables are involved, it can be racy which can lead to slightly
+ * inaccurate information.  Since this is only a heuristic, this is
+ * OK.  Any inaccuracies will clean themselves out as the counter
+ * advances.  That said, it is unlikely the entry clean operation will
+ * race - the next possible racer will not start until the next clean
+ * period.
+ *
+ * The clean counter is implemented as a decrement to zero.  When zero
+ * is reached an entry is cleaned.
+ */
+static void wss_advance_clean_counter(void)
+{
+       unsigned long old_bits;
+       int slot;
+       int nset;
+
+       /* only the caller that takes the counter to zero does the cleaning */
+       if (!atomic_dec_and_test(&wss.clean_counter))
+               return;
+
+       /*
+        * Re-arm with a plain set rather than an add.  An add could leave
+        * the counter below the clean period; a set may instead lose
+        * concurrent decrements and so delay the next clean slightly,
+        * which is acceptable for a heuristic.
+        *
+        * Looping and adding a period until the counter is positive
+        * would be the alternative, but other threads could then keep
+        * this one spinning unless the loop count were bounded.
+        */
+       atomic_set(&wss.clean_counter, wss_clean_period);
+
+       /*
+        * Claim a unique word to clean and advance to the next one.
+        * The word index is the low bits of wss.clean_entry; this relies
+        * on wss.num_entries being a power of two.
+        */
+       slot = atomic_inc_return(&wss.clean_entry) - 1;
+       slot &= wss.num_entries - 1;
+
+       /* zero the word and find out how many bits it held */
+       old_bits = xchg(&wss.entries[slot], 0);
+       nset = hweight64((u64)old_bits);
+
+       /* skip the contended total count when nothing was cleared */
+       if (nset)
+               atomic_sub(nset, &wss.total_count);
+}
+
+/*
+ * Mark the page that holds the given address in the working set bitmap,
+ * bumping the total when the bit was not already set, then advance the
+ * clean counter.
+ */
+static void wss_insert(void *address)
+{
+       u32 slot = ((unsigned long)address >> PAGE_SHIFT) & wss.pages_mask;
+       /* split the slot into a word index and a bit within that word */
+       u32 word = slot / BITS_PER_LONG;
+       u32 bit = slot % BITS_PER_LONG;
+
+       if (test_and_set_bit(bit, &wss.entries[word]) == 0)
+               atomic_inc(&wss.total_count);
+
+       wss_advance_clean_counter();
+}
+
+/*
+ * Returns nonzero once the number of pages recorded in the working set
+ * has reached wss.threshold.
+ */
+static inline int wss_exceeds_threshold(void)
+{
+       int in_use = atomic_read(&wss.total_count);
+
+       return in_use >= wss.threshold;
+}
+
+/*
+ * Translate ib_wr_opcode into ib_wc_opcode.
+ *
+ * Indexed by the work request opcode.  The array is only as long as the
+ * largest index listed, and any index in between that is not listed is
+ * zero-initialized.  The *_WITH_IMM requests map to the same completion
+ * opcode as their plain variants.
+ */
+const enum ib_wc_opcode ib_hfi1_wc_opcode[] = {
+       [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
+       [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
+       [IB_WR_SEND] = IB_WC_SEND,
+       [IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
+       [IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
+       [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
+       [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD
+};
+
+/*
+ * Length of header by opcode, 0 --> not supported
+ *
+ * One entry per possible 8-bit opcode; entries that are not listed are
+ * zero-initialized and therefore read as "not supported".
+ * NOTE(review): the common 12 + 8 is presumably the 12-byte BTH plus the
+ * 8-byte LRH, and the third term the opcode-specific extended headers -
+ * confirm against the IB specification.
+ */
+const u8 hdr_len_by_opcode[256] = {
+       /* RC */
+       [IB_OPCODE_RC_SEND_FIRST]                     = 12 + 8,
+       [IB_OPCODE_RC_SEND_MIDDLE]                    = 12 + 8,
+       [IB_OPCODE_RC_SEND_LAST]                      = 12 + 8,
+       [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE]       = 12 + 8 + 4,
+       [IB_OPCODE_RC_SEND_ONLY]                      = 12 + 8,
+       [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 4,
+       [IB_OPCODE_RC_RDMA_WRITE_FIRST]               = 12 + 8 + 16,
+       [IB_OPCODE_RC_RDMA_WRITE_MIDDLE]              = 12 + 8,
+       [IB_OPCODE_RC_RDMA_WRITE_LAST]                = 12 + 8,
+       [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
+       [IB_OPCODE_RC_RDMA_WRITE_ONLY]                = 12 + 8 + 16,
+       [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
+       [IB_OPCODE_RC_RDMA_READ_REQUEST]              = 12 + 8 + 16,
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST]       = 12 + 8 + 4,
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE]      = 12 + 8,
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST]        = 12 + 8 + 4,
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY]        = 12 + 8 + 4,
+       [IB_OPCODE_RC_ACKNOWLEDGE]                    = 12 + 8 + 4,
+       [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE]             = 12 + 8 + 4,
+       [IB_OPCODE_RC_COMPARE_SWAP]                   = 12 + 8 + 28,
+       [IB_OPCODE_RC_FETCH_ADD]                      = 12 + 8 + 28,
+       [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE]      = 12 + 8 + 4,
+       [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE]      = 12 + 8 + 4,
+       /* UC */
+       [IB_OPCODE_UC_SEND_FIRST]                     = 12 + 8,
+       [IB_OPCODE_UC_SEND_MIDDLE]                    = 12 + 8,
+       [IB_OPCODE_UC_SEND_LAST]                      = 12 + 8,
+       [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE]       = 12 + 8 + 4,
+       [IB_OPCODE_UC_SEND_ONLY]                      = 12 + 8,
+       [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 4,
+       [IB_OPCODE_UC_RDMA_WRITE_FIRST]               = 12 + 8 + 16,
+       [IB_OPCODE_UC_RDMA_WRITE_MIDDLE]              = 12 + 8,
+       [IB_OPCODE_UC_RDMA_WRITE_LAST]                = 12 + 8,
+       [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
+       [IB_OPCODE_UC_RDMA_WRITE_ONLY]                = 12 + 8 + 16,
+       [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
+       /* UD */
+       [IB_OPCODE_UD_SEND_ONLY]                      = 12 + 8 + 8,
+       [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 12
+};
+
+/*
+ * Receive handler by opcode.
+ *
+ * One slot per possible 8-bit opcode.  Slots that are not listed are
+ * zero-initialized, i.e. NULL; that includes the RC
+ * SEND_*_WITH_INVALIDATE opcodes, which have a header length in
+ * hdr_len_by_opcode[] but no handler here.
+ */
+static const opcode_handler opcode_handler_tbl[256] = {
+       /* RC */
+       [IB_OPCODE_RC_SEND_FIRST]                     = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_SEND_MIDDLE]                    = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_SEND_LAST]                      = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE]       = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_SEND_ONLY]                      = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_WRITE_FIRST]               = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_WRITE_MIDDLE]              = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_WRITE_LAST]                = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_WRITE_ONLY]                = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_READ_REQUEST]              = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST]       = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE]      = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST]        = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY]        = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_ACKNOWLEDGE]                    = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE]             = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_COMPARE_SWAP]                   = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_FETCH_ADD]                      = &hfi1_rc_rcv,
+       /* UC */
+       [IB_OPCODE_UC_SEND_FIRST]                     = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_SEND_MIDDLE]                    = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_SEND_LAST]                      = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE]       = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_SEND_ONLY]                      = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_RDMA_WRITE_FIRST]               = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_RDMA_WRITE_MIDDLE]              = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_RDMA_WRITE_LAST]                = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_RDMA_WRITE_ONLY]                = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv,
+       /* UD */
+       [IB_OPCODE_UD_SEND_ONLY]                      = &hfi1_ud_rcv,
+       [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_ud_rcv,
+       /* CNP */
+       [IB_OPCODE_CNP]                               = &hfi1_cnp_rcv
+};
+
+/*
+ * System image GUID.
+ */
+__be64 ib_hfi1_sys_image_guid;
+
+/**
+ * hfi1_copy_sge - copy data to SGE memory
+ * @ss: the SGE state
+ * @data: the data to copy
+ * @length: the length of the data
+ * @release: if nonzero, rvt_put_mr() is called on an SGE's MR once that
+ *           SGE has been completely consumed
+ * @copy_last: do a separate copy of the last 8 bytes
+ *
+ * The copy primitive depends on the sge_copy_mode module parameter:
+ * plain memcpy(), cacheless_memcpy() for lengths of at least a page, or
+ * a choice between the two driven by the working set tracker.  When
+ * @copy_last is set, the final (up to) 8 bytes are written one byte at
+ * a time after everything else.
+ */
+void hfi1_copy_sge(
+       struct rvt_sge_state *ss,
+       void *data, u32 length,
+       int release,
+       int copy_last)
+{
+       struct rvt_sge *sge = &ss->sge;
+       int in_last = 0;
+       int i;
+       int cacheless_copy = 0;
+
+       if (sge_copy_mode == COPY_CACHELESS) {
+               cacheless_copy = length >= PAGE_SIZE;
+       } else if (sge_copy_mode == COPY_ADAPTIVE) {
+               if (length >= PAGE_SIZE) {
+                       /*
+                        * NOTE: this *assumes*:
+                        * o The first vaddr is the dest.
+                        * o If multiple pages, then vaddr is sequential.
+                        */
+                       wss_insert(sge->vaddr);
+                       if (length >= (2 * PAGE_SIZE))
+                               wss_insert(sge->vaddr + PAGE_SIZE);
+
+                       cacheless_copy = wss_exceeds_threshold();
+               } else {
+                       /* small copies still age the working set */
+                       wss_advance_clean_counter();
+               }
+       }
+       if (copy_last) {
+               if (length > 8) {
+                       /* hold back the tail for the second pass below */
+                       length -= 8;
+               } else {
+                       /* everything is "tail": single byte-wise pass */
+                       copy_last = 0;
+                       in_last = 1;
+               }
+       }
+
+again:
+       while (length) {
+               u32 len = sge->length;
+
+               /* limit to what is left of the request, segment and SGE */
+               if (len > length)
+                       len = length;
+               if (len > sge->sge_length)
+                       len = sge->sge_length;
+               WARN_ON_ONCE(len == 0);
+               if (unlikely(in_last)) {
+                       /* enforce byte transfer ordering */
+                       for (i = 0; i < len; i++)
+                               ((u8 *)sge->vaddr)[i] = ((u8 *)data)[i];
+               } else if (cacheless_copy) {
+                       cacheless_memcpy(sge->vaddr, data, len);
+               } else {
+                       memcpy(sge->vaddr, data, len);
+               }
+               sge->vaddr += len;
+               sge->length -= len;
+               sge->sge_length -= len;
+               if (sge->sge_length == 0) {
+                       /* this SGE is done; load the next one if any remain */
+                       if (release)
+                               rvt_put_mr(sge->mr);
+                       if (--ss->num_sge)
+                               *sge = *ss->sg_list++;
+               } else if (sge->length == 0 && sge->mr->lkey) {
+                       /* segment exhausted: step to the next segment of the MR */
+                       if (++sge->n >= RVT_SEGSZ) {
+                               if (++sge->m >= sge->mr->mapsz)
+                                       break;
+                               sge->n = 0;
+                       }
+                       sge->vaddr =
+                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
+                       sge->length =
+                               sge->mr->map[sge->m]->segs[sge->n].length;
+               }
+               data += len;
+               length -= len;
+       }
+
+       if (copy_last) {
+               /* second pass: the 8 bytes held back above, byte by byte */
+               copy_last = 0;
+               in_last = 1;
+               length = 8;
+               goto again;
+       }
+}
+
+/**
+ * hfi1_skip_sge - skip over SGE memory
+ * @ss: the SGE state
+ * @length: the number of bytes to skip
+ * @release: if nonzero, rvt_put_mr() is called on an SGE's MR once that
+ *           SGE has been completely consumed
+ *
+ * Advances the SGE cursor exactly as hfi1_copy_sge() does, but without
+ * touching the memory.
+ */
+void hfi1_skip_sge(struct rvt_sge_state *ss, u32 length, int release)
+{
+       struct rvt_sge *sge = &ss->sge;
+
+       while (length) {
+               u32 len = sge->length;
+
+               /* limit to what is left of the request, segment and SGE */
+               if (len > length)
+                       len = length;
+               if (len > sge->sge_length)
+                       len = sge->sge_length;
+               WARN_ON_ONCE(len == 0);
+               sge->vaddr += len;
+               sge->length -= len;
+               sge->sge_length -= len;
+               if (sge->sge_length == 0) {
+                       /* this SGE is done; load the next one if any remain */
+                       if (release)
+                               rvt_put_mr(sge->mr);
+                       if (--ss->num_sge)
+                               *sge = *ss->sg_list++;
+               } else if (sge->length == 0 && sge->mr->lkey) {
+                       /* segment exhausted: step to the next segment of the MR */
+                       if (++sge->n >= RVT_SEGSZ) {
+                               if (++sge->m >= sge->mr->mapsz)
+                                       break;
+                               sge->n = 0;
+                       }
+                       sge->vaddr =
+                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
+                       sge->length =
+                               sge->mr->map[sge->m]->segs[sge->n].length;
+               }
+               length -= len;
+       }
+}
+
+/*
+ * Decide whether packet->qp may receive the given opcode.
+ *
+ * Returns 1 when the QP state permits receive processing and the opcode
+ * is either a CNP or belongs to the QP's allowed opcode class.  Otherwise
+ * the port's n_pkt_drops counter is incremented and 0 is returned.
+ */
+static inline int qp_ok(int opcode, struct hfi1_packet *packet)
+{
+       if ((ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK) &&
+           (opcode == IB_OPCODE_CNP ||
+            (opcode & RVT_OPCODE_QP_MASK) == packet->qp->allowed_ops))
+               return 1;
+
+       packet->rcd->ppd->ibport_data.rvp.n_pkt_drops++;
+       return 0;
+}
+
+/**
+ * hfi1_ib_rcv - process an incoming packet
+ * @packet: data packet information
+ *
+ * This is called to process an incoming packet at interrupt level.
+ *
+ * Tlen is the length of the header + data + CRC in bytes.
+ *
+ * The packet is dropped, and the port's n_pkt_drops incremented, when
+ * the LRH next-header value or the GRH is not acceptable, when the BTH
+ * opcode has no entry in opcode_handler_tbl[], when a multicast packet
+ * carries no GRH or matches no group, or when no QP exists for the
+ * destination QPN.  qp_ok() counts its own drops.
+ */
+void hfi1_ib_rcv(struct hfi1_packet *packet)
+{
+       struct hfi1_ctxtdata *rcd = packet->rcd;
+       struct hfi1_ib_header *hdr = packet->hdr;
+       u32 tlen = packet->tlen;
+       struct hfi1_pportdata *ppd = rcd->ppd;
+       struct hfi1_ibport *ibp = &ppd->ibport_data;
+       struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
+       opcode_handler handler;
+       unsigned long flags;
+       u32 qp_num;
+       int lnh;
+       u8 opcode;
+       u16 lid;
+
+       /* Check for GRH */
+       lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+       if (lnh == HFI1_LRH_BTH) {
+               packet->ohdr = &hdr->u.oth;
+       } else if (lnh == HFI1_LRH_GRH) {
+               u32 vtf;
+
+               packet->ohdr = &hdr->u.l.oth;
+               if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR)
+                       goto drop;
+               vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow);
+               if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
+                       goto drop;
+               packet->rcv_flags |= HFI1_HAS_GRH;
+       } else {
+               goto drop;
+       }
+
+       trace_input_ibhdr(rcd->dd, hdr);
+
+       opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24);
+       inc_opstats(tlen, &rcd->opstats->stats[opcode]);
+
+       /*
+        * The opcode comes straight from the wire and opcode_handler_tbl[]
+        * is sparse: reserved and unsupported opcodes have a NULL slot,
+        * and qp_ok() only checks the QP-type bits of the opcode.  Drop
+        * such packets here instead of calling through a NULL pointer.
+        */
+       handler = opcode_handler_tbl[opcode];
+       if (unlikely(!handler))
+               goto drop;
+
+       /* Get the destination QP number. */
+       qp_num = be32_to_cpu(packet->ohdr->bth[1]) & RVT_QPN_MASK;
+       lid = be16_to_cpu(hdr->lrh[1]);
+       if (unlikely((lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) &&
+                    (lid != be16_to_cpu(IB_LID_PERMISSIVE)))) {
+               struct rvt_mcast *mcast;
+               struct rvt_mcast_qp *p;
+
+               /* multicast: deliver to every QP attached to the group */
+               if (lnh != HFI1_LRH_GRH)
+                       goto drop;
+               mcast = rvt_mcast_find(&ibp->rvp, &hdr->u.l.grh.dgid);
+               if (!mcast)
+                       goto drop;
+               list_for_each_entry_rcu(p, &mcast->qp_list, list) {
+                       packet->qp = p->qp;
+                       spin_lock_irqsave(&packet->qp->r_lock, flags);
+                       if (likely((qp_ok(opcode, packet))))
+                               handler(packet);
+                       spin_unlock_irqrestore(&packet->qp->r_lock, flags);
+               }
+               /*
+                * Notify rvt_multicast_detach() if it is waiting for us
+                * to finish.
+                */
+               if (atomic_dec_return(&mcast->refcount) <= 1)
+                       wake_up(&mcast->wait);
+       } else {
+               /* unicast: the QP is only valid inside the RCU read section */
+               rcu_read_lock();
+               packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
+               if (!packet->qp) {
+                       rcu_read_unlock();
+                       goto drop;
+               }
+               spin_lock_irqsave(&packet->qp->r_lock, flags);
+               if (likely((qp_ok(opcode, packet))))
+                       handler(packet);
+               spin_unlock_irqrestore(&packet->qp->r_lock, flags);
+               rcu_read_unlock();
+       }
+       return;
+
+drop:
+       ibp->rvp.n_pkt_drops++;
+}
+
+/*
+ * This is called from a timer to check for QPs
+ * which need kernel memory in order to send a packet.
+ *
+ * At most one QP is taken off dev->memwait per invocation.  If more
+ * waiters remain the timer re-arms itself for the next jiffy.  The
+ * wakeup is issued only after iowait_lock has been released.
+ * data is the struct hfi1_ibdev pointer cast to unsigned long.
+ */
+static void mem_timer(unsigned long data)
+{
+       struct hfi1_ibdev *dev = (struct hfi1_ibdev *)data;
+       struct list_head *list = &dev->memwait;
+       struct rvt_qp *qp = NULL;
+       struct iowait *wait;
+       unsigned long flags;
+       struct hfi1_qp_priv *priv;
+
+       write_seqlock_irqsave(&dev->iowait_lock, flags);
+       if (!list_empty(list)) {
+               /* dequeue the oldest waiter */
+               wait = list_first_entry(list, struct iowait, list);
+               qp = iowait_to_qp(wait);
+               priv = qp->priv;
+               list_del_init(&priv->s_iowait.list);
+               /* refcount held until actual wake up */
+               if (!list_empty(list))
+                       mod_timer(&dev->mem_timer, jiffies + 1);
+       }
+       write_sequnlock_irqrestore(&dev->iowait_lock, flags);
+
+       if (qp)
+               hfi1_qp_wakeup(qp, RVT_S_WAIT_KMEM);
+}
+
+/*
+ * Advance the SGE cursor in @ss by @length bytes.
+ *
+ * When the current SGE is used up, the next one from ss->sg_list is
+ * loaded if any remain.  When only the current segment is used up and
+ * the MR has an lkey, the cursor moves to the MR's next segment.
+ */
+void update_sge(struct rvt_sge_state *ss, u32 length)
+{
+       struct rvt_sge *cur = &ss->sge;
+
+       cur->vaddr += length;
+       cur->length -= length;
+       cur->sge_length -= length;
+
+       if (!cur->sge_length) {
+               /* whole SGE consumed */
+               ss->num_sge--;
+               if (ss->num_sge) {
+                       *cur = *ss->sg_list;
+                       ss->sg_list++;
+               }
+               return;
+       }
+
+       /* nothing more to do unless a keyed MR ran out of segment */
+       if (cur->length || !cur->mr->lkey)
+               return;
+
+       cur->n++;
+       if (cur->n >= RVT_SEGSZ) {
+               cur->m++;
+               if (cur->m >= cur->mr->mapsz)
+                       return;
+               cur->n = 0;
+       }
+       cur->vaddr = cur->mr->map[cur->m]->segs[cur->n].vaddr;
+       cur->length = cur->mr->map[cur->m]->segs[cur->n].length;
+}
+
+/*
+ * SDMA completion callback for a verbs tx request (new API).
+ *
+ * This is called with progress side lock held.
+ *
+ * Under the QP's s_lock, completes the attached WQE with IB_WC_SUCCESS
+ * or, when there is no WQE and the QP is RC, runs the RC send-complete
+ * processing on the request's header.  The tx request is released
+ * afterwards.  status is not examined.
+ */
+static void verbs_sdma_complete(
+       struct sdma_txreq *cookie,
+       int status)
+{
+       struct verbs_txreq *vtx;
+       struct rvt_qp *owner;
+
+       vtx = container_of(cookie, struct verbs_txreq, txreq);
+       owner = vtx->qp;
+
+       spin_lock(&owner->s_lock);
+       if (vtx->wqe)
+               hfi1_send_complete(owner, vtx->wqe, IB_WC_SUCCESS);
+       else if (owner->ibqp.qp_type == IB_QPT_RC)
+               hfi1_rc_send_complete(owner, &vtx->phdr.hdr);
+       spin_unlock(&owner->s_lock);
+
+       hfi1_put_txreq(vtx);
+}
+
+/*
+ * Park a QP until the memory timer wakes it.
+ *
+ * If the QP state has RVT_PROCESS_RECV_OK set, the pending tx request is
+ * queued on the QP's iowait tx_head.  If the QP is not already waiting,
+ * it is added to dev->memwait with RVT_S_WAIT_KMEM set and a reference
+ * taken, and the memory timer is started when the list was empty.
+ * RVT_S_BUSY is then cleared.
+ *
+ * Returns -EBUSY when the request was queued, 0 when the QP state did
+ * not allow it (nothing is queued in that case).
+ */
+static int wait_kmem(struct hfi1_ibdev *dev,
+                    struct rvt_qp *qp,
+                    struct hfi1_pkt_state *ps)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+       unsigned long flags;
+       int ret = 0;
+
+       /* lock order: qp->s_lock first, then dev->iowait_lock */
+       spin_lock_irqsave(&qp->s_lock, flags);
+       if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
+               write_seqlock(&dev->iowait_lock);
+               list_add_tail(&ps->s_txreq->txreq.list,
+                             &priv->s_iowait.tx_head);
+               if (list_empty(&priv->s_iowait.list)) {
+                       /* first waiter arms the timer */
+                       if (list_empty(&dev->memwait))
+                               mod_timer(&dev->mem_timer, jiffies + 1);
+                       qp->s_flags |= RVT_S_WAIT_KMEM;
+                       list_add_tail(&priv->s_iowait.list, &dev->memwait);
+                       trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM);
+                       atomic_inc(&qp->refcount);
+               }
+               write_sequnlock(&dev->iowait_lock);
+               qp->s_flags &= ~RVT_S_BUSY;
+               ret = -EBUSY;
+       }
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+
+       return ret;
+}
+
+/*
+ * Add one SDMA descriptor per chunk of the sg list until @length bytes
+ * have been described.
+ *
+ * Each chunk is limited by the current segment length, the remaining
+ * request length and the remaining length of the current sg entry.
+ *
+ * If adding a descriptor fails, the sge cursor in @ss is put back to the
+ * state it had on entry and the error is returned.
+ */
+static noinline int build_verbs_ulp_payload(
+       struct sdma_engine *sde,
+       struct rvt_sge_state *ss,
+       u32 length,
+       struct verbs_txreq *tx)
+{
+       /* snapshot of the cursor, restored if a descriptor add fails */
+       struct rvt_sge *saved_list = ss->sg_list;
+       struct rvt_sge saved_sge = ss->sge;
+       u8 saved_num = ss->num_sge;
+       int rc = 0;
+
+       while (length) {
+               u32 len = ss->sge.length;
+
+               if (len > length)
+                       len = length;
+               if (len > ss->sge.sge_length)
+                       len = ss->sge.sge_length;
+               WARN_ON_ONCE(len == 0);
+
+               rc = sdma_txadd_kvaddr(sde->dd, &tx->txreq,
+                                      ss->sge.vaddr, len);
+               if (rc) {
+                       /* unwind cursor */
+                       ss->sge = saved_sge;
+                       ss->num_sge = saved_num;
+                       ss->sg_list = saved_list;
+                       return rc;
+               }
+
+               update_sge(ss, len);
+               length -= len;
+       }
+
+       return rc;
+}
+
+/*
+ * Build the number of DMA descriptors needed to send length bytes of data.
+ *
+ * NOTE: DMA mapping is held in the tx until completed in the ring or
+ *       the tx desc is freed without having been submitted to the ring
+ *
+ * When ahdr->ahgcount is zero the txreq is initialised for header plus
+ * payload and the PBC/header is added as the first descriptor; otherwise
+ * the txreq is initialised with the AHG descriptors and only the payload
+ * is added.
+ *
+ * Returns 0 when every helper call succeeded, otherwise the first error.
+ */
+/* New API */
+static int build_verbs_tx_desc(
+       struct sdma_engine *sde,
+       struct rvt_sge_state *ss,
+       u32 length,
+       struct verbs_txreq *tx,
+       struct ahg_ib_header *ahdr,
+       u64 pbc)
+{
+       struct hfi1_pio_header *pio_hdr = &tx->phdr;
+       u16 hdr_len = tx->hdr_dwords << 2;
+       int rc;
+
+       if (ahdr->ahgcount) {
+               rc = sdma_txinit_ahg(&tx->txreq, ahdr->tx_flags, length,
+                                    ahdr->ahgidx, ahdr->ahgcount,
+                                    ahdr->ahgdesc, hdr_len,
+                                    verbs_sdma_complete);
+               if (rc)
+                       return rc;
+       } else {
+               rc = sdma_txinit_ahg(&tx->txreq, ahdr->tx_flags,
+                                    hdr_len + length, ahdr->ahgidx,
+                                    0, NULL, 0, verbs_sdma_complete);
+               if (rc)
+                       return rc;
+
+               /* the header, led by the PBC, goes out as first descriptor */
+               pio_hdr->pbc = cpu_to_le64(pbc);
+               rc = sdma_txadd_kvaddr(sde->dd, &tx->txreq, pio_hdr, hdr_len);
+               if (rc)
+                       return rc;
+       }
+
+       /* add the ulp payload - if any.  ss can be NULL for acks */
+       if (!ss)
+               return rc;
+
+       return build_verbs_ulp_payload(sde, ss, length, tx);
+}
+
+/*
+ * hfi1_verbs_send_dma - send the packet in @ps through the SDMA engine
+ * @qp: the QP to send on
+ * @ps: the state of the packet to send; ps->s_txreq is the txreq used
+ * @pbc: the PBC to use; when 0 one is created here
+ *
+ * The descriptors are built only if the txreq has not been built yet.
+ * A build failure parks the QP via wait_kmem(); if the QP could not be
+ * parked the txreq is released and ps->s_txreq is cleared.
+ *
+ * Return: the result of sdma_send_txreq(), except that -ECOMM is reported
+ * as 0; on a build failure, the result of wait_kmem().
+ */
+int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+                       u64 pbc)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+       struct ahg_ib_header *ahdr = priv->s_hdr;
+       u32 hdrwords = qp->s_hdrwords;
+       struct rvt_sge_state *ss = qp->s_cur_sge;
+       u32 len = qp->s_cur_size;
+       u32 plen = hdrwords + ((len + 3) >> 2) + 2; /* includes pbc */
+       struct hfi1_ibdev *dev = ps->dev;
+       struct hfi1_pportdata *ppd = ps->ppd;
+       struct verbs_txreq *tx;
+       u64 pbc_flags = 0;
+       u8 sc5 = priv->s_sc;
+
+       int ret;
+
+       tx = ps->s_txreq;
+       /* a txreq that was already built is submitted as is */
+       if (!sdma_txreq_built(&tx->txreq)) {
+               if (likely(pbc == 0)) {
+                       u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);
+                       /* No vl15 here */
+                       /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
+                       pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
+
+                       pbc = create_pbc(ppd,
+                                        pbc_flags,
+                                        qp->srate_mbps,
+                                        vl,
+                                        plen);
+               }
+               tx->wqe = qp->s_wqe;
+               ret = build_verbs_tx_desc(tx->sde, ss, len, tx, ahdr, pbc);
+               if (unlikely(ret))
+                       goto bail_build;
+       }
+       ret =  sdma_send_txreq(tx->sde, &priv->s_iowait, &tx->txreq);
+       if (unlikely(ret < 0)) {
+               /* -ECOMM is treated as sent; other errors go to the caller */
+               if (ret == -ECOMM)
+                       goto bail_ecomm;
+               return ret;
+       }
+       trace_sdma_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
+                               &ps->s_txreq->phdr.hdr);
+       return ret;
+
+bail_ecomm:
+       /* The current one got "sent" */
+       return 0;
+bail_build:
+       ret = wait_kmem(dev, qp, ps);
+       if (!ret) {
+               /* free txreq - bad state */
+               hfi1_put_txreq(ps->s_txreq);
+               ps->s_txreq = NULL;
+       }
+       return ret;
+}
+
+/*
+ * pio_wait - queue a QP to wait for PIO buffers on a send context
+ * @qp: the QP to put to sleep
+ * @sc: the send context whose piowait list the QP joins
+ * @ps: packet state holding the txreq that is put on the QP's iowait list
+ * @flag: wait flag OR'ed into qp->s_flags; RVT_S_WAIT_PIO and
+ *        RVT_S_WAIT_PIO_DRAIN are the values counted here
+ *
+ * If we are now in the error state, return zero to flush the
+ * send work request.
+ *
+ * Return: -EBUSY when the txreq was queued, 0 when the QP state does not
+ * allow RVT_PROCESS_RECV_OK and nothing was queued.
+ */
+static int pio_wait(struct rvt_qp *qp,
+                   struct send_context *sc,
+                   struct hfi1_pkt_state *ps,
+                   u32 flag)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+       struct hfi1_devdata *dd = sc->dd;
+       struct hfi1_ibdev *dev = &dd->verbs_dev;
+       unsigned long flags;
+       int ret = 0;
+
+       /*
+        * Note that as soon as want_buffer() is called and
+        * possibly before it returns, sc_piobufavail()
+        * could be called. Therefore, put QP on the I/O wait list before
+        * enabling the PIO avail interrupt.
+        */
+       spin_lock_irqsave(&qp->s_lock, flags);
+       if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
+               write_seqlock(&dev->iowait_lock);
+               list_add_tail(&ps->s_txreq->txreq.list,
+                             &priv->s_iowait.tx_head);
+               /* only enqueue the QP if it is not already on a wait list */
+               if (list_empty(&priv->s_iowait.list)) {
+                       int was_empty;
+
+                       dev->n_piowait += !!(flag & RVT_S_WAIT_PIO);
+                       dev->n_piodrain += !!(flag & RVT_S_WAIT_PIO_DRAIN);
+                       qp->s_flags |= flag;
+                       was_empty = list_empty(&sc->piowait);
+                       list_add_tail(&priv->s_iowait.list, &sc->piowait);
+                       trace_hfi1_qpsleep(qp, RVT_S_WAIT_PIO);
+                       /* reference taken while the QP sits on the list */
+                       atomic_inc(&qp->refcount);
+                       /* counting: only call wantpiobuf_intr if first user */
+                       if (was_empty)
+                               hfi1_sc_wantpiobuf_intr(sc, 1);
+               }
+               write_sequnlock(&dev->iowait_lock);
+               qp->s_flags &= ~RVT_S_BUSY;
+               ret = -EBUSY;
+       }
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+       return ret;
+}
+
+/*
+ * PIO buffer release callback.  @arg is the QP that was passed to
+ * sc_buffer_alloc(); @code is not examined.
+ *
+ * Drops one PIO count on the QP's iowait and wakes the drain waiters
+ * when iowait_pio_dec() reports a non-zero result.
+ */
+static void verbs_pio_complete(void *arg, int code)
+{
+       struct rvt_qp *qp = arg;
+       struct hfi1_qp_priv *priv = qp->priv;
+
+       if (!iowait_pio_dec(&priv->s_iowait))
+               return;
+
+       iowait_drain_wakeup(&priv->s_iowait);
+}
+
+/*
+ * hfi1_verbs_send_pio - send the packet in @ps through a PIO buffer
+ * @qp: the QP to send on
+ * @ps: the state of the packet to send; ps->s_txreq holds header and
+ *      send context
+ * @pbc: the PBC to use; when 0 one is created here
+ *
+ * When no PIO buffer can be allocated and the link is not active, the
+ * request is completed with IB_WC_GENERAL_ERR.  When the link is active
+ * the QP is queued with pio_wait().
+ *
+ * Return: the non-zero result of pio_wait() when the txreq was queued to
+ * wait for a buffer; 0 in every other case, after the txreq was released.
+ */
+int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+                       u64 pbc)
+{
+       struct hfi1_qp_priv *priv = qp->priv;
+       u32 hdrwords = qp->s_hdrwords;
+       struct rvt_sge_state *ss = qp->s_cur_sge;
+       u32 len = qp->s_cur_size;
+       u32 dwords = (len + 3) >> 2;
+       u32 plen = hdrwords + dwords + 2; /* includes pbc */
+       struct hfi1_pportdata *ppd = ps->ppd;
+       u32 *hdr = (u32 *)&ps->s_txreq->phdr.hdr;
+       u64 pbc_flags = 0;
+       u8 sc5;
+       unsigned long flags = 0;
+       struct send_context *sc;
+       struct pio_buf *pbuf;
+       int wc_status = IB_WC_SUCCESS;
+       int ret = 0;
+       pio_release_cb cb = NULL;
+
+       /* only RC/UC use complete */
+       switch (qp->ibqp.qp_type) {
+       case IB_QPT_RC:
+       case IB_QPT_UC:
+               cb = verbs_pio_complete;
+               break;
+       default:
+               break;
+       }
+
+       /* vl15 special case taken care of in ud.c */
+       sc5 = priv->s_sc;
+       sc = ps->s_txreq->psc;
+
+       if (likely(pbc == 0)) {
+               u8 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);
+               /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
+               pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
+               pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
+       }
+       /* count the PIO send before the buffer can be released */
+       if (cb)
+               iowait_pio_inc(&priv->s_iowait);
+       pbuf = sc_buffer_alloc(sc, plen, cb, qp);
+       if (unlikely(!pbuf)) {
+               /* no buffer, so no release callback: undo the count here */
+               if (cb)
+                       verbs_pio_complete(qp, 0);
+               if (ppd->host_link_state != HLS_UP_ACTIVE) {
+                       /*
+                        * If we have filled the PIO buffers to capacity and are
+                        * not in an active state this request is not going to
+                        * go out to so just complete it with an error or else a
+                        * ULP or the core may be stuck waiting.
+                        */
+                       hfi1_cdbg(
+                               PIO,
+                               "alloc failed. state not active, completing");
+                       wc_status = IB_WC_GENERAL_ERR;
+                       goto pio_bail;
+               } else {
+                       /*
+                        * This is a normal occurrence. The PIO buffs are full
+                        * up but we are still happily sending, well we could be
+                        * so lets continue to queue the request.
+                        */
+                       hfi1_cdbg(PIO, "alloc failed. state active, queuing");
+                       ret = pio_wait(qp, sc, ps, RVT_S_WAIT_PIO);
+                       if (!ret)
+                               /* txreq not queued - free */
+                               goto bail;
+                       /* tx consumed in wait */
+                       return ret;
+               }
+       }
+
+       if (len == 0) {
+               pio_copy(ppd->dd, pbuf, pbc, hdr, hdrwords);
+       } else if (ss) {
+               seg_pio_copy_start(pbuf, pbc, hdr, hdrwords * 4);
+               while (len) {
+                       void *addr = ss->sge.vaddr;
+                       u32 slen = ss->sge.length;
+
+                       /*
+                        * Limit the chunk to the current segment, the
+                        * bytes left to send and the bytes left in the
+                        * current sg entry, so that the copy never runs
+                        * past the entry and update_sge() never
+                        * subtracts more than sge_length.
+                        */
+                       if (slen > len)
+                               slen = len;
+                       if (slen > ss->sge.sge_length)
+                               slen = ss->sge.sge_length;
+                       update_sge(ss, slen);
+                       seg_pio_copy_mid(pbuf, addr, slen);
+                       len -= slen;
+               }
+               seg_pio_copy_end(pbuf);
+       }
+
+       trace_pio_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
+                              &ps->s_txreq->phdr.hdr);
+
+pio_bail:
+       if (qp->s_wqe) {
+               spin_lock_irqsave(&qp->s_lock, flags);
+               hfi1_send_complete(qp, qp->s_wqe, wc_status);
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+       } else if (qp->ibqp.qp_type == IB_QPT_RC) {
+               spin_lock_irqsave(&qp->s_lock, flags);
+               hfi1_rc_send_complete(qp, &ps->s_txreq->phdr.hdr);
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+       }
+
+       ret = 0;
+
+bail:
+       hfi1_put_txreq(ps->s_txreq);
+       return ret;
+}
+
+/*
+ * egress_pkey_matches_entry - return 1 if the pkey matches ent (ent
+ * being an entry from the partition key table), return 0
+ * otherwise. Use the matching criteria for egress partition keys
+ * specified in the OPAv1 spec., section 9.1l.7.
+ *
+ * The low 15 bits must be equal.  In addition, when pkey[15] is set the
+ * table entry must have bit 15 set as well.
+ */
+static inline int egress_pkey_matches_entry(u16 pkey, u16 ent)
+{
+       /* different partition: no match */
+       if ((pkey & PKEY_LOW_15_MASK) != (ent & PKEY_LOW_15_MASK))
+               return 0;
+
+       /* pkey[15] clear: the membership bit of the entry does not matter */
+       if (!(pkey & PKEY_MEMBER_MASK))
+               return 1;
+
+       /* pkey[15] set: the entry must have bit 15 set too */
+       return (ent & PKEY_MEMBER_MASK) ? 1 : 0;
+}
+
+/**
+ * egress_pkey_check - check P_KEY of a packet
+ * @ppd:    Physical IB port data
+ * @lrh: Local route header
+ * @bth: Base transport header
+ * @sc5:    SC for packet
+ * @s_pkey_index: It will be used for look up optimization for kernel contexts
+ * only. If it is negative value, then it means user contexts is calling this
+ * function.
+ *
+ * It checks if hdr's pkey is valid.  Nothing is checked unless
+ * HFI1_PART_ENFORCE_OUT is set in ppd->part_enforce.  On a failed check
+ * for a kernel context, the port's xmit constraint error counter is
+ * incremented and, if no error info is pending, the SLID and pkey are
+ * recorded.
+ *
+ * Return: 0 on success, otherwise, 1
+ */
+int egress_pkey_check(struct hfi1_pportdata *ppd, __be16 *lrh, __be32 *bth,
+                     u8 sc5, int8_t s_pkey_index)
+{
+       struct hfi1_devdata *dd;
+       int i;
+       u16 pkey;
+       int is_user_ctxt_mechanism = (s_pkey_index < 0);
+
+       if (!(ppd->part_enforce & HFI1_PART_ENFORCE_OUT))
+               return 0;
+
+       /* the pkey is taken from the low 16 bits of the first BTH word */
+       pkey = (u16)be32_to_cpu(bth[0]);
+
+       /* If SC15, pkey[0:14] must be 0x7fff */
+       if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
+               goto bad;
+
+       /* Is the pkey = 0x0, or 0x8000? */
+       if ((pkey & PKEY_LOW_15_MASK) == 0)
+               goto bad;
+
+       /*
+        * For the kernel contexts only, if a qp is passed into the function,
+        * the most likely matching pkey has index qp->s_pkey_index
+        */
+       if (!is_user_ctxt_mechanism &&
+           egress_pkey_matches_entry(pkey, ppd->pkeys[s_pkey_index])) {
+               return 0;
+       }
+
+       /* fall back to a scan of the whole table */
+       for (i = 0; i < MAX_PKEY_VALUES; i++) {
+               if (egress_pkey_matches_entry(pkey, ppd->pkeys[i]))
+                       return 0;
+       }
+bad:
+       /*
+        * For the user-context mechanism, the P_KEY check would only happen
+        * once per SDMA request, not once per packet.  Therefore, there's no
+        * need to increment the counter for the user-context mechanism.
+        */
+       if (!is_user_ctxt_mechanism) {
+               incr_cntr64(&ppd->port_xmit_constraint_errors);
+               dd = ppd->dd;
+               /* record details only if no earlier error info is pending */
+               if (!(dd->err_info_xmit_constraint.status &
+                     OPA_EI_STATUS_SMASK)) {
+                       u16 slid = be16_to_cpu(lrh[3]);
+
+                       dd->err_info_xmit_constraint.status |=
+                               OPA_EI_STATUS_SMASK;
+                       dd->err_info_xmit_constraint.slid = slid;
+                       dd->err_info_xmit_constraint.pkey = pkey;
+               }
+       }
+       return 1;
+}
+
+/**
+ * get_send_routine - choose an egress routine
+ * @qp: the QP the packet is sent on
+ * @tx: the txreq holding the packet header
+ *
+ * Choose an egress routine based on QP type
+ * and size
+ *
+ * Return: dd->process_pio_send or dd->process_dma_send
+ */
+static inline send_routine get_send_routine(struct rvt_qp *qp,
+                                           struct verbs_txreq *tx)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+       struct hfi1_qp_priv *priv = qp->priv;
+       struct hfi1_ib_header *h = &tx->phdr.hdr;
+
+       /* without send DMA there is nothing to choose */
+       if (unlikely(!(dd->flags & HFI1_HAS_SEND_DMA)))
+               return dd->process_pio_send;
+       switch (qp->ibqp.qp_type) {
+       case IB_QPT_SMI:
+               return dd->process_pio_send;
+       case IB_QPT_GSI:
+       case IB_QPT_UD:
+               break;
+       case IB_QPT_RC:
+               /*
+                * PIO is used only when piothreshold is non-zero, the
+                * payload is within min(piothreshold, pmtu), the opcode
+                * is in rc_only_opcode, no SDMA is pending on this QP and
+                * the txreq has not been built for SDMA.
+                */
+               if (piothreshold &&
+                   qp->s_cur_size <= min(piothreshold, qp->pmtu) &&
+                   (BIT(get_opcode(h) & 0x1f) & rc_only_opcode) &&
+                   iowait_sdma_pending(&priv->s_iowait) == 0 &&
+                   !sdma_txreq_built(&tx->txreq))
+                       return dd->process_pio_send;
+               break;
+       case IB_QPT_UC:
+               /* same conditions as RC, with the uc_only_opcode mask */
+               if (piothreshold &&
+                   qp->s_cur_size <= min(piothreshold, qp->pmtu) &&
+                   (BIT(get_opcode(h) & 0x1f) & uc_only_opcode) &&
+                   iowait_sdma_pending(&priv->s_iowait) == 0 &&
+                   !sdma_txreq_built(&tx->txreq))
+                       return dd->process_pio_send;
+               break;
+       default:
+               break;
+       }
+       return dd->process_dma_send;
+}
+
+/**
+ * hfi1_verbs_send - send a packet
+ * @qp: the QP to send on
+ * @ps: the state of the packet to send
+ *
+ * The egress routine is chosen with get_send_routine() and the packet's
+ * P_KEY is checked with egress_pkey_check() before anything is sent.
+ *
+ * Return zero if packet is sent or queued OK.
+ * Return non-zero and clear qp->s_flags RVT_S_BUSY otherwise.
+ */
+int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+       struct hfi1_qp_priv *priv = qp->priv;
+       struct hfi1_other_headers *ohdr;
+       struct hfi1_ib_header *hdr;
+       send_routine sr;
+       int ret;
+       u8 lnh;
+
+       hdr = &ps->s_txreq->phdr.hdr;
+       /* locate the pkey within the headers */
+       lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+       if (lnh == HFI1_LRH_GRH)
+               ohdr = &hdr->u.l.oth;
+       else
+               ohdr = &hdr->u.oth;
+
+       sr = get_send_routine(qp, ps->s_txreq);
+       ret = egress_pkey_check(dd->pport,
+                               hdr->lrh,
+                               ohdr->bth,
+                               priv->s_sc,
+                               qp->s_pkey_index);
+       if (unlikely(ret)) {
+               /*
+                * The value we are returning here does not get propagated to
+                * the verbs caller. Thus we need to complete the request with
+                * error otherwise the caller could be sitting waiting on the
+                * completion event. Only do this for PIO. SDMA has its own
+                * mechanism for handling the errors. So for SDMA we can just
+                * return.
+                */
+               if (sr == dd->process_pio_send) {
+                       unsigned long flags;
+
+                       hfi1_cdbg(PIO, "%s() Failed. Completing with err",
+                                 __func__);
+                       spin_lock_irqsave(&qp->s_lock, flags);
+                       hfi1_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
+                       spin_unlock_irqrestore(&qp->s_lock, flags);
+               }
+               return -EINVAL;
+       }
+       /* SDMA was chosen but PIO sends are still pending: wait for drain */
+       if (sr == dd->process_dma_send && iowait_pio_pending(&priv->s_iowait))
+               return pio_wait(qp,
+                               ps->s_txreq->psc,
+                               ps,
+                               RVT_S_WAIT_PIO_DRAIN);
+       return sr(qp, ps, 0);
+}
+
+/**
+ * hfi1_fill_device_attr - Fill in rvt dev info device attributes.
+ * @dd: the device data structure
+ *
+ * The attributes in dd->verbs_dev.rdi.dparms.props are cleared and then
+ * set from device data, module limits and fixed values.
+ */
+static void hfi1_fill_device_attr(struct hfi1_devdata *dd)
+{
+       struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
+       struct ib_device_attr *attr = &rdi->dparms.props;
+
+       memset(attr, 0, sizeof(*attr));
+
+       /* capabilities and identity */
+       attr->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
+                       IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
+                       IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
+                       IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE;
+       attr->page_size_cap = PAGE_SIZE;
+       attr->vendor_id = dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3;
+       attr->vendor_part_id = dd->pcidev->device;
+       attr->hw_ver = dd->minrev;
+       attr->sys_image_guid = ib_hfi1_sys_image_guid;
+
+       /* resource limits */
+       attr->max_mr_size = ~0ULL;
+       attr->max_qp = hfi1_max_qps;
+       attr->max_qp_wr = hfi1_max_qp_wrs;
+       attr->max_sge = hfi1_max_sges;
+       attr->max_sge_rd = hfi1_max_sges;
+       attr->max_cq = hfi1_max_cqs;
+       attr->max_ah = hfi1_max_ahs;
+       attr->max_cqe = hfi1_max_cqes;
+       attr->max_mr = rdi->lkey_table.max;
+       attr->max_fmr = rdi->lkey_table.max;
+       attr->max_map_per_fmr = 32767;
+       attr->max_pd = hfi1_max_pds;
+       attr->max_qp_rd_atom = HFI1_MAX_RDMA_ATOMIC;
+       attr->max_qp_init_rd_atom = 255;
+       attr->max_srq = hfi1_max_srqs;
+       attr->max_srq_wr = hfi1_max_srq_wrs;
+       attr->max_srq_sge = hfi1_max_srq_sges;
+       attr->atomic_cap = IB_ATOMIC_GLOB;
+       attr->max_pkeys = hfi1_get_npkeys(dd);
+
+       /* multicast limits; the total is derived from the two above it */
+       attr->max_mcast_grp = hfi1_max_mcast_grps;
+       attr->max_mcast_qp_attach = hfi1_max_mcast_qp_attached;
+       attr->max_total_mcast_qp_attach =
+                       attr->max_mcast_qp_attach * attr->max_mcast_grp;
+}
+
+/*
+ * Convert a mask of OPA link speeds to IB speed flags: 25G maps to
+ * IB_SPEED_EDR and 12.5G to IB_SPEED_FDR.  Other bits are ignored.
+ */
+static inline u16 opa_speed_to_ib(u16 in)
+{
+       u16 ib = (in & OPA_LINK_SPEED_25G) ? IB_SPEED_EDR : 0;
+
+       if (in & OPA_LINK_SPEED_12_5G)
+               ib |= IB_SPEED_FDR;
+
+       return ib;
+}
+
+/*
+ * Convert a single OPA link width (no multiple flags) to an IB value.
+ * A zero OPA link width means link down, which means the IB width value
+ * is a don't care.
+ */
+static inline u16 opa_width_to_ib(u16 in)
+{
+       /* map 2x and 3x to 1x as they don't exist in IB */
+       if (in == OPA_LINK_WIDTH_1X ||
+           in == OPA_LINK_WIDTH_2X ||
+           in == OPA_LINK_WIDTH_3X)
+               return IB_WIDTH_1X;
+
+       /* 4x, link down or unknown: return our largest width */
+       return IB_WIDTH_4X;
+}
+
+/*
+ * query_port - fill in the driver specific part of the port attributes
+ * @rdi: rvt device info of the device
+ * @port_num: 1-based port number; used to index dd->pport[port_num - 1]
+ * @props: port attributes to fill in
+ *
+ * Always returns 0.
+ */
+static int query_port(struct rvt_dev_info *rdi, u8 port_num,
+                     struct ib_port_attr *props)
+{
+       struct hfi1_devdata *dd = dd_from_dev(dev_from_rdi(rdi));
+       struct hfi1_pportdata *ppd = dd->pport + (port_num - 1);
+       u16 lid = ppd->lid;
+
+       props->lid = lid;
+       props->lmc = ppd->lmc;
+       /* OPA logical states match IB logical states */
+       props->state = driver_lstate(ppd);
+       props->phys_state = hfi1_ibphys_portstate(ppd);
+       props->gid_tbl_len = HFI1_GUIDS_PER_PORT;
+       props->active_width = (u8)opa_width_to_ib(ppd->link_width_active);
+       /* see rate_show() in ib core/sysfs.c */
+       props->active_speed = (u8)opa_speed_to_ib(ppd->link_speed_active);
+       props->max_vl_num = ppd->vls_supported;
+
+       /* Once we are a "first class" citizen and have added the OPA MTUs to
+        * the core we can advertise the larger MTU enum to the ULPs, for now
+        * advertise only 4K.
+        *
+        * Those applications which are either OPA aware or pass the MTU enum
+        * from the Path Records to us will get the new 8k MTU.  Those that
+        * attempt to process the MTU enum may fail in various ways.
+        */
+       props->max_mtu = mtu_to_enum(valid_ib_mtu(hfi1_max_mtu) ?
+                                    hfi1_max_mtu : 4096, IB_MTU_4096);
+       if (valid_ib_mtu(ppd->ibmtu))
+               props->active_mtu = mtu_to_enum(ppd->ibmtu, IB_MTU_2048);
+       else
+               props->active_mtu = props->max_mtu;
+
+       return 0;
+}
+
+/*
+ * modify_device - change the node description and/or system image GUID
+ * @device: the IB device
+ * @device_modify_mask: IB_DEVICE_MODIFY_* bits selecting what to change
+ * @device_modify: the new values
+ *
+ * Every port of the device is notified of each change that is made.
+ *
+ * Returns -EOPNOTSUPP if any other bit is set in the mask, 0 otherwise.
+ */
+static int modify_device(struct ib_device *device,
+                        int device_modify_mask,
+                        struct ib_device_modify *device_modify)
+{
+       const int supported = IB_DEVICE_MODIFY_SYS_IMAGE_GUID |
+                             IB_DEVICE_MODIFY_NODE_DESC;
+       struct hfi1_devdata *dd = dd_from_ibdev(device);
+       unsigned port;
+
+       if (device_modify_mask & ~supported)
+               return -EOPNOTSUPP;
+
+       if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) {
+               memcpy(device->node_desc, device_modify->node_desc, 64);
+               for (port = 0; port < dd->num_pports; port++)
+                       hfi1_node_desc_chg(&dd->pport[port].ibport_data);
+       }
+
+       if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) {
+               ib_hfi1_sys_image_guid =
+                       cpu_to_be64(device_modify->sys_image_guid);
+               for (port = 0; port < dd->num_pports; port++)
+                       hfi1_sys_guid_chg(&dd->pport[port].ibport_data);
+       }
+
+       return 0;
+}
+
+/*
+ * shut_down_port - take the link of a port down
+ * @rdi: rvt device info of the device
+ * @port_num: 1-based port number; used to index dd->pport[port_num - 1]
+ *
+ * Records OPA_LINKDOWN_REASON_UNKNOWN as the link down reason and returns
+ * the result of moving the link to HLS_DN_DOWNDEF.
+ */
+static int shut_down_port(struct rvt_dev_info *rdi, u8 port_num)
+{
+       struct hfi1_devdata *dd = dd_from_dev(dev_from_rdi(rdi));
+       struct hfi1_pportdata *ppd = dd->pport + (port_num - 1);
+
+       set_link_down_reason(ppd, OPA_LINKDOWN_REASON_UNKNOWN, 0,
+                            OPA_LINKDOWN_REASON_UNKNOWN);
+
+       return set_link_state(ppd, HLS_DN_DOWNDEF);
+}
+
+/*
+ * hfi1_get_guid_be - return a port GUID in big endian form
+ * @rdi: rvt device info (not used here)
+ * @rvp: the rvt port, embedded in a struct hfi1_ibport
+ * @guid_index: 0 selects ppd->guid, other values below
+ *              HFI1_GUIDS_PER_PORT select ibp->guids[guid_index - 1]
+ * @guid: where the GUID is stored
+ *
+ * Returns 0 on success, -EINVAL when @guid_index is not below
+ * HFI1_GUIDS_PER_PORT.
+ */
+static int hfi1_get_guid_be(struct rvt_dev_info *rdi, struct rvt_ibport *rvp,
+                           int guid_index, __be64 *guid)
+{
+       struct hfi1_ibport *ibp = container_of(rvp, struct hfi1_ibport, rvp);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+       if (!guid_index) {
+               *guid = cpu_to_be64(ppd->guid);
+               return 0;
+       }
+
+       if (!(guid_index < HFI1_GUIDS_PER_PORT))
+               return -EINVAL;
+
+       *guid = ibp->guids[guid_index - 1];
+       return 0;
+}
+
+/*
+ * convert ah port,sl to sc
+ *
+ * The SC is looked up in the sl_to_sc table of the port named by
+ * ah->port_num, using ah->sl as the index.
+ */
+u8 ah_to_sc(struct ib_device *ibdev, struct ib_ah_attr *ah)
+{
+       struct hfi1_ibport *port = to_iport(ibdev, ah->port_num);
+       u8 sl = ah->sl;
+
+       return port->sl_to_sc[sl];
+}
+
+/*
+ * hfi1_check_ah - validate the attributes of a new address handle
+ * @ibdev: the IB device
+ * @ah_attr: the address handle attributes to check
+ *
+ * The SL must be a valid index into the port's sl_to_sc table, and the VL
+ * that the resulting SC maps to must not be above num_vls unless it is
+ * 0xf.
+ *
+ * Returns 0 if the attributes are acceptable, -EINVAL otherwise.
+ */
+static int hfi1_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr)
+{
+       struct hfi1_ibport *ibp;
+       struct hfi1_pportdata *ppd;
+       struct hfi1_devdata *dd;
+       u8 sc5;
+
+       ibp = to_iport(ibdev, ah_attr->port_num);
+
+       /* an out-of-range SL would index past the end of sl_to_sc[] */
+       if (ah_attr->sl >= ARRAY_SIZE(ibp->sl_to_sc))
+               return -EINVAL;
+
+       /* test the mapping for validity */
+       ppd = ppd_from_ibp(ibp);
+       sc5 = ibp->sl_to_sc[ah_attr->sl];
+       dd = dd_from_ppd(ppd);
+       /*
+        * NOTE(review): hfi1_notify_new_ah() treats only vl < num_vls (or
+        * 15) as usable, while this test still accepts vl == num_vls -
+        * confirm which bound is intended.
+        */
+       if (sc_to_vlt(dd, sc5) > num_vls && sc_to_vlt(dd, sc5) != 0xf)
+               return -EINVAL;
+       return 0;
+}
+
+/*
+ * hfi1_notify_new_ah - set the driver specific fields of a new rvt_ah
+ * @ibdev: the IB device
+ * @ah_attr: the attributes the address handle is created with
+ * @ah: the address handle being set up
+ *
+ * Stores the VL the SL maps to in ah->vl and, when that VL is below
+ * num_vls or is 15, the log2 of that VL's MTU in ah->log_pmtu.
+ */
+static void hfi1_notify_new_ah(struct ib_device *ibdev,
+                              struct ib_ah_attr *ah_attr,
+                              struct rvt_ah *ah)
+{
+       struct hfi1_ibport *ibp;
+       struct hfi1_devdata *dd;
+       u8 sc5;
+
+       /*
+        * Do not trust reading anything from rvt_ah at this point as it is not
+        * done being setup. We can however modify things which we need to set.
+        */
+
+       ibp = to_iport(ibdev, ah_attr->port_num);
+       sc5 = ibp->sl_to_sc[ah->attr.sl];
+       dd = dd_from_ppd(ppd_from_ibp(ibp));
+
+       ah->vl = sc_to_vlt(dd, sc5);
+       if (ah->vl >= num_vls && ah->vl != 15)
+               return;
+
+       ah->log_pmtu = ilog2(dd->vld[ah->vl].mtu);
+}
+
+/*
+ * hfi1_create_qp0_ah - create an address handle on the PD of QP0
+ * @ibp: the port whose QP0 is used
+ * @dlid: destination LID for the address handle
+ *
+ * All other attributes are zero apart from the port number.
+ *
+ * Returns the result of ib_create_ah(), or ERR_PTR(-EINVAL) when the port
+ * has no QP0.
+ */
+struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid)
+{
+       struct ib_ah_attr attr;
+       struct ib_ah *ah;
+       struct rvt_qp *qp0;
+
+       memset(&attr, 0, sizeof(attr));
+       attr.dlid = dlid;
+       attr.port_num = ppd_from_ibp(ibp)->port;
+
+       rcu_read_lock();
+       qp0 = rcu_dereference(ibp->rvp.qp[0]);
+       if (qp0)
+               ah = ib_create_ah(qp0->ibqp.pd, &attr);
+       else
+               ah = ERR_PTR(-EINVAL);
+       rcu_read_unlock();
+
+       return ah;
+}
+
+/**
+ * hfi1_get_npkeys - return the size of the PKEY table for context 0
+ * @dd: the hfi1_ib device
+ *
+ * The value is the number of elements of the first port's pkeys array.
+ */
+unsigned hfi1_get_npkeys(struct hfi1_devdata *dd)
+{
+       return ARRAY_SIZE(dd->pport->pkeys);
+}
+
+/*
+ * init_ibport - set the initial verbs state of one port
+ * @ppd: the port to initialise
+ *
+ * Sets up identity SL/SC mappings, the default GID prefix, the
+ * capability flags, the PMA counter selection, and clears QP0 and QP1.
+ */
+static void init_ibport(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_ibport *ibp = &ppd->ibport_data;
+       struct rvt_ibport *rvp = &ibp->rvp;
+       size_t idx;
+
+       /* identity mapping between SL and SC, in both directions */
+       for (idx = 0; idx < ARRAY_SIZE(ibp->sl_to_sc); idx++) {
+               ibp->sl_to_sc[idx] = idx;
+               ibp->sc_to_sl[idx] = idx;
+       }
+
+       spin_lock_init(&ibp->rvp.lock);
+       /* Set the prefix to the default value (see ch. 4.1.1) */
+       rvp->gid_prefix = IB_DEFAULT_GID_PREFIX;
+       rvp->sm_lid = 0;
+       /* Below should only set bits defined in OPA PortInfo.CapabilityMask */
+       rvp->port_cap_flags = IB_PORT_AUTO_MIGR_SUP |
+               IB_PORT_CAP_MASK_NOTICE_SUP;
+
+       rvp->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
+       rvp->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
+       rvp->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
+       rvp->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
+       rvp->pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT;
+
+       RCU_INIT_POINTER(rvp->qp[0], NULL);
+       RCU_INIT_POINTER(rvp->qp[1], NULL);
+}
+
+/**
+ * hfi1_register_ib_device - register our device with the infiniband core
+ * @dd: the device data structure
+ * Return 0 if successful, errno if unsuccessful.
+ */
+int hfi1_register_ib_device(struct hfi1_devdata *dd)
+{
+       struct hfi1_ibdev *dev = &dd->verbs_dev;
+       struct ib_device *ibdev = &dev->rdi.ibdev;
+       struct hfi1_pportdata *ppd = dd->pport;
+       unsigned i;
+       int ret;
+
+       /* per-port verbs state must exist before anything below uses it */
+       for (i = 0; i < dd->num_pports; i++)
+               init_ibport(ppd + i);
+
+       /* Only need to initialize non-zero fields. */
+
+       setup_timer(&dev->mem_timer, mem_timer, (unsigned long)dev);
+
+       seqlock_init(&dev->iowait_lock);
+       INIT_LIST_HEAD(&dev->txwait);
+       INIT_LIST_HEAD(&dev->memwait);
+
+       ret = verbs_txreq_init(dev);
+       if (ret)
+               goto err_verbs_txreq;
+
+       /*
+        * The system image GUID is supposed to be the same for all
+        * HFIs in a single system but since there can be other
+        * device types in the system, we can't be sure this is unique.
+        */
+       if (!ib_hfi1_sys_image_guid)
+               ib_hfi1_sys_image_guid = cpu_to_be64(ppd->guid);
+
+       /*
+        * Build the "<class>_%d" name template in one bounded step.  The
+        * "%%" keeps a literal "%d" in the name (presumably expanded when
+        * the device is registered - confirm).  Unlike chaining on the
+        * return value of strlcpy(), which is the source length, this can
+        * never index past the end of ibdev->name if the class name is
+        * too long.
+        */
+       snprintf(ibdev->name, IB_DEVICE_NAME_MAX, "%s_%%d", class_name());
+       ibdev->owner = THIS_MODULE;
+       ibdev->node_guid = cpu_to_be64(ppd->guid);
+       ibdev->phys_port_cnt = dd->num_pports;
+       ibdev->dma_device = &dd->pcidev->dev;
+       ibdev->modify_device = modify_device;
+
+       /* keep process mad in the driver */
+       ibdev->process_mad = hfi1_process_mad;
+
+       /*
+        * strlcpy() always NUL-terminates; strncpy() would leave node_desc
+        * unterminated when the node name fills the whole buffer.
+        */
+       strlcpy(ibdev->node_desc, init_utsname()->nodename,
+               sizeof(ibdev->node_desc));
+
+       /*
+        * Fill in rvt info object.
+        */
+       dd->verbs_dev.rdi.driver_f.port_callback = hfi1_create_port_files;
+       dd->verbs_dev.rdi.driver_f.get_card_name = get_card_name;
+       dd->verbs_dev.rdi.driver_f.get_pci_dev = get_pci_dev;
+       dd->verbs_dev.rdi.driver_f.check_ah = hfi1_check_ah;
+       dd->verbs_dev.rdi.driver_f.notify_new_ah = hfi1_notify_new_ah;
+       dd->verbs_dev.rdi.driver_f.get_guid_be = hfi1_get_guid_be;
+       dd->verbs_dev.rdi.driver_f.query_port_state = query_port;
+       dd->verbs_dev.rdi.driver_f.shut_down_port = shut_down_port;
+       dd->verbs_dev.rdi.driver_f.cap_mask_chg = hfi1_cap_mask_chg;
+       /*
+        * Fill in rvt info device attributes.
+        */
+       hfi1_fill_device_attr(dd);
+
+       /* queue pair */
+       dd->verbs_dev.rdi.dparms.qp_table_size = hfi1_qp_table_size;
+       dd->verbs_dev.rdi.dparms.qpn_start = 0;
+       dd->verbs_dev.rdi.dparms.qpn_inc = 1;
+       dd->verbs_dev.rdi.dparms.qos_shift = dd->qos_shift;
+       /* reserve the 64K QPNs whose upper bits equal kdeth_qp */
+       dd->verbs_dev.rdi.dparms.qpn_res_start = kdeth_qp << 16;
+       dd->verbs_dev.rdi.dparms.qpn_res_end =
+       dd->verbs_dev.rdi.dparms.qpn_res_start + 65535;
+       dd->verbs_dev.rdi.dparms.max_rdma_atomic = HFI1_MAX_RDMA_ATOMIC;
+       dd->verbs_dev.rdi.dparms.psn_mask = PSN_MASK;
+       dd->verbs_dev.rdi.dparms.psn_shift = PSN_SHIFT;
+       dd->verbs_dev.rdi.dparms.psn_modify_mask = PSN_MODIFY_MASK;
+       dd->verbs_dev.rdi.dparms.core_cap_flags = RDMA_CORE_PORT_INTEL_OPA;
+       dd->verbs_dev.rdi.dparms.max_mad_size = OPA_MGMT_MAD_SIZE;
+
+       dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qp_priv_alloc;
+       dd->verbs_dev.rdi.driver_f.qp_priv_free = qp_priv_free;
+       dd->verbs_dev.rdi.driver_f.free_all_qps = free_all_qps;
+       dd->verbs_dev.rdi.driver_f.notify_qp_reset = notify_qp_reset;
+       dd->verbs_dev.rdi.driver_f.do_send = hfi1_do_send;
+       dd->verbs_dev.rdi.driver_f.schedule_send = hfi1_schedule_send;
+       dd->verbs_dev.rdi.driver_f.schedule_send_no_lock = _hfi1_schedule_send;
+       dd->verbs_dev.rdi.driver_f.get_pmtu_from_attr = get_pmtu_from_attr;
+       dd->verbs_dev.rdi.driver_f.notify_error_qp = notify_error_qp;
+       dd->verbs_dev.rdi.driver_f.flush_qp_waiters = flush_qp_waiters;
+       dd->verbs_dev.rdi.driver_f.stop_send_queue = stop_send_queue;
+       dd->verbs_dev.rdi.driver_f.quiesce_qp = quiesce_qp;
+       dd->verbs_dev.rdi.driver_f.mtu_from_qp = mtu_from_qp;
+       dd->verbs_dev.rdi.driver_f.mtu_to_path_mtu = mtu_to_path_mtu;
+       dd->verbs_dev.rdi.driver_f.check_modify_qp = hfi1_check_modify_qp;
+       dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp;
+       dd->verbs_dev.rdi.driver_f.check_send_wqe = hfi1_check_send_wqe;
+
+       /* completion queue */
+       snprintf(dd->verbs_dev.rdi.dparms.cq_name,
+                sizeof(dd->verbs_dev.rdi.dparms.cq_name),
+                "hfi1_cq%d", dd->unit);
+       dd->verbs_dev.rdi.dparms.node = dd->node;
+
+       /* misc settings */
+       dd->verbs_dev.rdi.flags = 0; /* Let rdmavt handle it all */
+       dd->verbs_dev.rdi.dparms.lkey_table_size = hfi1_lkey_table_size;
+       dd->verbs_dev.rdi.dparms.nports = dd->num_pports;
+       dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd);
+
+       /* hand every port, with its pkey table, to rdmavt */
+       ppd = dd->pport;
+       for (i = 0; i < dd->num_pports; i++, ppd++)
+               rvt_init_port(&dd->verbs_dev.rdi,
+                             &ppd->ibport_data.rvp,
+                             i,
+                             ppd->pkeys);
+
+       ret = rvt_register_device(&dd->verbs_dev.rdi);
+       if (ret)
+               goto err_verbs_txreq;
+
+       ret = hfi1_verbs_register_sysfs(dd);
+       if (ret)
+               goto err_class;
+
+       return ret;
+
+       /* unwind in reverse order of setup */
+err_class:
+       rvt_unregister_device(&dd->verbs_dev.rdi);
+err_verbs_txreq:
+       verbs_txreq_exit(dev);
+       dd_dev_err(dd, "cannot register verbs: %d!\n", -ret);
+       return ret;
+}
+
+void hfi1_unregister_ib_device(struct hfi1_devdata *dd)
+{
+       struct hfi1_ibdev *ibd = &dd->verbs_dev;
+
+       /* remove the sysfs entries, then the rdmavt device itself */
+       hfi1_verbs_unregister_sysfs(dd);
+       rvt_unregister_device(&ibd->rdi);
+
+       /* complain about any waiter still queued at this point */
+       if (!list_empty(&ibd->txwait))
+               dd_dev_err(dd, "txwait list not empty!\n");
+       if (!list_empty(&ibd->memwait))
+               dd_dev_err(dd, "memwait list not empty!\n");
+
+       /* stop the memory timer before the txreq cache goes away */
+       del_timer_sync(&ibd->mem_timer);
+       verbs_txreq_exit(ibd);
+}
+
+/*
+ * Handle a received CNP: work out the service type, remote LID/QPN and
+ * service level for the packet's QP and hand them to process_becn().
+ * Packets on QP types not listed below are counted as drops.
+ */
+void hfi1_cnp_rcv(struct hfi1_packet *packet)
+{
+       struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       struct hfi1_ib_header *hdr = packet->hdr;
+       struct rvt_qp *qp = packet->qp;
+       bool sc4_set = has_sc4_bit(packet);
+       u32 rqpn = 0;
+       u16 rlid = 0;
+       u8 sc5, svc_type;
+
+       switch (qp->ibqp.qp_type) {
+       case IB_QPT_UC:
+               svc_type = IB_CC_SVCTYPE_UC;
+               rqpn = qp->remote_qpn;
+               rlid = qp->remote_ah_attr.dlid;
+               break;
+       case IB_QPT_RC:
+               svc_type = IB_CC_SVCTYPE_RC;
+               rqpn = qp->remote_qpn;
+               rlid = qp->remote_ah_attr.dlid;
+               break;
+       case IB_QPT_SMI:
+       case IB_QPT_GSI:
+       case IB_QPT_UD:
+               /* rlid and rqpn stay 0 for these QP types */
+               svc_type = IB_CC_SVCTYPE_UD;
+               break;
+       default:
+               ibp->rvp.n_pkt_drops++;
+               return;
+       }
+
+       /* low four SC bits come from LRH word 0, bit 4 from the packet */
+       sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+       if (sc4_set)
+               sc5 |= 0x10;
+
+       process_becn(ppd, ibp->sc_to_sl[sc5], rlid, qp->ibqp.qp_num, rqpn,
+                    svc_type);
+}
diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h
new file mode 100644 (file)
index 0000000..4883567
--- /dev/null
@@ -0,0 +1,531 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef HFI1_VERBS_H
+#define HFI1_VERBS_H
+
+#include <linux/types.h>
+#include <linux/seqlock.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/kref.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_mad.h>
+#include <rdma/rdma_vt.h>
+#include <rdma/rdmavt_qp.h>
+#include <rdma/rdmavt_cq.h>
+
+struct hfi1_ctxtdata;
+struct hfi1_pportdata;
+struct hfi1_devdata;
+struct hfi1_packet;
+
+#include "iowait.h"
+
+#define HFI1_MAX_RDMA_ATOMIC     16
+#define HFI1_GUIDS_PER_PORT    5
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define HFI1_UVERBS_ABI_VERSION       2
+
+#define IB_SEQ_NAK     (3 << 29)
+
+/* AETH NAK opcode values */
+#define IB_RNR_NAK                      0x20
+#define IB_NAK_PSN_ERROR                0x60
+#define IB_NAK_INVALID_REQUEST          0x61
+#define IB_NAK_REMOTE_ACCESS_ERROR      0x62
+#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63
+#define IB_NAK_INVALID_RD_REQUEST       0x64
+
+/* IB Performance Manager status values */
+#define IB_PMA_SAMPLE_STATUS_DONE       0x00
+#define IB_PMA_SAMPLE_STATUS_STARTED    0x01
+#define IB_PMA_SAMPLE_STATUS_RUNNING    0x02
+
+/* Mandatory IB performance counter select values. */
+#define IB_PMA_PORT_XMIT_DATA   cpu_to_be16(0x0001)
+#define IB_PMA_PORT_RCV_DATA    cpu_to_be16(0x0002)
+#define IB_PMA_PORT_XMIT_PKTS   cpu_to_be16(0x0003)
+#define IB_PMA_PORT_RCV_PKTS    cpu_to_be16(0x0004)
+#define IB_PMA_PORT_XMIT_WAIT   cpu_to_be16(0x0005)
+
+#define HFI1_VENDOR_IPG                cpu_to_be16(0xFFA0)
+
+#define IB_BTH_REQ_ACK         BIT(31)
+#define IB_BTH_SOLICITED       BIT(23)
+#define IB_BTH_MIG_REQ         BIT(22)
+
+#define IB_GRH_VERSION         6
+#define IB_GRH_VERSION_MASK    0xF
+#define IB_GRH_VERSION_SHIFT   28
+#define IB_GRH_TCLASS_MASK     0xFF
+#define IB_GRH_TCLASS_SHIFT    20
+#define IB_GRH_FLOW_MASK       0xFFFFF
+#define IB_GRH_FLOW_SHIFT      0
+#define IB_GRH_NEXT_HDR                0x1B
+
+#define IB_DEFAULT_GID_PREFIX  cpu_to_be64(0xfe80000000000000ULL)
+
+/* flags passed by hfi1_ib_rcv() */
+enum {
+       HFI1_HAS_GRH = (1 << 0),
+};
+
+/*
+ * Packed big-endian address/key/length triple; embedded in the "rc"
+ * member of union ib_ehdrs.  Presumably the IB RDMA extended transport
+ * header (name-based - confirm against the IBTA spec).
+ */
+struct ib_reth {
+       __be64 vaddr;
+       __be32 rkey;
+       __be32 length;
+} __packed;
+
+/*
+ * Packed big-endian header carrying an address, a key and the swap and
+ * compare operands.  Presumably the IB atomic extended transport header
+ * (name-based - confirm against the IBTA spec).
+ */
+struct ib_atomic_eth {
+       __be32 vaddr[2];        /* unaligned so access as 2 32-bit words */
+       __be32 rkey;
+       __be64 swap_data;
+       __be64 compare_data;
+} __packed;
+
+/*
+ * Overlay of the alternative headers that can follow the three bth words
+ * in struct hfi1_other_headers.  All members are big-endian and the union
+ * is packed; which member is valid is not recorded here (presumably it
+ * follows from the packet opcode - confirm).
+ */
+union ib_ehdrs {
+       struct {
+               __be32 deth[2];
+               __be32 imm_data;
+       } ud;
+       struct {
+               struct ib_reth reth;
+               __be32 imm_data;
+       } rc;
+       struct {
+               __be32 aeth;
+               __be32 atomic_ack_eth[2];
+       } at;
+       __be32 imm_data;
+       __be32 aeth;
+       __be32 ieth;
+       struct ib_atomic_eth atomic_eth;
+}  __packed;
+
+/*
+ * Three big-endian bth words followed by the extension-header overlay.
+ * get_opcode() takes the opcode from the top byte of bth[0].
+ */
+struct hfi1_other_headers {
+       __be32 bth[3];
+       union ib_ehdrs u;
+} __packed;
+
+/*
+ * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes
+ * long (72 w/ imm_data).  Only the first 56 bytes of the IB header
+ * will be in the eager header buffer.  The remaining 12 or 16 bytes
+ * are in the data buffer.
+ *
+ * Layout: four lrh words, then either the other headers directly (u.oth)
+ * or a GRH followed by the other headers (u.l).  get_opcode() selects
+ * between the two from the low two bits of lrh[0].
+ */
+struct hfi1_ib_header {
+       __be16 lrh[4];
+       union {
+               struct {
+                       struct ib_grh grh;
+                       struct hfi1_other_headers oth;
+               } l;
+               struct hfi1_other_headers oth;
+       } u;
+} __packed;
+
+/*
+ * An IB header bundled with an sdma engine pointer and the ahg* fields.
+ * This is the type of the "next header to send" pointer in
+ * struct hfi1_qp_priv.  NOTE(review): the ahg* fields look like
+ * automatic-header-generation state for the SDMA engine - confirm.
+ */
+struct ahg_ib_header {
+       struct sdma_engine *sde;
+       u32 ahgdesc[2];
+       u16 tx_flags;
+       u8 ahgcount;
+       u8 ahgidx;
+       struct hfi1_ib_header ibh;
+};
+
+/*
+ * Packed header image: a little-endian 64-bit pbc word immediately
+ * followed by the IB header.  Embedded as the first member of
+ * struct verbs_txreq.
+ */
+struct hfi1_pio_header {
+       __le64 pbc;
+       struct hfi1_ib_header hdr;
+} __packed;
+
+/*
+ * hfi1 specific data structures that will be hidden from rvt after the queue
+ * pair is made common
+ *
+ * Reached through qp->priv; iowait_to_qp() maps an embedded s_iowait back
+ * to its QP through the owner pointer.
+ */
+struct hfi1_qp_priv {
+       struct ahg_ib_header *s_hdr;              /* next header to send */
+       struct sdma_engine *s_sde;                /* current sde */
+       struct send_context *s_sendcontext;       /* current sendcontext */
+       u8 s_sc;                                  /* SC[0..4] for next packet */
+       u8 r_adefered;                            /* number of acks deferred */
+       struct iowait s_iowait;                   /* links the QP on wait lists */
+       struct timer_list s_rnr_timer;
+       struct rvt_qp *owner;                     /* QP this data belongs to */
+};
+
+/*
+ * This structure is used to hold commonly looked-up and computed values
+ * during the send engine progress.  It is passed to hfi1_verbs_send() and
+ * to the hfi1_make_*_req() builders declared in this header.
+ */
+struct hfi1_pkt_state {
+       struct hfi1_ibdev *dev;
+       struct hfi1_ibport *ibp;
+       struct hfi1_pportdata *ppd;
+       struct verbs_txreq *s_txreq;
+       unsigned long flags;
+};
+
+#define HFI1_PSN_CREDIT  16
+
+/* Packet and byte counters for one opcode; updated by inc_opstats(). */
+struct hfi1_opcode_stats {
+       u64 n_packets;          /* number of packets */
+       u64 n_bytes;            /* total number of bytes */
+};
+
+/*
+ * One counter pair for each of 256 possible values, presumably indexed
+ * by the 8-bit opcode returned by get_opcode() - confirm at the callers.
+ */
+struct hfi1_opcode_stats_perctx {
+       struct hfi1_opcode_stats stats[256];
+};
+
+/*
+ * Account one packet of tlen bytes in *stats.  Compiles to nothing unless
+ * CONFIG_DEBUG_FS is enabled.
+ */
+static inline void inc_opstats(
+       u32 tlen,
+       struct hfi1_opcode_stats *stats)
+{
+#ifdef CONFIG_DEBUG_FS
+       stats->n_packets++;
+       stats->n_bytes += tlen;
+#endif
+}
+
+/*
+ * Per-port verbs state.  The embedded rvp is the port object handed to
+ * rvt_init_port(); its n_pkt_drops counter is bumped by hfi1_cnp_rcv().
+ */
+struct hfi1_ibport {
+       struct rvt_qp __rcu *qp[2];
+       struct rvt_ibport rvp;
+
+       /* one entry fewer than HFI1_GUIDS_PER_PORT */
+       __be64 guids[HFI1_GUIDS_PER_PORT        - 1];   /* writable GUIDs */
+
+       /* the first 16 entries are sl_to_vl for !OPA */
+       u8 sl_to_sc[32];
+       /* indexed by the 5-bit SC value, see hfi1_cnp_rcv() */
+       u8 sc_to_sl[32];
+};
+
+/*
+ * Per-device verbs state.  to_idev() recovers this structure from the
+ * ib_device embedded in rdi.
+ */
+struct hfi1_ibdev {
+       struct rvt_dev_info rdi; /* Must be first */
+
+       /* QP numbers are shared by all IB ports */
+       /* protect wait lists */
+       seqlock_t iowait_lock;
+       struct list_head txwait;        /* list for wait verbs_txreq */
+       struct list_head memwait;       /* list for wait kernel memory */
+       struct list_head txreq_free;
+       /* slab cache for struct verbs_txreq, see verbs_txreq_init() */
+       struct kmem_cache *verbs_txreq_cache;
+       struct timer_list mem_timer;
+
+       u64 n_piowait;
+       u64 n_piodrain;
+       /* counts QPs queued on txwait by __get_txreq() */
+       u64 n_txwait;
+       u64 n_kmem_wait;
+
+#ifdef CONFIG_DEBUG_FS
+       /* per HFI debugfs */
+       struct dentry *hfi1_ibdev_dbg;
+       /* per HFI symlinks to above */
+       struct dentry *hfi1_ibdev_link;
+#endif
+};
+
+/*
+ * Map an ib_device back to its hfi1_ibdev: first to the rvt_dev_info that
+ * embeds it, then to the hfi1_ibdev that embeds that.
+ */
+static inline struct hfi1_ibdev *to_idev(struct ib_device *ibdev)
+{
+       struct rvt_dev_info *rvt_info =
+               container_of(ibdev, struct rvt_dev_info, ibdev);
+
+       return container_of(rvt_info, struct hfi1_ibdev, rdi);
+}
+
+/*
+ * Return the QP owning an iowait that is embedded in a struct hfi1_qp_priv.
+ */
+static inline struct rvt_qp *iowait_to_qp(struct  iowait *s_iowait)
+{
+       return container_of(s_iowait, struct hfi1_qp_priv, s_iowait)->owner;
+}
+
+/*
+ * Decide whether the send engine may run for this QP.
+ *
+ * Returns 0 while the QP is busy or waiting for I/O.  Otherwise returns 1
+ * if a header is already built (s_hdrwords), a response is pending, or no
+ * send-side wait flag is set; 0 if none of those holds.
+ */
+static inline int hfi1_send_ok(struct rvt_qp *qp)
+{
+       if (qp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT_IO))
+               return 0;
+       if (qp->s_hdrwords)
+               return 1;
+       if (qp->s_flags & RVT_S_RESP_PENDING)
+               return 1;
+       return !(qp->s_flags & RVT_S_ANY_WAIT_SEND);
+}
+
+/*
+ * This must be called with s_lock held.
+ */
+void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl,
+                   u32 qp1, u32 qp2, u16 lid1, u16 lid2);
+void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num);
+void hfi1_sys_guid_chg(struct hfi1_ibport *ibp);
+void hfi1_node_desc_chg(struct hfi1_ibport *ibp);
+int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
+                    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+                    const struct ib_mad_hdr *in_mad, size_t in_mad_size,
+                    struct ib_mad_hdr *out_mad, size_t *out_mad_size,
+                    u16 *out_mad_pkey_index);
+
+/*
+ * The PSN_MASK and PSN_SHIFT allow for
+ * 1) comparing two PSNs
+ * 2) returning the PSN with any upper bits masked
+ * 3) returning the difference between two PSNs
+ *
+ * The number of significant bits in the PSN must
+ * necessarily be at least one bit less than
+ * the container holding the PSN.
+ */
+#ifndef CONFIG_HFI1_VERBS_31BIT_PSN
+#define PSN_MASK 0xFFFFFF
+#define PSN_SHIFT 8
+#else
+#define PSN_MASK 0x7FFFFFFF
+#define PSN_SHIFT 1
+#endif
+#define PSN_MODIFY_MASK 0xFFFFFF
+
+/*
+ * Compare two msn values on their lower 24 bits: shifting the difference
+ * left by 8 drops its top 8 bits.
+ * Returns an integer <, ==, or > than zero.
+ */
+static inline int cmp_msn(u32 a, u32 b)
+{
+       int diff = (int)a - (int)b;
+
+       return diff << 8;
+}
+
+/*
+ * Compare two PSNs, ignoring the top PSN_SHIFT bits of their difference.
+ * Returns an integer <, ==, or > than zero.
+ */
+static inline int cmp_psn(u32 a, u32 b)
+{
+       int diff = (int)a - (int)b;
+
+       return diff << PSN_SHIFT;
+}
+
+/*
+ * Return the PSN with every bit outside PSN_MASK cleared
+ */
+static inline u32 mask_psn(u32 a)
+{
+       u32 masked = a & PSN_MASK;
+
+       return masked;
+}
+
+/*
+ * Return delta between two PSNs: the difference is shifted up by
+ * PSN_SHIFT and back down again as a signed int, then returned as u32.
+ */
+static inline u32 delta_psn(u32 a, u32 b)
+{
+       int diff = (int)a - (int)b;
+
+       return (diff << PSN_SHIFT) >> PSN_SHIFT;
+}
+
+struct verbs_txreq;
+void hfi1_put_txreq(struct verbs_txreq *tx);
+
+int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
+
+void hfi1_copy_sge(struct rvt_sge_state *ss, void *data, u32 length,
+                  int release, int copy_last);
+
+void hfi1_skip_sge(struct rvt_sge_state *ss, u32 length, int release);
+
+void hfi1_cnp_rcv(struct hfi1_packet *packet);
+
+void hfi1_uc_rcv(struct hfi1_packet *packet);
+
+void hfi1_rc_rcv(struct hfi1_packet *packet);
+
+void hfi1_rc_hdrerr(
+       struct hfi1_ctxtdata *rcd,
+       struct hfi1_ib_header *hdr,
+       u32 rcv_flags,
+       struct rvt_qp *qp);
+
+u8 ah_to_sc(struct ib_device *ibdev, struct ib_ah_attr *ah_attr);
+
+struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid);
+
+void hfi1_rc_rnr_retry(unsigned long arg);
+void hfi1_add_rnr_timer(struct rvt_qp *qp, u32 to);
+void hfi1_rc_timeout(unsigned long arg);
+void hfi1_del_timers_sync(struct rvt_qp *qp);
+void hfi1_stop_rc_timers(struct rvt_qp *qp);
+
+void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_ib_header *hdr);
+
+void hfi1_rc_error(struct rvt_qp *qp, enum ib_wc_status err);
+
+void hfi1_ud_rcv(struct hfi1_packet *packet);
+
+int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey);
+
+int hfi1_rvt_get_rwqe(struct rvt_qp *qp, int wr_id_only);
+
+void hfi1_migrate_qp(struct rvt_qp *qp);
+
+int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
+                        int attr_mask, struct ib_udata *udata);
+
+void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
+                   int attr_mask, struct ib_udata *udata);
+
+int hfi1_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe);
+
+extern const u32 rc_only_opcode;
+extern const u32 uc_only_opcode;
+
+/*
+ * Return the opcode byte (top 8 bits of bth[0]) of an IB header.  The low
+ * two bits of lrh[0] tell whether bth follows the LRH directly or sits
+ * behind a GRH.
+ */
+static inline u8 get_opcode(struct hfi1_ib_header *h)
+{
+       u16 lnh = be16_to_cpu(h->lrh[0]) & 3;
+       __be32 bth0 = (lnh == IB_LNH_IBA_LOCAL) ? h->u.oth.bth[0] :
+                                                 h->u.l.oth.bth[0];
+
+       return be32_to_cpu(bth0) >> 24;
+}
+
+int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr,
+                      int has_grh, struct rvt_qp *qp, u32 bth0);
+
+u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr,
+                 struct ib_global_route *grh, u32 hwords, u32 nwords);
+
+void hfi1_make_ruc_header(struct rvt_qp *qp, struct hfi1_other_headers *ohdr,
+                         u32 bth0, u32 bth2, int middle,
+                         struct hfi1_pkt_state *ps);
+
+void _hfi1_do_send(struct work_struct *work);
+
+void hfi1_do_send(struct rvt_qp *qp);
+
+void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
+                       enum ib_wc_status status);
+
+void hfi1_send_rc_ack(struct hfi1_ctxtdata *, struct rvt_qp *qp, int is_fecn);
+
+int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
+
+int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
+
+int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
+
+int hfi1_register_ib_device(struct hfi1_devdata *);
+
+void hfi1_unregister_ib_device(struct hfi1_devdata *);
+
+void hfi1_ib_rcv(struct hfi1_packet *packet);
+
+unsigned hfi1_get_npkeys(struct hfi1_devdata *);
+
+int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+                       u64 pbc);
+
+int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+                       u64 pbc);
+
+int hfi1_wss_init(void);
+void hfi1_wss_exit(void);
+
+/*
+ * platform specific: return the lowest level cache (llc) size, in KiB
+ *
+ * Reads x86_cache_size from boot_cpu_data, so this header only builds on
+ * x86 as written.
+ */
+static inline int wss_llc_size(void)
+{
+       /* assume that the boot CPU value is universal for all CPUs */
+       return boot_cpu_data.x86_cache_size;
+}
+
+/*
+ * platform specific: cacheless copy
+ *
+ * Copies n bytes from src to dst through __copy_user_nocache().  The
+ * return value of that call is discarded, so callers get no indication
+ * of a partial copy.
+ */
+static inline void cacheless_memcpy(void *dst, void *src, size_t n)
+{
+       /*
+        * Use the only available X64 cacheless copy.  Add a __user cast
+        * to quiet sparse.  The src argument is already in the kernel so
+        * there are no security issues.  The extra fault recovery machinery
+        * is not invoked.
+        */
+       __copy_user_nocache(dst, (void __user *)src, n, 0);
+}
+
+extern const enum ib_wc_opcode ib_hfi1_wc_opcode[];
+
+extern const u8 hdr_len_by_opcode[];
+
+extern const int ib_rvt_state_ops[];
+
+extern __be64 ib_hfi1_sys_image_guid;    /* in network order */
+
+extern unsigned int hfi1_max_cqes;
+
+extern unsigned int hfi1_max_cqs;
+
+extern unsigned int hfi1_max_qp_wrs;
+
+extern unsigned int hfi1_max_qps;
+
+extern unsigned int hfi1_max_sges;
+
+extern unsigned int hfi1_max_mcast_grps;
+
+extern unsigned int hfi1_max_mcast_qp_attached;
+
+extern unsigned int hfi1_max_srqs;
+
+extern unsigned int hfi1_max_srq_sges;
+
+extern unsigned int hfi1_max_srq_wrs;
+
+extern unsigned short piothreshold;
+
+extern const u32 ib_hfi1_rnr_table[];
+
+#endif                          /* HFI1_VERBS_H */
diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.c b/drivers/infiniband/hw/hfi1/verbs_txreq.c
new file mode 100644 (file)
index 0000000..bc95c41
--- /dev/null
@@ -0,0 +1,149 @@
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "hfi.h"
+#include "verbs_txreq.h"
+#include "qp.h"
+#include "trace.h"
+
+#define TXREQ_LEN 24
+
+/*
+ * Release a verbs_txreq and wake at most one QP waiting for one.
+ *
+ * Drops the MR reference if one is attached, cleans the embedded sdma
+ * txreq, returns the object to the slab cache, and then, if any QP is
+ * queued on dev->txwait, dequeues the first one and wakes it with
+ * RVT_S_WAIT_TX.
+ */
+void hfi1_put_txreq(struct verbs_txreq *tx)
+{
+       struct hfi1_ibdev *dev;
+       struct rvt_qp *qp;
+       unsigned long flags;
+       unsigned int seq;
+       struct hfi1_qp_priv *priv;
+
+       /* take what we need out of tx before it is freed below */
+       qp = tx->qp;
+       dev = to_idev(qp->ibqp.device);
+
+       if (tx->mr)
+               rvt_put_mr(tx->mr);
+
+       sdma_txclean(dd_from_dev(dev), &tx->txreq);
+
+       /* Free verbs_txreq and return to slab cache */
+       kmem_cache_free(dev->verbs_txreq_cache, tx);
+
+       /*
+        * Lockless peek at the wait list under the seqlock read side; the
+        * write side is taken only when a waiter appears to be present.
+        * NOTE(review): the list is not re-checked for emptiness after the
+        * write lock is taken - confirm a concurrent dequeue cannot empty
+        * it in between.
+        */
+       do {
+               seq = read_seqbegin(&dev->iowait_lock);
+               if (!list_empty(&dev->txwait)) {
+                       struct iowait *wait;
+
+                       write_seqlock_irqsave(&dev->iowait_lock, flags);
+                       wait = list_first_entry(&dev->txwait, struct iowait,
+                                               list);
+                       /* qp is reused here for the waiter, not tx's QP */
+                       qp = iowait_to_qp(wait);
+                       priv = qp->priv;
+                       list_del_init(&priv->s_iowait.list);
+                       /* refcount held until actual wake up */
+                       write_sequnlock_irqrestore(&dev->iowait_lock, flags);
+                       hfi1_qp_wakeup(qp, RVT_S_WAIT_TX);
+                       break;
+               }
+       } while (read_seqretry(&dev->iowait_lock, seq));
+}
+
+/*
+ * Slow path of get_txreq(): retry the allocation with the QP s_lock and
+ * the device iowait_lock held and, if it still fails, queue the QP on
+ * dev->txwait so that hfi1_put_txreq() can wake it later.
+ *
+ * Returns a valid verbs_txreq on success and ERR_PTR(-EBUSY) otherwise.
+ * Never returns NULL: get_txreq() only tests IS_ERR() before it
+ * dereferences the result.
+ */
+struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev,
+                               struct rvt_qp *qp)
+{
+       struct verbs_txreq *tx = ERR_PTR(-EBUSY);
+       unsigned long flags;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+       write_seqlock(&dev->iowait_lock);
+       if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
+               struct hfi1_qp_priv *priv;
+
+               tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC);
+               if (tx)
+                       goto out;
+               /* allocation failed again: report -EBUSY rather than NULL */
+               tx = ERR_PTR(-EBUSY);
+               priv = qp->priv;
+               /* queue the QP only once; take a reference while queued */
+               if (list_empty(&priv->s_iowait.list)) {
+                       dev->n_txwait++;
+                       qp->s_flags |= RVT_S_WAIT_TX;
+                       list_add_tail(&priv->s_iowait.list, &dev->txwait);
+                       trace_hfi1_qpsleep(qp, RVT_S_WAIT_TX);
+                       atomic_inc(&qp->refcount);
+               }
+               qp->s_flags &= ~RVT_S_BUSY;
+       }
+out:
+       write_sequnlock(&dev->iowait_lock);
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+       return tx;
+}
+
+/* Slab constructor: hand out every verbs_txreq object zero-filled. */
+static void verbs_txreq_kmem_cache_ctor(void *obj)
+{
+       memset(obj, 0, sizeof(struct verbs_txreq));
+}
+
+/*
+ * Create the per-device slab cache for struct verbs_txreq, named
+ * "hfi1_<unit>_vtxreq_cache".
+ *
+ * Returns 0 on success or -ENOMEM if the cache could not be created.
+ */
+int verbs_txreq_init(struct hfi1_ibdev *dev)
+{
+       struct hfi1_devdata *dd = dd_from_dev(dev);
+       char cache_name[TXREQ_LEN];
+       struct kmem_cache *cache;
+
+       snprintf(cache_name, sizeof(cache_name), "hfi1_%u_vtxreq_cache",
+                dd->unit);
+       cache = kmem_cache_create(cache_name, sizeof(struct verbs_txreq),
+                                 0, SLAB_HWCACHE_ALIGN,
+                                 verbs_txreq_kmem_cache_ctor);
+       dev->verbs_txreq_cache = cache;
+
+       return cache ? 0 : -ENOMEM;
+}
+
+/*
+ * Destroy the verbs_txreq slab cache and clear the pointer.  Also called
+ * on the error path of hfi1_register_ib_device(), including when
+ * verbs_txreq_init() itself failed and the pointer is NULL.
+ */
+void verbs_txreq_exit(struct hfi1_ibdev *dev)
+{
+       kmem_cache_destroy(dev->verbs_txreq_cache);
+       dev->verbs_txreq_cache = NULL;
+}
diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.h b/drivers/infiniband/hw/hfi1/verbs_txreq.h
new file mode 100644 (file)
index 0000000..1cf69b2
--- /dev/null
@@ -0,0 +1,116 @@
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef HFI1_VERBS_TXREQ_H
+#define HFI1_VERBS_TXREQ_H
+
+#include <linux/types.h>
+#include <linux/slab.h>
+
+#include "verbs.h"
+#include "sdma_txreq.h"
+#include "iowait.h"
+
+/*
+ * One verbs send request.  Allocated from dev->verbs_txreq_cache by
+ * get_txreq() and released by hfi1_put_txreq().
+ */
+struct verbs_txreq {
+       /* pbc word plus IB header */
+       struct hfi1_pio_header  phdr;
+       /* embedded sdma request; container_of() maps it back to this */
+       struct sdma_txreq       txreq;
+       /* owning QP, set by get_txreq() */
+       struct rvt_qp           *qp;
+       struct rvt_swqe         *wqe;
+       /* optional MR; hfi1_put_txreq() drops its reference if non-NULL */
+       struct rvt_mregion      *mr;
+       struct rvt_sge_state    *ss;
+       /* copied from the QP private s_sde by get_txreq() */
+       struct sdma_engine     *sde;
+       /* copied from the QP private s_sendcontext by get_txreq() */
+       struct send_context     *psc;
+       u16                     hdr_dwords;
+};
+
+struct hfi1_ibdev;
+struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev,
+                               struct rvt_qp *qp);
+
+/*
+ * Allocate a verbs_txreq for qp.  Tries a lockless GFP_ATOMIC allocation
+ * first and falls back to __get_txreq() when that fails.  An error pointer
+ * from the slow path is passed straight back to the caller; otherwise the
+ * per-request fields are reset from the QP before returning.
+ */
+static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev,
+                                           struct rvt_qp *qp)
+{
+       struct hfi1_qp_priv *qpriv = qp->priv;
+       struct verbs_txreq *req;
+
+       req = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC);
+       if (unlikely(!req)) {
+               /* slow path: retries under the locks, may queue the QP */
+               req = __get_txreq(dev, qp);
+               if (IS_ERR(req))
+                       return req;
+       }
+
+       /* so that we can test if the sdma descriptors are there */
+       req->txreq.num_desc = 0;
+       req->psc = qpriv->s_sendcontext;
+       req->sde = qpriv->s_sde;
+       req->mr = NULL;
+       req->qp = qp;
+       return req;
+}
+
+/* Return a pointer to the sdma_txreq embedded in a verbs_txreq. */
+static inline struct sdma_txreq *get_sdma_txreq(struct verbs_txreq *tx)
+{
+       return &tx->txreq;
+}
+
+/*
+ * Return the verbs_txreq wrapping whatever iowait_get_txhead() yields for
+ * this QP's iowait, or NULL if it yields nothing.
+ */
+static inline struct verbs_txreq *get_waiting_verbs_txreq(struct rvt_qp *qp)
+{
+       struct hfi1_qp_priv *qpriv = qp->priv;
+       struct sdma_txreq *head = iowait_get_txhead(&qpriv->s_iowait);
+
+       if (!head)
+               return NULL;
+       return container_of(head, struct verbs_txreq, txreq);
+}
+
+void hfi1_put_txreq(struct verbs_txreq *tx);
+int verbs_txreq_init(struct hfi1_ibdev *dev);
+void verbs_txreq_exit(struct hfi1_ibdev *dev);
+
+#endif                         /* HFI1_VERBS_TXREQ_H */
index 82d7c4b..ce40340 100644 (file)
@@ -1308,21 +1308,6 @@ static const struct  qib_hwerror_msgs qib_7322p_error_msgs[] = {
        SYM_LSB(IntMask, fldname##17IntMask)), \
        .msg = #fldname "_C", .sz = sizeof(#fldname "_C") }
 
-static const struct  qib_hwerror_msgs qib_7322_intr_msgs[] = {
-       INTR_AUTO_P(SDmaInt),
-       INTR_AUTO_P(SDmaProgressInt),
-       INTR_AUTO_P(SDmaIdleInt),
-       INTR_AUTO_P(SDmaCleanupDone),
-       INTR_AUTO_C(RcvUrg),
-       INTR_AUTO_P(ErrInt),
-       INTR_AUTO(ErrInt),      /* non-port-specific errs */
-       INTR_AUTO(AssertGPIOInt),
-       INTR_AUTO_P(SendDoneInt),
-       INTR_AUTO(SendBufAvailInt),
-       INTR_AUTO_C(RcvAvail),
-       { .mask = 0, .sz = 0 }
-};
-
 #define TXSYMPTOM_AUTO_P(fldname) \
        { .mask = SYM_MASK(SendHdrErrSymptom_0, fldname), \
        .msg = #fldname, .sz = sizeof(#fldname) }
index 6888f03..4f87815 100644 (file)
@@ -159,6 +159,7 @@ struct qib_other_headers {
                } at;
                __be32 imm_data;
                __be32 aeth;
+               __be32 ieth;
                struct ib_atomic_eth atomic_eth;
        } u;
 } __packed;
index b1ffc8b..6ca6fa8 100644 (file)
@@ -525,6 +525,7 @@ int rvt_driver_cq_init(struct rvt_dev_info *rdi)
                return PTR_ERR(task);
        }
 
+       set_user_nice(task, MIN_NICE);
        cpu = cpumask_first(cpumask_of_node(rdi->dparms.node));
        kthread_bind(task, cpu);
        wake_up_process(task);
index 0ff765b..0f4d450 100644 (file)
@@ -124,11 +124,13 @@ static int rvt_init_mregion(struct rvt_mregion *mr, struct ib_pd *pd,
                            int count)
 {
        int m, i = 0;
+       struct rvt_dev_info *dev = ib_to_rvt(pd->device);
 
        mr->mapsz = 0;
        m = (count + RVT_SEGSZ - 1) / RVT_SEGSZ;
        for (; i < m; i++) {
-               mr->map[i] = kzalloc(sizeof(*mr->map[0]), GFP_KERNEL);
+               mr->map[i] = kzalloc_node(sizeof(*mr->map[0]), GFP_KERNEL,
+                                         dev->dparms.node);
                if (!mr->map[i]) {
                        rvt_deinit_mregion(mr);
                        return -ENOMEM;
index 0f12c21..5fa4d4d 100644 (file)
@@ -397,6 +397,7 @@ static void free_qpn(struct rvt_qpn_table *qpt, u32 qpn)
 static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends)
 {
        unsigned n;
+       struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
 
        if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags))
                rvt_put_ss(&qp->s_rdma_read_sge);
@@ -431,7 +432,7 @@ static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends)
        if (qp->ibqp.qp_type != IB_QPT_RC)
                return;
 
-       for (n = 0; n < ARRAY_SIZE(qp->s_ack_queue); n++) {
+       for (n = 0; n < rvt_max_atomic(rdi); n++) {
                struct rvt_ack_entry *e = &qp->s_ack_queue[n];
 
                if (e->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST &&
@@ -569,7 +570,12 @@ static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
        qp->s_ssn = 1;
        qp->s_lsn = 0;
        qp->s_mig_state = IB_MIG_MIGRATED;
-       memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue));
+       if (qp->s_ack_queue)
+               memset(
+                       qp->s_ack_queue,
+                       0,
+                       rvt_max_atomic(rdi) *
+                               sizeof(*qp->s_ack_queue));
        qp->r_head_ack_queue = 0;
        qp->s_tail_ack_queue = 0;
        qp->s_num_rd_atomic = 0;
@@ -653,9 +659,9 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
                if (gfp == GFP_NOIO)
                        swq = __vmalloc(
                                (init_attr->cap.max_send_wr + 1) * sz,
-                               gfp, PAGE_KERNEL);
+                               gfp | __GFP_ZERO, PAGE_KERNEL);
                else
-                       swq = vmalloc_node(
+                       swq = vzalloc_node(
                                (init_attr->cap.max_send_wr + 1) * sz,
                                rdi->dparms.node);
                if (!swq)
@@ -677,6 +683,16 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
                        goto bail_swq;
 
                RCU_INIT_POINTER(qp->next, NULL);
+               if (init_attr->qp_type == IB_QPT_RC) {
+                       qp->s_ack_queue =
+                               kzalloc_node(
+                                       sizeof(*qp->s_ack_queue) *
+                                        rvt_max_atomic(rdi),
+                                       gfp,
+                                       rdi->dparms.node);
+                       if (!qp->s_ack_queue)
+                               goto bail_qp;
+               }
 
                /*
                 * Driver needs to set up it's private QP structure and do any
@@ -704,9 +720,9 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
                                qp->r_rq.wq = __vmalloc(
                                                sizeof(struct rvt_rwq) +
                                                qp->r_rq.size * sz,
-                                               gfp, PAGE_KERNEL);
+                                               gfp | __GFP_ZERO, PAGE_KERNEL);
                        else
-                               qp->r_rq.wq = vmalloc_node(
+                               qp->r_rq.wq = vzalloc_node(
                                                sizeof(struct rvt_rwq) +
                                                qp->r_rq.size * sz,
                                                rdi->dparms.node);
@@ -857,6 +873,7 @@ bail_driver_priv:
        rdi->driver_f.qp_priv_free(rdi, qp);
 
 bail_qp:
+       kfree(qp->s_ack_queue);
        kfree(qp);
 
 bail_swq:
@@ -1284,6 +1301,7 @@ int rvt_destroy_qp(struct ib_qp *ibqp)
                vfree(qp->r_rq.wq);
        vfree(qp->s_wq);
        rdi->driver_f.qp_priv_free(rdi, qp);
+       kfree(qp->s_ack_queue);
        kfree(qp);
        return 0;
 }
index f1f3eca..2c5b018 100644 (file)
@@ -22,6 +22,4 @@ menuconfig STAGING_RDMA
 # Please keep entries in alphabetic order
 if STAGING_RDMA
 
-source "drivers/staging/rdma/hfi1/Kconfig"
-
 endif
index 8c7fc1d..b5e94f1 100644 (file)
@@ -1,2 +1 @@
 # Entries for RDMA_STAGING tree
-obj-$(CONFIG_INFINIBAND_HFI1)  += hfi1/
diff --git a/drivers/staging/rdma/hfi1/Kconfig b/drivers/staging/rdma/hfi1/Kconfig
deleted file mode 100644 (file)
index a925fb0..0000000
+++ /dev/null
@@ -1,29 +0,0 @@
-config INFINIBAND_HFI1
-       tristate "Intel OPA Gen1 support"
-       depends on X86_64 && INFINIBAND_RDMAVT
-       select MMU_NOTIFIER
-       select CRC32
-       default m
-       ---help---
-       This is a low-level driver for Intel OPA Gen1 adapter.
-config HFI1_DEBUG_SDMA_ORDER
-       bool "HFI1 SDMA Order debug"
-       depends on INFINIBAND_HFI1
-       default n
-       ---help---
-       This is a debug flag to test for out of order
-       sdma completions for unit testing
-config HFI1_VERBS_31BIT_PSN
-       bool "HFI1 enable 31 bit PSN"
-       depends on INFINIBAND_HFI1
-       default y
-       ---help---
-       Setting this enables 31 BIT PSN
-       For verbs RC/UC
-config SDMA_VERBOSITY
-       bool "Config SDMA Verbosity"
-       depends on INFINIBAND_HFI1
-       default n
-       ---help---
-       This is a configuration flag to enable verbose
-       SDMA debug
diff --git a/drivers/staging/rdma/hfi1/Makefile b/drivers/staging/rdma/hfi1/Makefile
deleted file mode 100644 (file)
index 8dc5938..0000000
+++ /dev/null
@@ -1,21 +0,0 @@
-#
-# HFI driver
-#
-#
-#
-# Called from the kernel module build system.
-#
-obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o
-
-hfi1-y := affinity.o chip.o device.o diag.o driver.o efivar.o \
-       eprom.o file_ops.o firmware.o \
-       init.o intr.o mad.o mmu_rb.o pcie.o pio.o pio_copy.o platform.o \
-       qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o twsi.o \
-       uc.o ud.o user_exp_rcv.o user_pages.o user_sdma.o verbs.o \
-       verbs_txreq.o
-hfi1-$(CONFIG_DEBUG_FS) += debugfs.o
-
-CFLAGS_trace.o = -I$(src)
-ifdef MVERSION
-CFLAGS_driver.o = -DHFI_DRIVER_VERSION_BASE=\"$(MVERSION)\"
-endif
diff --git a/drivers/staging/rdma/hfi1/TODO b/drivers/staging/rdma/hfi1/TODO
deleted file mode 100644 (file)
index 4c6f1d7..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-July, 2015
-
-- Remove unneeded file entries in sysfs
-- Remove software processing of IB protocol and place in library for use
-  by qib, ipath (if still present), hfi1, and eventually soft-roce
-- Replace incorrect uAPI
diff --git a/drivers/staging/rdma/hfi1/affinity.c b/drivers/staging/rdma/hfi1/affinity.c
deleted file mode 100644 (file)
index 6e7050a..0000000
+++ /dev/null
@@ -1,431 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#include <linux/topology.h>
-#include <linux/cpumask.h>
-#include <linux/module.h>
-
-#include "hfi.h"
-#include "affinity.h"
-#include "sdma.h"
-#include "trace.h"
-
-/* Name of IRQ types, indexed by enum irq_type */
-static const char * const irq_type_names[] = {
-       "SDMA",
-       "RCVCTXT",
-       "GENERAL",
-       "OTHER",
-};
-
-static inline void init_cpu_mask_set(struct cpu_mask_set *set)
-{
-       cpumask_clear(&set->mask);
-       cpumask_clear(&set->used);
-       set->gen = 0;
-}
-
-/* Initialize non-HT cpu cores mask */
-int init_real_cpu_mask(struct hfi1_devdata *dd)
-{
-       struct hfi1_affinity *info;
-       int possible, curr_cpu, i, ht;
-
-       info = kzalloc(sizeof(*info), GFP_KERNEL);
-       if (!info)
-               return -ENOMEM;
-
-       cpumask_clear(&info->real_cpu_mask);
-
-       /* Start with cpu online mask as the real cpu mask */
-       cpumask_copy(&info->real_cpu_mask, cpu_online_mask);
-
-       /*
-        * Remove HT cores from the real cpu mask.  Do this in two steps below.
-        */
-       possible = cpumask_weight(&info->real_cpu_mask);
-       ht = cpumask_weight(topology_sibling_cpumask(
-                                       cpumask_first(&info->real_cpu_mask)));
-       /*
-        * Step 1.  Skip over the first N HT siblings and use them as the
-        * "real" cores.  Assumes that HT cores are not enumerated in
-        * succession (except in the single core case).
-        */
-       curr_cpu = cpumask_first(&info->real_cpu_mask);
-       for (i = 0; i < possible / ht; i++)
-               curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask);
-       /*
-        * Step 2.  Remove the remaining HT siblings.  Use cpumask_next() to
-        * skip any gaps.
-        */
-       for (; i < possible; i++) {
-               cpumask_clear_cpu(curr_cpu, &info->real_cpu_mask);
-               curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask);
-       }
-
-       dd->affinity = info;
-       return 0;
-}
-
-/*
- * Interrupt affinity.
- *
- * non-rcv avail gets a default mask that
- * starts as possible cpus with threads reset
- * and each rcv avail reset.
- *
- * rcv avail gets node relative 1 wrapping back
- * to the node relative 1 as necessary.
- *
- */
-void hfi1_dev_affinity_init(struct hfi1_devdata *dd)
-{
-       int node = pcibus_to_node(dd->pcidev->bus);
-       struct hfi1_affinity *info = dd->affinity;
-       const struct cpumask *local_mask;
-       int curr_cpu, possible, i;
-
-       if (node < 0)
-               node = numa_node_id();
-       dd->node = node;
-
-       spin_lock_init(&info->lock);
-
-       init_cpu_mask_set(&info->def_intr);
-       init_cpu_mask_set(&info->rcv_intr);
-       init_cpu_mask_set(&info->proc);
-
-       local_mask = cpumask_of_node(dd->node);
-       if (cpumask_first(local_mask) >= nr_cpu_ids)
-               local_mask = topology_core_cpumask(0);
-       /* Use the "real" cpu mask of this node as the default */
-       cpumask_and(&info->def_intr.mask, &info->real_cpu_mask, local_mask);
-
-       /*  fill in the receive list */
-       possible = cpumask_weight(&info->def_intr.mask);
-       curr_cpu = cpumask_first(&info->def_intr.mask);
-       if (possible == 1) {
-               /*  only one CPU, everyone will use it */
-               cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask);
-       } else {
-               /*
-                * Retain the first CPU in the default list for the control
-                * context.
-                */
-               curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask);
-               /*
-                * Remove the remaining kernel receive queues from
-                * the default list and add them to the receive list.
-                */
-               for (i = 0; i < dd->n_krcv_queues - 1; i++) {
-                       cpumask_clear_cpu(curr_cpu, &info->def_intr.mask);
-                       cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask);
-                       curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask);
-                       if (curr_cpu >= nr_cpu_ids)
-                               break;
-               }
-       }
-
-       cpumask_copy(&info->proc.mask, cpu_online_mask);
-}
-
-void hfi1_dev_affinity_free(struct hfi1_devdata *dd)
-{
-       kfree(dd->affinity);
-}
-
-int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
-{
-       int ret;
-       cpumask_var_t diff;
-       struct cpu_mask_set *set;
-       struct sdma_engine *sde = NULL;
-       struct hfi1_ctxtdata *rcd = NULL;
-       char extra[64];
-       int cpu = -1;
-
-       extra[0] = '\0';
-       cpumask_clear(&msix->mask);
-
-       ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
-       if (!ret)
-               return -ENOMEM;
-
-       switch (msix->type) {
-       case IRQ_SDMA:
-               sde = (struct sdma_engine *)msix->arg;
-               scnprintf(extra, 64, "engine %u", sde->this_idx);
-               /* fall through */
-       case IRQ_GENERAL:
-               set = &dd->affinity->def_intr;
-               break;
-       case IRQ_RCVCTXT:
-               rcd = (struct hfi1_ctxtdata *)msix->arg;
-               if (rcd->ctxt == HFI1_CTRL_CTXT) {
-                       set = &dd->affinity->def_intr;
-                       cpu = cpumask_first(&set->mask);
-               } else {
-                       set = &dd->affinity->rcv_intr;
-               }
-               scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
-               break;
-       default:
-               dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type);
-               return -EINVAL;
-       }
-
-       /*
-        * The control receive context is placed on a particular CPU, which
-        * is set above.  Skip accounting for it.  Everything else finds its
-        * CPU here.
-        */
-       if (cpu == -1) {
-               spin_lock(&dd->affinity->lock);
-               if (cpumask_equal(&set->mask, &set->used)) {
-                       /*
-                        * We've used up all the CPUs, bump up the generation
-                        * and reset the 'used' map
-                        */
-                       set->gen++;
-                       cpumask_clear(&set->used);
-               }
-               cpumask_andnot(diff, &set->mask, &set->used);
-               cpu = cpumask_first(diff);
-               cpumask_set_cpu(cpu, &set->used);
-               spin_unlock(&dd->affinity->lock);
-       }
-
-       switch (msix->type) {
-       case IRQ_SDMA:
-               sde->cpu = cpu;
-               break;
-       case IRQ_GENERAL:
-       case IRQ_RCVCTXT:
-       case IRQ_OTHER:
-               break;
-       }
-
-       cpumask_set_cpu(cpu, &msix->mask);
-       dd_dev_info(dd, "IRQ vector: %u, type %s %s -> cpu: %d\n",
-                   msix->msix.vector, irq_type_names[msix->type],
-                   extra, cpu);
-       irq_set_affinity_hint(msix->msix.vector, &msix->mask);
-
-       free_cpumask_var(diff);
-       return 0;
-}
-
-void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
-                          struct hfi1_msix_entry *msix)
-{
-       struct cpu_mask_set *set = NULL;
-       struct hfi1_ctxtdata *rcd;
-
-       switch (msix->type) {
-       case IRQ_SDMA:
-       case IRQ_GENERAL:
-               set = &dd->affinity->def_intr;
-               break;
-       case IRQ_RCVCTXT:
-               rcd = (struct hfi1_ctxtdata *)msix->arg;
-               /* only do accounting for non control contexts */
-               if (rcd->ctxt != HFI1_CTRL_CTXT)
-                       set = &dd->affinity->rcv_intr;
-               break;
-       default:
-               return;
-       }
-
-       if (set) {
-               spin_lock(&dd->affinity->lock);
-               cpumask_andnot(&set->used, &set->used, &msix->mask);
-               if (cpumask_empty(&set->used) && set->gen) {
-                       set->gen--;
-                       cpumask_copy(&set->used, &set->mask);
-               }
-               spin_unlock(&dd->affinity->lock);
-       }
-
-       irq_set_affinity_hint(msix->msix.vector, NULL);
-       cpumask_clear(&msix->mask);
-}
-
-int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node)
-{
-       int cpu = -1, ret;
-       cpumask_var_t diff, mask, intrs;
-       const struct cpumask *node_mask,
-               *proc_mask = tsk_cpus_allowed(current);
-       struct cpu_mask_set *set = &dd->affinity->proc;
-       char buf[1024];
-
-       /*
-        * check whether process/context affinity has already
-        * been set
-        */
-       if (cpumask_weight(proc_mask) == 1) {
-               scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(proc_mask));
-               hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %s",
-                         current->pid, current->comm, buf);
-               /*
-                * Mark the pre-set CPU as used. This is atomic so we don't
-                * need the lock
-                */
-               cpu = cpumask_first(proc_mask);
-               cpumask_set_cpu(cpu, &set->used);
-               goto done;
-       } else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) {
-               scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(proc_mask));
-               hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %s",
-                         current->pid, current->comm, buf);
-               goto done;
-       }
-
-       /*
-        * The process does not have a preset CPU affinity so find one to
-        * recommend. We prefer CPUs on the same NUMA as the device.
-        */
-
-       ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
-       if (!ret)
-               goto done;
-       ret = zalloc_cpumask_var(&mask, GFP_KERNEL);
-       if (!ret)
-               goto free_diff;
-       ret = zalloc_cpumask_var(&intrs, GFP_KERNEL);
-       if (!ret)
-               goto free_mask;
-
-       spin_lock(&dd->affinity->lock);
-       /*
-        * If we've used all available CPUs, clear the mask and start
-        * overloading.
-        */
-       if (cpumask_equal(&set->mask, &set->used)) {
-               set->gen++;
-               cpumask_clear(&set->used);
-       }
-
-       /* CPUs used by interrupt handlers */
-       cpumask_copy(intrs, (dd->affinity->def_intr.gen ?
-                            &dd->affinity->def_intr.mask :
-                            &dd->affinity->def_intr.used));
-       cpumask_or(intrs, intrs, (dd->affinity->rcv_intr.gen ?
-                                 &dd->affinity->rcv_intr.mask :
-                                 &dd->affinity->rcv_intr.used));
-       scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(intrs));
-       hfi1_cdbg(PROC, "CPUs used by interrupts: %s", buf);
-
-       /*
-        * If we don't have a NUMA node requested, preference is towards
-        * device NUMA node
-        */
-       if (node == -1)
-               node = dd->node;
-       node_mask = cpumask_of_node(node);
-       scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(node_mask));
-       hfi1_cdbg(PROC, "device on NUMA %u, CPUs %s", node, buf);
-
-       /* diff will hold all unused cpus */
-       cpumask_andnot(diff, &set->mask, &set->used);
-       scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(diff));
-       hfi1_cdbg(PROC, "unused CPUs (all) %s", buf);
-
-       /* get cpumask of available CPUs on preferred NUMA */
-       cpumask_and(mask, diff, node_mask);
-       scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(mask));
-       hfi1_cdbg(PROC, "available cpus on NUMA %s", buf);
-
-       /*
-        * At first, we don't want to place processes on the same
-        * CPUs as interrupt handlers.
-        */
-       cpumask_andnot(diff, mask, intrs);
-       if (!cpumask_empty(diff))
-               cpumask_copy(mask, diff);
-
-       /*
-        * if we don't have a cpu on the preferred NUMA, get
-        * the list of the remaining available CPUs
-        */
-       if (cpumask_empty(mask)) {
-               cpumask_andnot(diff, &set->mask, &set->used);
-               cpumask_andnot(mask, diff, node_mask);
-       }
-       scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(mask));
-       hfi1_cdbg(PROC, "possible CPUs for process %s", buf);
-
-       cpu = cpumask_first(mask);
-       if (cpu >= nr_cpu_ids) /* empty */
-               cpu = -1;
-       else
-               cpumask_set_cpu(cpu, &set->used);
-       spin_unlock(&dd->affinity->lock);
-
-       free_cpumask_var(intrs);
-free_mask:
-       free_cpumask_var(mask);
-free_diff:
-       free_cpumask_var(diff);
-done:
-       return cpu;
-}
-
-void hfi1_put_proc_affinity(struct hfi1_devdata *dd, int cpu)
-{
-       struct cpu_mask_set *set = &dd->affinity->proc;
-
-       if (cpu < 0)
-               return;
-       spin_lock(&dd->affinity->lock);
-       cpumask_clear_cpu(cpu, &set->used);
-       if (cpumask_empty(&set->used) && set->gen) {
-               set->gen--;
-               cpumask_copy(&set->used, &set->mask);
-       }
-       spin_unlock(&dd->affinity->lock);
-}
-
diff --git a/drivers/staging/rdma/hfi1/affinity.h b/drivers/staging/rdma/hfi1/affinity.h
deleted file mode 100644 (file)
index 20f52fe..0000000
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#ifndef _HFI1_AFFINITY_H
-#define _HFI1_AFFINITY_H
-
-#include "hfi.h"
-
-enum irq_type {
-       IRQ_SDMA,
-       IRQ_RCVCTXT,
-       IRQ_GENERAL,
-       IRQ_OTHER
-};
-
-/* Can be used for both memory and cpu */
-enum affinity_flags {
-       AFF_AUTO,
-       AFF_NUMA_LOCAL,
-       AFF_DEV_LOCAL,
-       AFF_IRQ_LOCAL
-};
-
-struct cpu_mask_set {
-       struct cpumask mask;
-       struct cpumask used;
-       uint gen;
-};
-
-struct hfi1_affinity {
-       struct cpu_mask_set def_intr;
-       struct cpu_mask_set rcv_intr;
-       struct cpu_mask_set proc;
-       struct cpumask real_cpu_mask;
-       /* spin lock to protect affinity struct */
-       spinlock_t lock;
-};
-
-struct hfi1_msix_entry;
-
-/* Initialize non-HT cpu cores mask */
-int init_real_cpu_mask(struct hfi1_devdata *);
-/* Initialize driver affinity data */
-void hfi1_dev_affinity_init(struct hfi1_devdata *);
-/* Free driver affinity data */
-void hfi1_dev_affinity_free(struct hfi1_devdata *);
-/*
- * Set IRQ affinity to a CPU. The function will determine the
- * CPU and set the affinity to it.
- */
-int hfi1_get_irq_affinity(struct hfi1_devdata *, struct hfi1_msix_entry *);
-/*
- * Remove the IRQ's CPU affinity. This function also updates
- * any internal CPU tracking data
- */
-void hfi1_put_irq_affinity(struct hfi1_devdata *, struct hfi1_msix_entry *);
-/*
- * Determine a CPU affinity for a user process, if the process does not
- * have an affinity set yet.
- */
-int hfi1_get_proc_affinity(struct hfi1_devdata *, int);
-/* Release a CPU used by a user process. */
-void hfi1_put_proc_affinity(struct hfi1_devdata *, int);
-
-#endif /* _HFI1_AFFINITY_H */
diff --git a/drivers/staging/rdma/hfi1/aspm.h b/drivers/staging/rdma/hfi1/aspm.h
deleted file mode 100644 (file)
index 0d58fe3..0000000
+++ /dev/null
@@ -1,309 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#ifndef _ASPM_H
-#define _ASPM_H
-
-#include "hfi.h"
-
-extern uint aspm_mode;
-
-/*
- * Values that the global 'aspm_mode' variable is compared against by the
- * helpers in this header.
- */
-enum aspm_mode {
-       ASPM_MODE_DISABLED = 0, /* ASPM always disabled, performance mode */
-       ASPM_MODE_ENABLED = 1,  /* ASPM always enabled, power saving mode */
-       ASPM_MODE_DYNAMIC = 2,  /* ASPM enabled/disabled dynamically */
-};
-
-/* Time after which the timer interrupt will re-enable ASPM */
-#define ASPM_TIMER_MS 1000
-/* Time for which interrupts are ignored after a timer has been scheduled */
-#define ASPM_RESCHED_TIMER_MS (ASPM_TIMER_MS / 2)
-/* Two interrupts within this time trigger ASPM disable */
-#define ASPM_TRIGGER_MS 1
-#define ASPM_TRIGGER_NS (ASPM_TRIGGER_MS * 1000 * 1000ull)
-/*
- * Extract the L1 bit from a PCIe Link Capabilities register value: the
- * ASPM support field is moved down to bit 0 and masked with 0x2.  The
- * result is non-zero when L1 is advertised.  The argument is
- * parenthesized so that any expression can be passed safely.
- */
-#define ASPM_L1_SUPPORTED(reg) \
-       ((((reg) & PCI_EXP_LNKCAP_ASPMS) >> 10) & 0x2)
-
-/*
- * Report whether ASPM L1 can be used on the link between this device and
- * its upstream component.  Both ends must advertise L1 in their Link
- * Capabilities register, except that an A-step device is accepted even
- * though it does not advertise it.
- */
-static inline bool aspm_hw_l1_supported(struct hfi1_devdata *dd)
-{
-       struct pci_dev *upstream = dd->pcidev->bus->self;
-       u32 dev_cap, up_cap;
-       bool dev_l1, up_l1;
-
-       /*
-        * Without access to the upstream component ASPM L1 cannot be
-        * supported at all.
-        */
-       if (!upstream)
-               return false;
-
-       pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &dev_cap);
-       dev_l1 = ASPM_L1_SUPPORTED(dev_cap) != 0;
-
-       pcie_capability_read_dword(upstream, PCI_EXP_LNKCAP, &up_cap);
-       up_l1 = ASPM_L1_SUPPORTED(up_cap) != 0;
-
-       /* ASPM works on A-step but is reported as not supported */
-       return (dev_l1 || is_ax(dd)) && up_l1;
-}
-
-/*
- * Program the L1 entrance latency field of PCIE_CFG_REG_PL3 with the
- * value 0x4 so that the link enters L1 more slowly.  All other bits of
- * the register are written back unchanged.
- */
-static inline void aspm_hw_set_l1_ent_latency(struct hfi1_devdata *dd)
-{
-       const u32 latency = 0x4u;
-       u32 pl3;
-
-       pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, &pl3);
-       /* replace only the latency field */
-       pl3 &= ~PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SMASK;
-       pl3 |= latency << PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SHIFT;
-       pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, pl3);
-}
-
-/*
- * Select ASPM L1 in the Link Control register of both link partners.
- * Nothing is written when the upstream component is not accessible.
- */
-static inline void aspm_hw_enable_l1(struct hfi1_devdata *dd)
-{
-       struct pci_dev *upstream = dd->pcidev->bus->self;
-
-       /*
-        * If the driver does not have access to the upstream component,
-        * it cannot support ASPM L1 at all.
-        */
-       if (upstream) {
-               /* Upstream component first, then the device itself */
-               pcie_capability_clear_and_set_word(upstream, PCI_EXP_LNKCTL,
-                                                  PCI_EXP_LNKCTL_ASPMC,
-                                                  PCI_EXP_LNKCTL_ASPM_L1);
-               pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL,
-                                                  PCI_EXP_LNKCTL_ASPMC,
-                                                  PCI_EXP_LNKCTL_ASPM_L1);
-       }
-}
-
-/*
- * Clear the ASPM control field in the Link Control register of the
- * device and, when it is accessible, of the upstream component.
- */
-static inline void aspm_hw_disable_l1(struct hfi1_devdata *dd)
-{
-       struct pci_dev *upstream = dd->pcidev->bus->self;
-
-       /* The device (downstream end) goes first, the upstream end second */
-       pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL,
-                                          PCI_EXP_LNKCTL_ASPMC, 0x0);
-       if (!upstream)
-               return;
-
-       pcie_capability_clear_and_set_word(upstream, PCI_EXP_LNKCTL,
-                                          PCI_EXP_LNKCTL_ASPMC, 0x0);
-}
-
-/*
- * Enable ASPM L1 in hardware and record that in dd->aspm_enabled.
- * Does nothing when ASPM is already enabled, when aspm_mode is
- * ASPM_MODE_DISABLED, or when the device does not support ASPM.
- */
-static inline void aspm_enable(struct hfi1_devdata *dd)
-{
-       if (dd->aspm_enabled)
-               return;
-       if (aspm_mode == ASPM_MODE_DISABLED)
-               return;
-       if (!dd->aspm_supported)
-               return;
-
-       aspm_hw_enable_l1(dd);
-       dd->aspm_enabled = true;
-}
-
-/*
- * Disable ASPM L1 in hardware and record that in dd->aspm_enabled.
- * Does nothing when ASPM is not currently enabled or when aspm_mode is
- * ASPM_MODE_ENABLED.
- */
-static inline void aspm_disable(struct hfi1_devdata *dd)
-{
-       if (dd->aspm_enabled && aspm_mode != ASPM_MODE_ENABLED) {
-               aspm_hw_disable_l1(dd);
-               dd->aspm_enabled = false;
-       }
-}
-
-/*
- * Disable ASPM and count one more requester that wants it off.
- * Both steps run under dd->aspm_lock with interrupts saved and disabled.
- */
-static inline void aspm_disable_inc(struct hfi1_devdata *dd)
-{
-       unsigned long irq_flags;
-
-       spin_lock_irqsave(&dd->aspm_lock, irq_flags);
-       aspm_disable(dd);
-       atomic_inc(&dd->aspm_disabled_cnt);
-       spin_unlock_irqrestore(&dd->aspm_lock, irq_flags);
-}
-
-/*
- * Drop one "keep ASPM off" request; when the count reaches zero ASPM is
- * enabled again.  Runs under dd->aspm_lock with interrupts saved and
- * disabled.
- */
-static inline void aspm_enable_dec(struct hfi1_devdata *dd)
-{
-       unsigned long irq_flags;
-       bool last;
-
-       spin_lock_irqsave(&dd->aspm_lock, irq_flags);
-       last = atomic_dec_and_test(&dd->aspm_disabled_cnt);
-       if (last)
-               aspm_enable(dd);
-       spin_unlock_irqrestore(&dd->aspm_lock, irq_flags);
-}
-
-/*
- * ASPM processing for each receive context interrupt
- *
- * Records the time of this interrupt and, when it follows the previous
- * one by less than ASPM_TRIGGER_NS, disables ASPM for the device and
- * (re)arms the per-context timer that re-enables it later.  Returns
- * immediately when the context does not take part in interrupt driven
- * ASPM handling.
- *
- * Locking: rcd->aspm_lock is held for the whole body; aspm_disable_inc()
- * additionally takes dd->aspm_lock inside it.
- */
-static inline void aspm_ctx_disable(struct hfi1_ctxtdata *rcd)
-{
-       bool restart_timer;
-       bool close_interrupts;
-       unsigned long flags;
-       ktime_t now, prev;
-
-       /* Quickest exit for minimum impact */
-       if (!rcd->aspm_intr_supported)
-               return;
-
-       spin_lock_irqsave(&rcd->aspm_lock, flags);
-       /* PSM contexts are open */
-       if (!rcd->aspm_intr_enable)
-               goto unlock;
-
-       /* remember the previous interrupt time before overwriting it */
-       prev = rcd->aspm_ts_last_intr;
-       now = ktime_get();
-       rcd->aspm_ts_last_intr = now;
-
-       /* An interrupt pair close together in time */
-       close_interrupts = ktime_to_ns(ktime_sub(now, prev)) < ASPM_TRIGGER_NS;
-
-       /* Don't push out our timer till this much time has elapsed */
-       restart_timer = ktime_to_ns(ktime_sub(now, rcd->aspm_ts_timer_sched)) >
-                                   ASPM_RESCHED_TIMER_MS * NSEC_PER_MSEC;
-       restart_timer = restart_timer && close_interrupts;
-
-       /* Disable ASPM and schedule timer */
-       if (rcd->aspm_enabled && close_interrupts) {
-               aspm_disable_inc(rcd->dd);
-               rcd->aspm_enabled = false;
-               /* a fresh disable always arms the timer */
-               restart_timer = true;
-       }
-
-       if (restart_timer) {
-               mod_timer(&rcd->aspm_timer,
-                         jiffies + msecs_to_jiffies(ASPM_TIMER_MS));
-               rcd->aspm_ts_timer_sched = now;
-       }
-unlock:
-       spin_unlock_irqrestore(&rcd->aspm_lock, flags);
-}
-
-/*
- * Timer callback that re-enables ASPM in the absence of interrupt
- * activity.  'data' carries the struct hfi1_ctxtdata pointer that was
- * registered with the timer.
- */
-static inline void aspm_ctx_timer_function(unsigned long data)
-{
-       struct hfi1_ctxtdata *ctx = (struct hfi1_ctxtdata *)data;
-       unsigned long irq_flags;
-
-       spin_lock_irqsave(&ctx->aspm_lock, irq_flags);
-       /* give back this context's "keep ASPM off" request */
-       aspm_enable_dec(ctx->dd);
-       ctx->aspm_enabled = true;
-       spin_unlock_irqrestore(&ctx->aspm_lock, irq_flags);
-}
-
-/*
- * Disable interrupt processing for verbs contexts when PSM contexts are open
- *
- * For every context below dd->first_user_ctxt the ASPM timer is stopped
- * and aspm_intr_enable is cleared under the context lock.  Afterwards
- * ASPM is disabled for the device and the disable count is reset to 0.
- * NOTE(review): aspm_disable() is called here without dd->aspm_lock, and
- * the flag is cleared only after del_timer_sync() returns - confirm that
- * callers exclude concurrent receive interrupts.
- */
-static inline void aspm_disable_all(struct hfi1_devdata *dd)
-{
-       struct hfi1_ctxtdata *rcd;
-       unsigned long flags;
-       unsigned i;
-
-       for (i = 0; i < dd->first_user_ctxt; i++) {
-               rcd = dd->rcd[i];
-               del_timer_sync(&rcd->aspm_timer);
-               spin_lock_irqsave(&rcd->aspm_lock, flags);
-               rcd->aspm_intr_enable = false;
-               spin_unlock_irqrestore(&rcd->aspm_lock, flags);
-       }
-
-       aspm_disable(dd);
-       atomic_set(&dd->aspm_disabled_cnt, 0);
-}
-
-/*
- * Re-enable interrupt processing for verbs contexts.
- *
- * ASPM is enabled for the device first.  Only in ASPM_MODE_DYNAMIC are
- * the contexts below dd->first_user_ctxt then marked as enabled for
- * interrupt driven ASPM handling, each under its own lock.
- */
-static inline void aspm_enable_all(struct hfi1_devdata *dd)
-{
-       struct hfi1_ctxtdata *ctx;
-       unsigned long irq_flags;
-       unsigned idx;
-
-       aspm_enable(dd);
-
-       if (aspm_mode == ASPM_MODE_DYNAMIC) {
-               for (idx = 0; idx < dd->first_user_ctxt; idx++) {
-                       ctx = dd->rcd[idx];
-                       spin_lock_irqsave(&ctx->aspm_lock, irq_flags);
-                       ctx->aspm_intr_enable = true;
-                       ctx->aspm_enabled = true;
-                       spin_unlock_irqrestore(&ctx->aspm_lock, irq_flags);
-               }
-       }
-}
-
-/*
- * Set up the per-context ASPM state: the lock, the re-enable timer and
- * the aspm_intr_supported flag.  The flag is true only when the device
- * supports ASPM, aspm_mode is ASPM_MODE_DYNAMIC and the context number
- * lies below dd->first_user_ctxt.
- */
-static inline void aspm_ctx_init(struct hfi1_ctxtdata *rcd)
-{
-       bool supported;
-
-       spin_lock_init(&rcd->aspm_lock);
-       setup_timer(&rcd->aspm_timer, aspm_ctx_timer_function,
-                   (unsigned long)rcd);
-
-       supported = rcd->dd->aspm_supported &&
-                   aspm_mode == ASPM_MODE_DYNAMIC &&
-                   rcd->ctxt < rcd->dd->first_user_ctxt;
-       rcd->aspm_intr_supported = supported;
-}
-
-/*
- * Device level ASPM initialization: set up the device lock, probe L1
- * support, initialize every context below dd->first_user_ctxt, program
- * the L1 entrance latency, force ASPM off in hardware and finally let
- * aspm_enable_all() turn it on again if the configuration allows it.
- */
-static inline void aspm_init(struct hfi1_devdata *dd)
-{
-       unsigned i;
-
-       spin_lock_init(&dd->aspm_lock);
-       /* must be set before aspm_ctx_init() reads it */
-       dd->aspm_supported = aspm_hw_l1_supported(dd);
-
-       for (i = 0; i < dd->first_user_ctxt; i++)
-               aspm_ctx_init(dd->rcd[i]);
-
-       /* Start with ASPM disabled */
-       aspm_hw_set_l1_ent_latency(dd);
-       dd->aspm_enabled = false;
-       aspm_hw_disable_l1(dd);
-
-       /* Now turn on ASPM if configured */
-       aspm_enable_all(dd);
-}
-
-/*
- * Device level ASPM teardown: stop all per-context ASPM processing, then
- * try to leave ASPM enabled (aspm_enable() itself still honours
- * aspm_mode and dd->aspm_supported).
- */
-static inline void aspm_exit(struct hfi1_devdata *dd)
-{
-       aspm_disable_all(dd);
-
-       /* Turn on ASPM on exit to conserve power */
-       aspm_enable(dd);
-}
-
-#endif /* _ASPM_H */
diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c
deleted file mode 100644 (file)
index dcae8e7..0000000
+++ /dev/null
@@ -1,14693 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-/*
- * This file contains all of the code that is specific to the HFI chip
- */
-
-#include <linux/pci.h>
-#include <linux/delay.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-
-#include "hfi.h"
-#include "trace.h"
-#include "mad.h"
-#include "pio.h"
-#include "sdma.h"
-#include "eprom.h"
-#include "efivar.h"
-#include "platform.h"
-#include "aspm.h"
-
-#define NUM_IB_PORTS 1
-
-/*
- * Module parameters.  All of them are registered with S_IRUGO, i.e. they
- * are readable but not writable through sysfs.
- */
-uint kdeth_qp;
-module_param_named(kdeth_qp, kdeth_qp, uint, S_IRUGO);
-MODULE_PARM_DESC(kdeth_qp, "Set the KDETH queue pair prefix");
-
-uint num_vls = HFI1_MAX_VLS_SUPPORTED;
-module_param(num_vls, uint, S_IRUGO);
-MODULE_PARM_DESC(num_vls, "Set number of Virtual Lanes to use (1-8)");
-
-/*
- * Default time to aggregate two 10K packets from the idle state
- * (timer not running). The timer starts at the end of the first packet,
- * so only the time for one 10K packet and header plus a bit extra is needed.
- * 10 * 1024 + 64 header byte = 10304 byte
- * 10304 byte / 12.5 GB/s = 824.32ns
- */
-uint rcv_intr_timeout = (824 + 16); /* 16 is for coalescing interrupt */
-module_param(rcv_intr_timeout, uint, S_IRUGO);
-MODULE_PARM_DESC(rcv_intr_timeout, "Receive interrupt mitigation timeout in ns");
-
-uint rcv_intr_count = 16; /* same as qib */
-module_param(rcv_intr_count, uint, S_IRUGO);
-MODULE_PARM_DESC(rcv_intr_count, "Receive interrupt mitigation count");
-
-ushort link_crc_mask = SUPPORTED_CRCS;
-module_param(link_crc_mask, ushort, S_IRUGO);
-MODULE_PARM_DESC(link_crc_mask, "CRCs to use on the link");
-
-uint loopback;
-module_param_named(loopback, loopback, uint, S_IRUGO);
-/* description text is shown by modinfo; keep the parentheses balanced */
-MODULE_PARM_DESC(loopback, "Put into loopback mode (1 = serdes, 3 = external cable)");
-
-/* Other driver tunables */
-uint rcv_intr_dynamic = 1; /* enable dynamic mode for rcv int mitigation*/
-static ushort crc_14b_sideband = 1;
-static uint use_flr = 1;
-uint quick_linkup; /* skip LNI */
-
-/*
- * One row of a status-flag description table: a 64-bit flag mask, its
- * printable name and a 16-bit field of table specific extra data.
- */
-struct flag_table {
-       u64 flag;       /* the flag */
-       char *str;      /* description string */
-       u16 extra;      /* extra information */
-       u16 unused0;
-       u32 unused1;
-};
-
-/*
- * Initializers for struct flag_table rows.  Note that the macro argument
- * order (str, extra, flag) differs from the member order (flag, str, extra).
- * str must be a string constant
- */
-#define FLAG_ENTRY(str, extra, flag) {flag, str, extra}
-#define FLAG_ENTRY0(str, flag) {flag, str, 0}
-
-/*
- * Send Error Consequences
- * Single-bit values that may be OR-ed together.
- */
-#define SEC_WRITE_DROPPED      0x1
-#define SEC_PACKET_DROPPED     0x2
-#define SEC_SC_HALTED          0x4     /* per-context only */
-#define SEC_SPC_FREEZE         0x8     /* per-HFI only */
-
-#define MIN_KERNEL_KCTXTS         2
-#define FIRST_KERNEL_KCTXT        1
-/* sizes for both the QP and RSM map tables */
-#define NUM_MAP_ENTRIES                256
-#define NUM_MAP_REGS             32
-
-/* Bit offset into the GUID which carries HFI id information */
-#define GUID_HFI_INDEX_SHIFT     39
-
-/* extract the emulation revision: everything above the low 8 bits of irev */
-#define emulator_rev(dd) ((dd)->irev >> 8)
-/*
- * parallel and serial emulation versions are 3 and 4 respectively
- * (tested against the low 4 bits of irev)
- */
-#define is_emulator_p(dd) ((((dd)->irev) & 0xf) == 3)
-#define is_emulator_s(dd) ((((dd)->irev) & 0xf) == 4)
-
-/*
- * RSM fields
- *
- * The *_OFFSET() helpers below combine a quad-word index, shifted left by
- * QW_SHIFT, with a bit offset inside that quad word.
- */
-
-/* packet type */
-#define IB_PACKET_TYPE         2ull
-#define QW_SHIFT               6ull
-/* QPN[7..1] */
-#define QPN_WIDTH              7ull
-
-/* LRH.BTH: QW 0, OFFSET 48 - for match */
-#define LRH_BTH_QW             0ull
-#define LRH_BTH_BIT_OFFSET     48ull
-#define LRH_BTH_OFFSET(off)    ((LRH_BTH_QW << QW_SHIFT) | (off))
-#define LRH_BTH_MATCH_OFFSET   LRH_BTH_OFFSET(LRH_BTH_BIT_OFFSET)
-/* NOTE(review): LRH_BTH_SELECT is defined with an empty expansion */
-#define LRH_BTH_SELECT
-#define LRH_BTH_MASK           3ull
-#define LRH_BTH_VALUE          2ull
-
-/* LRH.SC[3..0] QW 0, OFFSET 56 - for match */
-#define LRH_SC_QW              0ull
-#define LRH_SC_BIT_OFFSET      56ull
-#define LRH_SC_OFFSET(off)     ((LRH_SC_QW << QW_SHIFT) | (off))
-#define LRH_SC_MATCH_OFFSET    LRH_SC_OFFSET(LRH_SC_BIT_OFFSET)
-#define LRH_SC_MASK            128ull
-#define LRH_SC_VALUE           0ull
-
-/* SC[n..0] QW 0, OFFSET 60 - for select */
-#define LRH_SC_SELECT_OFFSET  ((LRH_SC_QW << QW_SHIFT) | (60ull))
-
-/* QPN[m+n:1] QW 1, OFFSET 1 */
-#define QPN_SELECT_OFFSET      ((1ull << QW_SHIFT) | (1ull))
-
-/*
- * defines to build power on SC2VL table
- *
- * SC2VL_VAL builds one 64-bit value from eight (SC number, value) pairs:
- * every value is cast to u64, shifted by the matching
- * SEND_SC2VLT<num>_SC<sc>_SHIFT constant and the results are OR-ed.
- * 'num' and the sc arguments are token-pasted, so they must be literal
- * numbers.
- */
-#define SC2VL_VAL( \
-       num, \
-       sc0, sc0val, \
-       sc1, sc1val, \
-       sc2, sc2val, \
-       sc3, sc3val, \
-       sc4, sc4val, \
-       sc5, sc5val, \
-       sc6, sc6val, \
-       sc7, sc7val) \
-( \
-       ((u64)(sc0val) << SEND_SC2VLT##num##_SC##sc0##_SHIFT) | \
-       ((u64)(sc1val) << SEND_SC2VLT##num##_SC##sc1##_SHIFT) | \
-       ((u64)(sc2val) << SEND_SC2VLT##num##_SC##sc2##_SHIFT) | \
-       ((u64)(sc3val) << SEND_SC2VLT##num##_SC##sc3##_SHIFT) | \
-       ((u64)(sc4val) << SEND_SC2VLT##num##_SC##sc4##_SHIFT) | \
-       ((u64)(sc5val) << SEND_SC2VLT##num##_SC##sc5##_SHIFT) | \
-       ((u64)(sc6val) << SEND_SC2VLT##num##_SC##sc6##_SHIFT) | \
-       ((u64)(sc7val) << SEND_SC2VLT##num##_SC##sc7##_SHIFT)   \
-)
-
-/*
- * DC_SC_VL_VAL builds one 64-bit value from sixteen (entry number, value)
- * pairs: every value is cast to u64, shifted by the matching
- * DCC_CFG_SC_VL_TABLE_<range>_ENTRY<e>_SHIFT constant and the results are
- * OR-ed.  'range' and the entry arguments are token-pasted, so they must
- * be literal tokens.
- */
-#define DC_SC_VL_VAL( \
-       range, \
-       e0, e0val, \
-       e1, e1val, \
-       e2, e2val, \
-       e3, e3val, \
-       e4, e4val, \
-       e5, e5val, \
-       e6, e6val, \
-       e7, e7val, \
-       e8, e8val, \
-       e9, e9val, \
-       e10, e10val, \
-       e11, e11val, \
-       e12, e12val, \
-       e13, e13val, \
-       e14, e14val, \
-       e15, e15val) \
-( \
-       ((u64)(e0val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e0##_SHIFT) | \
-       ((u64)(e1val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e1##_SHIFT) | \
-       ((u64)(e2val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e2##_SHIFT) | \
-       ((u64)(e3val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e3##_SHIFT) | \
-       ((u64)(e4val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e4##_SHIFT) | \
-       ((u64)(e5val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e5##_SHIFT) | \
-       ((u64)(e6val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e6##_SHIFT) | \
-       ((u64)(e7val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e7##_SHIFT) | \
-       ((u64)(e8val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e8##_SHIFT) | \
-       ((u64)(e9val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e9##_SHIFT) | \
-       ((u64)(e10val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e10##_SHIFT) | \
-       ((u64)(e11val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e11##_SHIFT) | \
-       ((u64)(e12val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e12##_SHIFT) | \
-       ((u64)(e13val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e13##_SHIFT) | \
-       ((u64)(e14val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e14##_SHIFT) | \
-       ((u64)(e15val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e15##_SHIFT) \
-)
-
-/* all CceStatus sub-block freeze bits (SDMA, RXE, TXE and TXE PIO) */
-#define ALL_FROZE (CCE_STATUS_SDMA_FROZE_SMASK \
-                       | CCE_STATUS_RXE_FROZE_SMASK \
-                       | CCE_STATUS_TXE_FROZE_SMASK \
-                       | CCE_STATUS_TXE_PIO_FROZE_SMASK)
-/* all CceStatus sub-block TXE pause bits (TXE PIO, TXE and SDMA) */
-#define ALL_TXE_PAUSE (CCE_STATUS_TXE_PIO_PAUSED_SMASK \
-                       | CCE_STATUS_TXE_PAUSED_SMASK \
-                       | CCE_STATUS_SDMA_PAUSED_SMASK)
-/* all CceStatus sub-block RXE pause bits (a single bit) */
-#define ALL_RXE_PAUSE CCE_STATUS_RXE_PAUSED_SMASK
-
-/*
- * CCE Error flags.
- *
- * One row per CceErrStatus bit; the leading comment on each row is the
- * bit number.  Every description string is the CamelCase form of the
- * error named in the row's CCE_ERR_STATUS_*_SMASK constant, so that each
- * bit is reported under its own, unique name.
- */
-static struct flag_table cce_err_status_flags[] = {
-/* 0*/ FLAG_ENTRY0("CceCsrParityErr",
-               CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK),
-/* 1*/ FLAG_ENTRY0("CceCsrReadBadAddrErr",
-               CCE_ERR_STATUS_CCE_CSR_READ_BAD_ADDR_ERR_SMASK),
-/* 2*/ FLAG_ENTRY0("CceCsrWriteBadAddrErr",
-               CCE_ERR_STATUS_CCE_CSR_WRITE_BAD_ADDR_ERR_SMASK),
-/* 3*/ FLAG_ENTRY0("CceTrgtAsyncFifoParityErr",
-               CCE_ERR_STATUS_CCE_TRGT_ASYNC_FIFO_PARITY_ERR_SMASK),
-/* 4*/ FLAG_ENTRY0("CceTrgtAccessErr",
-               CCE_ERR_STATUS_CCE_TRGT_ACCESS_ERR_SMASK),
-/* 5*/ FLAG_ENTRY0("CceRspdDataParityErr",
-               CCE_ERR_STATUS_CCE_RSPD_DATA_PARITY_ERR_SMASK),
-/* 6*/ FLAG_ENTRY0("CceCli0AsyncFifoParityErr",
-               CCE_ERR_STATUS_CCE_CLI0_ASYNC_FIFO_PARITY_ERR_SMASK),
-/* 7*/ FLAG_ENTRY0("CceCsrCfgBusParityErr",
-               CCE_ERR_STATUS_CCE_CSR_CFG_BUS_PARITY_ERR_SMASK),
-/* 8*/ FLAG_ENTRY0("CceCli2AsyncFifoParityErr",
-               CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK),
-/* 9*/ FLAG_ENTRY0("CceCli1AsyncFifoPioCrdtParityErr",
-           CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR_SMASK),
-/*10*/ FLAG_ENTRY0("CceCli1AsyncFifoSdmaHdParityErr",
-           CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR_SMASK),
-/*11*/ FLAG_ENTRY0("CceCli1AsyncFifoRxdmaParityError",
-           CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERROR_SMASK),
-/*12*/ FLAG_ENTRY0("CceCli1AsyncFifoDbgParityError",
-               CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERROR_SMASK),
-/*13*/ FLAG_ENTRY0("PcicRetryMemCorErr",
-               CCE_ERR_STATUS_PCIC_RETRY_MEM_COR_ERR_SMASK),
-/*14*/ FLAG_ENTRY0("PcicRetrySotMemCorErr",
-               CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_COR_ERR_SMASK),
-/*15*/ FLAG_ENTRY0("PcicPostHdQCorErr",
-               CCE_ERR_STATUS_PCIC_POST_HD_QCOR_ERR_SMASK),
-/*16*/ FLAG_ENTRY0("PcicPostDatQCorErr",
-               CCE_ERR_STATUS_PCIC_POST_DAT_QCOR_ERR_SMASK),
-/*17*/ FLAG_ENTRY0("PcicCplHdQCorErr",
-               CCE_ERR_STATUS_PCIC_CPL_HD_QCOR_ERR_SMASK),
-/*18*/ FLAG_ENTRY0("PcicCplDatQCorErr",
-               CCE_ERR_STATUS_PCIC_CPL_DAT_QCOR_ERR_SMASK),
-/*19*/ FLAG_ENTRY0("PcicNPostHQParityErr",
-               CCE_ERR_STATUS_PCIC_NPOST_HQ_PARITY_ERR_SMASK),
-/*20*/ FLAG_ENTRY0("PcicNPostDatQParityErr",
-               CCE_ERR_STATUS_PCIC_NPOST_DAT_QPARITY_ERR_SMASK),
-/*21*/ FLAG_ENTRY0("PcicRetryMemUncErr",
-               CCE_ERR_STATUS_PCIC_RETRY_MEM_UNC_ERR_SMASK),
-/*22*/ FLAG_ENTRY0("PcicRetrySotMemUncErr",
-               CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_UNC_ERR_SMASK),
-/*23*/ FLAG_ENTRY0("PcicPostHdQUncErr",
-               CCE_ERR_STATUS_PCIC_POST_HD_QUNC_ERR_SMASK),
-/*24*/ FLAG_ENTRY0("PcicPostDatQUncErr",
-               CCE_ERR_STATUS_PCIC_POST_DAT_QUNC_ERR_SMASK),
-/*25*/ FLAG_ENTRY0("PcicCplHdQUncErr",
-               CCE_ERR_STATUS_PCIC_CPL_HD_QUNC_ERR_SMASK),
-/*26*/ FLAG_ENTRY0("PcicCplDatQUncErr",
-               CCE_ERR_STATUS_PCIC_CPL_DAT_QUNC_ERR_SMASK),
-/*27*/ FLAG_ENTRY0("PcicTransmitFrontParityErr",
-               CCE_ERR_STATUS_PCIC_TRANSMIT_FRONT_PARITY_ERR_SMASK),
-/*28*/ FLAG_ENTRY0("PcicTransmitBackParityErr",
-               CCE_ERR_STATUS_PCIC_TRANSMIT_BACK_PARITY_ERR_SMASK),
-/*29*/ FLAG_ENTRY0("PcicReceiveParityErr",
-               CCE_ERR_STATUS_PCIC_RECEIVE_PARITY_ERR_SMASK),
-/*30*/ FLAG_ENTRY0("CceTrgtCplTimeoutErr",
-               CCE_ERR_STATUS_CCE_TRGT_CPL_TIMEOUT_ERR_SMASK),
-/*31*/ FLAG_ENTRY0("LATriggered",
-               CCE_ERR_STATUS_LA_TRIGGERED_SMASK),
-/*32*/ FLAG_ENTRY0("CceSegReadBadAddrErr",
-               CCE_ERR_STATUS_CCE_SEG_READ_BAD_ADDR_ERR_SMASK),
-/*33*/ FLAG_ENTRY0("CceSegWriteBadAddrErr",
-               CCE_ERR_STATUS_CCE_SEG_WRITE_BAD_ADDR_ERR_SMASK),
-/*34*/ FLAG_ENTRY0("CceRcplAsyncFifoParityErr",
-               CCE_ERR_STATUS_CCE_RCPL_ASYNC_FIFO_PARITY_ERR_SMASK),
-/*35*/ FLAG_ENTRY0("CceRxdmaConvFifoParityErr",
-               CCE_ERR_STATUS_CCE_RXDMA_CONV_FIFO_PARITY_ERR_SMASK),
-/*36*/ FLAG_ENTRY0("CceMsixTableCorErr",
-               CCE_ERR_STATUS_CCE_MSIX_TABLE_COR_ERR_SMASK),
-/*37*/ FLAG_ENTRY0("CceMsixTableUncErr",
-               CCE_ERR_STATUS_CCE_MSIX_TABLE_UNC_ERR_SMASK),
-/*38*/ FLAG_ENTRY0("CceIntMapCorErr",
-               CCE_ERR_STATUS_CCE_INT_MAP_COR_ERR_SMASK),
-/*39*/ FLAG_ENTRY0("CceIntMapUncErr",
-               CCE_ERR_STATUS_CCE_INT_MAP_UNC_ERR_SMASK),
-/*40*/ FLAG_ENTRY0("CceMsixCsrParityErr",
-               CCE_ERR_STATUS_CCE_MSIX_CSR_PARITY_ERR_SMASK),
-/*41-63 reserved*/
-};
-
-/*
- * Misc Error flags
- *
- * MES(text) expands to MISC_ERR_STATUS_MISC_<text>_ERR_SMASK; each row
- * uses the same text for its description string and for its mask.
- */
-#define MES(text) MISC_ERR_STATUS_MISC_##text##_ERR_SMASK
-static struct flag_table misc_err_status_flags[] = {
-/* 0*/ FLAG_ENTRY0("CSR_PARITY", MES(CSR_PARITY)),
-/* 1*/ FLAG_ENTRY0("CSR_READ_BAD_ADDR", MES(CSR_READ_BAD_ADDR)),
-/* 2*/ FLAG_ENTRY0("CSR_WRITE_BAD_ADDR", MES(CSR_WRITE_BAD_ADDR)),
-/* 3*/ FLAG_ENTRY0("SBUS_WRITE_FAILED", MES(SBUS_WRITE_FAILED)),
-/* 4*/ FLAG_ENTRY0("KEY_MISMATCH", MES(KEY_MISMATCH)),
-/* 5*/ FLAG_ENTRY0("FW_AUTH_FAILED", MES(FW_AUTH_FAILED)),
-/* 6*/ FLAG_ENTRY0("EFUSE_CSR_PARITY", MES(EFUSE_CSR_PARITY)),
-/* 7*/ FLAG_ENTRY0("EFUSE_READ_BAD_ADDR", MES(EFUSE_READ_BAD_ADDR)),
-/* 8*/ FLAG_ENTRY0("EFUSE_WRITE", MES(EFUSE_WRITE)),
-/* 9*/ FLAG_ENTRY0("EFUSE_DONE_PARITY", MES(EFUSE_DONE_PARITY)),
-/*10*/ FLAG_ENTRY0("INVALID_EEP_CMD", MES(INVALID_EEP_CMD)),
-/*11*/ FLAG_ENTRY0("MBIST_FAIL", MES(MBIST_FAIL)),
-/*12*/ FLAG_ENTRY0("PLL_LOCK_FAIL", MES(PLL_LOCK_FAIL))
-};
-
-/*
- * TXE PIO Error flags and consequences
- *
- * Each row is FLAG_ENTRY(name, consequence, mask).  The middle argument
- * is a bitwise OR of SEC_* values (0 when none is listed) and ends up in
- * the 'extra' member of struct flag_table.  The leading comment on each
- * row is the bit number.
- */
-static struct flag_table pio_err_status_flags[] = {
-/* 0*/ FLAG_ENTRY("PioWriteBadCtxt",
-       SEC_WRITE_DROPPED,
-       SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK),
-/* 1*/ FLAG_ENTRY("PioWriteAddrParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK),
-/* 2*/ FLAG_ENTRY("PioCsrParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK),
-/* 3*/ FLAG_ENTRY("PioSbMemFifo0",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK),
-/* 4*/ FLAG_ENTRY("PioSbMemFifo1",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK),
-/* 5*/ FLAG_ENTRY("PioPccFifoParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK),
-/* 6*/ FLAG_ENTRY("PioPecFifoParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK),
-/* 7*/ FLAG_ENTRY("PioSbrdctlCrrelParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK),
-/* 8*/ FLAG_ENTRY("PioSbrdctrlCrrelFifoParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK),
-/* 9*/ FLAG_ENTRY("PioPktEvictFifoParityErr",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK),
-/*10*/ FLAG_ENTRY("PioSmPktResetParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK),
-/*11*/ FLAG_ENTRY("PioVlLenMemBank0Unc",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK),
-/*12*/ FLAG_ENTRY("PioVlLenMemBank1Unc",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK),
-/*13*/ FLAG_ENTRY("PioVlLenMemBank0Cor",
-       0,
-       SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_COR_ERR_SMASK),
-/*14*/ FLAG_ENTRY("PioVlLenMemBank1Cor",
-       0,
-       SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_COR_ERR_SMASK),
-/*15*/ FLAG_ENTRY("PioCreditRetFifoParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK),
-/*16*/ FLAG_ENTRY("PioPpmcPblFifo",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK),
-/*17*/ FLAG_ENTRY("PioInitSmIn",
-       0,
-       SEND_PIO_ERR_STATUS_PIO_INIT_SM_IN_ERR_SMASK),
-/*18*/ FLAG_ENTRY("PioPktEvictSmOrArbSm",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK),
-/*19*/ FLAG_ENTRY("PioHostAddrMemUnc",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK),
-/*20*/ FLAG_ENTRY("PioHostAddrMemCor",
-       0,
-       SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_COR_ERR_SMASK),
-/*21*/ FLAG_ENTRY("PioWriteDataParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK),
-/*22*/ FLAG_ENTRY("PioStateMachine",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK),
-/*23*/ FLAG_ENTRY("PioWriteQwValidParity",
-       SEC_WRITE_DROPPED | SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK),
-/*24*/ FLAG_ENTRY("PioBlockQwCountParity",
-       SEC_WRITE_DROPPED | SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK),
-/*25*/ FLAG_ENTRY("PioVlfVlLenParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK),
-/*26*/ FLAG_ENTRY("PioVlfSopParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK),
-/*27*/ FLAG_ENTRY("PioVlFifoParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK),
-/*28*/ FLAG_ENTRY("PioPpmcBqcMemParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK),
-/*29*/ FLAG_ENTRY("PioPpmcSopLen",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK),
-/*30-31 reserved*/
-/*32*/ FLAG_ENTRY("PioCurrentFreeCntParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK),
-/*33*/ FLAG_ENTRY("PioLastReturnedCntParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK),
-/*34*/ FLAG_ENTRY("PioPccSopHeadParity",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK),
-/*35*/ FLAG_ENTRY("PioPecSopHeadParityErr",
-       SEC_SPC_FREEZE,
-       SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK),
-/*36-63 reserved*/
-};
-
-/*
- * TXE PIO errors that cause an SPC freeze
- *
- * The OR of exactly those masks whose rows in pio_err_status_flags carry
- * SEC_SPC_FREEZE; keep the two lists in step when either one changes.
- */
-#define ALL_PIO_FREEZE_ERR \
-       (SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK \
-       | SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK)
-
-/*
- * TXE SDMA Error flags
- *
- * Each FLAG_ENTRY0() pairs a printable error name with one
- * SEND_DMA_ERR_STATUS_*_SMASK bit.  The leading index comment numbers
- * the entries; indices 4-63 are noted as reserved and have no entry.
- */
-static struct flag_table sdma_err_status_flags[] = {
-/* 0*/ FLAG_ENTRY0("SDmaRpyTagErr",
-               SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK),
-/* 1*/ FLAG_ENTRY0("SDmaCsrParityErr",
-               SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK),
-/* 2*/ FLAG_ENTRY0("SDmaPcieReqTrackingUncErr",
-               SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK),
-/* 3*/ FLAG_ENTRY0("SDmaPcieReqTrackingCorErr",
-               SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_COR_ERR_SMASK),
-/*04-63 reserved*/
-};
-
-/*
- * TXE SDMA errors that cause an SPC freeze.
- *
- * Covers the first three sdma_err_status_flags entries; the
- * SDmaPcieReqTrackingCorErr bit is deliberately not part of this mask.
- */
-#define ALL_SDMA_FREEZE_ERR  \
-               (SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK \
-               | SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK \
-               | SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK)
-
-/*
- * SendEgressErrInfo bits that correspond to a PortXmitDiscard counter:
- * the TooLongIBPacket, VLMapping and VL error bits.
- */
-#define PORT_DISCARD_EGRESS_ERRS \
-       (SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK \
-       | SEND_EGRESS_ERR_INFO_VL_MAPPING_ERR_SMASK \
-       | SEND_EGRESS_ERR_INFO_VL_ERR_SMASK)
-
-/*
- * TXE Egress Error flags
- *
- * SEES(text) expands to SEND_EGRESS_ERR_STATUS_<text>_ERR_SMASK, so each
- * entry pairs a printable name with the matching status bit.  Indices 2,
- * 6 and 9-10 are marked reserved and have no table entry.
- */
-#define SEES(text) SEND_EGRESS_ERR_STATUS_##text##_ERR_SMASK
-static struct flag_table egress_err_status_flags[] = {
-/* 0*/ FLAG_ENTRY0("TxPktIntegrityMemCorErr", SEES(TX_PKT_INTEGRITY_MEM_COR)),
-/* 1*/ FLAG_ENTRY0("TxPktIntegrityMemUncErr", SEES(TX_PKT_INTEGRITY_MEM_UNC)),
-/* 2 reserved */
-/* 3*/ FLAG_ENTRY0("TxEgressFifoUnderrunOrParityErr",
-               SEES(TX_EGRESS_FIFO_UNDERRUN_OR_PARITY)),
-/* 4*/ FLAG_ENTRY0("TxLinkdownErr", SEES(TX_LINKDOWN)),
-/* 5*/ FLAG_ENTRY0("TxIncorrectLinkStateErr", SEES(TX_INCORRECT_LINK_STATE)),
-/* 6 reserved */
-/* 7*/ FLAG_ENTRY0("TxPioLaunchIntfParityErr",
-               SEES(TX_PIO_LAUNCH_INTF_PARITY)),
-/* 8*/ FLAG_ENTRY0("TxSdmaLaunchIntfParityErr",
-               SEES(TX_SDMA_LAUNCH_INTF_PARITY)),
-/* 9-10 reserved */
-/*11*/ FLAG_ENTRY0("TxSbrdCtlStateMachineParityErr",
-               SEES(TX_SBRD_CTL_STATE_MACHINE_PARITY)),
-/*12*/ FLAG_ENTRY0("TxIllegalVLErr", SEES(TX_ILLEGAL_VL)),
-/*13*/ FLAG_ENTRY0("TxLaunchCsrParityErr", SEES(TX_LAUNCH_CSR_PARITY)),
-/*14*/ FLAG_ENTRY0("TxSbrdCtlCsrParityErr", SEES(TX_SBRD_CTL_CSR_PARITY)),
-/*15*/ FLAG_ENTRY0("TxConfigParityErr", SEES(TX_CONFIG_PARITY)),
-/*16*/ FLAG_ENTRY0("TxSdma0DisallowedPacketErr",
-               SEES(TX_SDMA0_DISALLOWED_PACKET)),
-/*17*/ FLAG_ENTRY0("TxSdma1DisallowedPacketErr",
-               SEES(TX_SDMA1_DISALLOWED_PACKET)),
-/*18*/ FLAG_ENTRY0("TxSdma2DisallowedPacketErr",
-               SEES(TX_SDMA2_DISALLOWED_PACKET)),
-/*19*/ FLAG_ENTRY0("TxSdma3DisallowedPacketErr",
-               SEES(TX_SDMA3_DISALLOWED_PACKET)),
-/*20*/ FLAG_ENTRY0("TxSdma4DisallowedPacketErr",
-               SEES(TX_SDMA4_DISALLOWED_PACKET)),
-/*21*/ FLAG_ENTRY0("TxSdma5DisallowedPacketErr",
-               SEES(TX_SDMA5_DISALLOWED_PACKET)),
-/*22*/ FLAG_ENTRY0("TxSdma6DisallowedPacketErr",
-               SEES(TX_SDMA6_DISALLOWED_PACKET)),
-/*23*/ FLAG_ENTRY0("TxSdma7DisallowedPacketErr",
-               SEES(TX_SDMA7_DISALLOWED_PACKET)),
-/*24*/ FLAG_ENTRY0("TxSdma8DisallowedPacketErr",
-               SEES(TX_SDMA8_DISALLOWED_PACKET)),
-/*25*/ FLAG_ENTRY0("TxSdma9DisallowedPacketErr",
-               SEES(TX_SDMA9_DISALLOWED_PACKET)),
-/*26*/ FLAG_ENTRY0("TxSdma10DisallowedPacketErr",
-               SEES(TX_SDMA10_DISALLOWED_PACKET)),
-/*27*/ FLAG_ENTRY0("TxSdma11DisallowedPacketErr",
-               SEES(TX_SDMA11_DISALLOWED_PACKET)),
-/*28*/ FLAG_ENTRY0("TxSdma12DisallowedPacketErr",
-               SEES(TX_SDMA12_DISALLOWED_PACKET)),
-/*29*/ FLAG_ENTRY0("TxSdma13DisallowedPacketErr",
-               SEES(TX_SDMA13_DISALLOWED_PACKET)),
-/*30*/ FLAG_ENTRY0("TxSdma14DisallowedPacketErr",
-               SEES(TX_SDMA14_DISALLOWED_PACKET)),
-/*31*/ FLAG_ENTRY0("TxSdma15DisallowedPacketErr",
-               SEES(TX_SDMA15_DISALLOWED_PACKET)),
-/*32*/ FLAG_ENTRY0("TxLaunchFifo0UncOrParityErr",
-               SEES(TX_LAUNCH_FIFO0_UNC_OR_PARITY)),
-/*33*/ FLAG_ENTRY0("TxLaunchFifo1UncOrParityErr",
-               SEES(TX_LAUNCH_FIFO1_UNC_OR_PARITY)),
-/*34*/ FLAG_ENTRY0("TxLaunchFifo2UncOrParityErr",
-               SEES(TX_LAUNCH_FIFO2_UNC_OR_PARITY)),
-/*35*/ FLAG_ENTRY0("TxLaunchFifo3UncOrParityErr",
-               SEES(TX_LAUNCH_FIFO3_UNC_OR_PARITY)),
-/*36*/ FLAG_ENTRY0("TxLaunchFifo4UncOrParityErr",
-               SEES(TX_LAUNCH_FIFO4_UNC_OR_PARITY)),
-/*37*/ FLAG_ENTRY0("TxLaunchFifo5UncOrParityErr",
-               SEES(TX_LAUNCH_FIFO5_UNC_OR_PARITY)),
-/*38*/ FLAG_ENTRY0("TxLaunchFifo6UncOrParityErr",
-               SEES(TX_LAUNCH_FIFO6_UNC_OR_PARITY)),
-/*39*/ FLAG_ENTRY0("TxLaunchFifo7UncOrParityErr",
-               SEES(TX_LAUNCH_FIFO7_UNC_OR_PARITY)),
-/*40*/ FLAG_ENTRY0("TxLaunchFifo8UncOrParityErr",
-               SEES(TX_LAUNCH_FIFO8_UNC_OR_PARITY)),
-/*41*/ FLAG_ENTRY0("TxCreditReturnParityErr", SEES(TX_CREDIT_RETURN_PARITY)),
-/*42*/ FLAG_ENTRY0("TxSbHdrUncErr", SEES(TX_SB_HDR_UNC)),
-/*43*/ FLAG_ENTRY0("TxReadSdmaMemoryUncErr", SEES(TX_READ_SDMA_MEMORY_UNC)),
-/*44*/ FLAG_ENTRY0("TxReadPioMemoryUncErr", SEES(TX_READ_PIO_MEMORY_UNC)),
-/*45*/ FLAG_ENTRY0("TxEgressFifoUncErr", SEES(TX_EGRESS_FIFO_UNC)),
-/*46*/ FLAG_ENTRY0("TxHcrcInsertionErr", SEES(TX_HCRC_INSERTION)),
-/*47*/ FLAG_ENTRY0("TxCreditReturnVLErr", SEES(TX_CREDIT_RETURN_VL)),
-/*48*/ FLAG_ENTRY0("TxLaunchFifo0CorErr", SEES(TX_LAUNCH_FIFO0_COR)),
-/*49*/ FLAG_ENTRY0("TxLaunchFifo1CorErr", SEES(TX_LAUNCH_FIFO1_COR)),
-/*50*/ FLAG_ENTRY0("TxLaunchFifo2CorErr", SEES(TX_LAUNCH_FIFO2_COR)),
-/*51*/ FLAG_ENTRY0("TxLaunchFifo3CorErr", SEES(TX_LAUNCH_FIFO3_COR)),
-/*52*/ FLAG_ENTRY0("TxLaunchFifo4CorErr", SEES(TX_LAUNCH_FIFO4_COR)),
-/*53*/ FLAG_ENTRY0("TxLaunchFifo5CorErr", SEES(TX_LAUNCH_FIFO5_COR)),
-/*54*/ FLAG_ENTRY0("TxLaunchFifo6CorErr", SEES(TX_LAUNCH_FIFO6_COR)),
-/*55*/ FLAG_ENTRY0("TxLaunchFifo7CorErr", SEES(TX_LAUNCH_FIFO7_COR)),
-/*56*/ FLAG_ENTRY0("TxLaunchFifo8CorErr", SEES(TX_LAUNCH_FIFO8_COR)),
-/*57*/ FLAG_ENTRY0("TxCreditOverrunErr", SEES(TX_CREDIT_OVERRUN)),
-/*58*/ FLAG_ENTRY0("TxSbHdrCorErr", SEES(TX_SB_HDR_COR)),
-/*59*/ FLAG_ENTRY0("TxReadSdmaMemoryCorErr", SEES(TX_READ_SDMA_MEMORY_COR)),
-/*60*/ FLAG_ENTRY0("TxReadPioMemoryCorErr", SEES(TX_READ_PIO_MEMORY_COR)),
-/*61*/ FLAG_ENTRY0("TxEgressFifoCorErr", SEES(TX_EGRESS_FIFO_COR)),
-/*62*/ FLAG_ENTRY0("TxReadSdmaMemoryCsrUncErr",
-               SEES(TX_READ_SDMA_MEMORY_CSR_UNC)),
-/*63*/ FLAG_ENTRY0("TxReadPioMemoryCsrUncErr",
-               SEES(TX_READ_PIO_MEMORY_CSR_UNC)),
-};
-
-/*
- * TXE Egress Error Info flags
- *
- * SEEI(text) expands to SEND_EGRESS_ERR_INFO_<text>_ERR_SMASK.  Entry 0
- * is a placeholder with a zero mask.
- *
- * NOTE(review): entries 2 and 3 are identical ("JobKeyErr", JOB_KEY), and
- * entry 21 ("BypassBadPktLenErr") reuses the BAD_PKT_LEN mask already
- * used by entry 17.  Both look like copy/paste leftovers; confirm the
- * intended masks against the register definitions before changing them.
- */
-#define SEEI(text) SEND_EGRESS_ERR_INFO_##text##_ERR_SMASK
-static struct flag_table egress_err_info_flags[] = {
-/* 0*/ FLAG_ENTRY0("Reserved", 0ull),
-/* 1*/ FLAG_ENTRY0("VLErr", SEEI(VL)),
-/* 2*/ FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)),
-/* 3*/ FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)),
-/* 4*/ FLAG_ENTRY0("PartitionKeyErr", SEEI(PARTITION_KEY)),
-/* 5*/ FLAG_ENTRY0("SLIDErr", SEEI(SLID)),
-/* 6*/ FLAG_ENTRY0("OpcodeErr", SEEI(OPCODE)),
-/* 7*/ FLAG_ENTRY0("VLMappingErr", SEEI(VL_MAPPING)),
-/* 8*/ FLAG_ENTRY0("RawErr", SEEI(RAW)),
-/* 9*/ FLAG_ENTRY0("RawIPv6Err", SEEI(RAW_IPV6)),
-/*10*/ FLAG_ENTRY0("GRHErr", SEEI(GRH)),
-/*11*/ FLAG_ENTRY0("BypassErr", SEEI(BYPASS)),
-/*12*/ FLAG_ENTRY0("KDETHPacketsErr", SEEI(KDETH_PACKETS)),
-/*13*/ FLAG_ENTRY0("NonKDETHPacketsErr", SEEI(NON_KDETH_PACKETS)),
-/*14*/ FLAG_ENTRY0("TooSmallIBPacketsErr", SEEI(TOO_SMALL_IB_PACKETS)),
-/*15*/ FLAG_ENTRY0("TooSmallBypassPacketsErr", SEEI(TOO_SMALL_BYPASS_PACKETS)),
-/*16*/ FLAG_ENTRY0("PbcTestErr", SEEI(PBC_TEST)),
-/*17*/ FLAG_ENTRY0("BadPktLenErr", SEEI(BAD_PKT_LEN)),
-/*18*/ FLAG_ENTRY0("TooLongIBPacketErr", SEEI(TOO_LONG_IB_PACKET)),
-/*19*/ FLAG_ENTRY0("TooLongBypassPacketsErr", SEEI(TOO_LONG_BYPASS_PACKETS)),
-/*20*/ FLAG_ENTRY0("PbcStaticRateControlErr", SEEI(PBC_STATIC_RATE_CONTROL)),
-/*21*/ FLAG_ENTRY0("BypassBadPktLenErr", SEEI(BAD_PKT_LEN)),
-};
-
-/*
- * TXE Egress errors that cause an SPC freeze.
- *
- * Built with SEES(), so every term is a SEND_EGRESS_ERR_STATUS_*_ERR_SMASK
- * bit.  Only parity and uncorrectable-or-parity conditions are listed;
- * none of the *_COR bits are part of the mask.
- */
-#define ALL_TXE_EGRESS_FREEZE_ERR \
-       (SEES(TX_EGRESS_FIFO_UNDERRUN_OR_PARITY) \
-       | SEES(TX_PIO_LAUNCH_INTF_PARITY) \
-       | SEES(TX_SDMA_LAUNCH_INTF_PARITY) \
-       | SEES(TX_SBRD_CTL_STATE_MACHINE_PARITY) \
-       | SEES(TX_LAUNCH_CSR_PARITY) \
-       | SEES(TX_SBRD_CTL_CSR_PARITY) \
-       | SEES(TX_CONFIG_PARITY) \
-       | SEES(TX_LAUNCH_FIFO0_UNC_OR_PARITY) \
-       | SEES(TX_LAUNCH_FIFO1_UNC_OR_PARITY) \
-       | SEES(TX_LAUNCH_FIFO2_UNC_OR_PARITY) \
-       | SEES(TX_LAUNCH_FIFO3_UNC_OR_PARITY) \
-       | SEES(TX_LAUNCH_FIFO4_UNC_OR_PARITY) \
-       | SEES(TX_LAUNCH_FIFO5_UNC_OR_PARITY) \
-       | SEES(TX_LAUNCH_FIFO6_UNC_OR_PARITY) \
-       | SEES(TX_LAUNCH_FIFO7_UNC_OR_PARITY) \
-       | SEES(TX_LAUNCH_FIFO8_UNC_OR_PARITY) \
-       | SEES(TX_CREDIT_RETURN_PARITY))
-
-/*
- * TXE Send error flags
- *
- * SES(name) expands to SEND_ERR_STATUS_SEND_<name>_ERR_SMASK.  The table
- * has exactly three entries: CSR parity, CSR read bad address and CSR
- * write bad address.
- */
-#define SES(name) SEND_ERR_STATUS_SEND_##name##_ERR_SMASK
-static struct flag_table send_err_status_flags[] = {
-/* 0*/ FLAG_ENTRY0("SendCsrParityErr", SES(CSR_PARITY)),
-/* 1*/ FLAG_ENTRY0("SendCsrReadBadAddrErr", SES(CSR_READ_BAD_ADDR)),
-/* 2*/ FLAG_ENTRY0("SendCsrWriteBadAddrErr", SES(CSR_WRITE_BAD_ADDR))
-};
-
-/*
- * TXE Send Context Error flags and consequences
- *
- * Unlike the FLAG_ENTRY0() tables, each FLAG_ENTRY() here carries three
- * values: the printable name, a set of SEC_* consequence flags, and the
- * SEND_CTXT_ERR_STATUS_*_SMASK bit.  Every entry includes SEC_SC_HALTED;
- * entries 0-1 add SEC_PACKET_DROPPED and entries 2-4 add
- * SEC_WRITE_DROPPED.
- */
-static struct flag_table sc_err_status_flags[] = {
-/* 0*/ FLAG_ENTRY("InconsistentSop",
-               SEC_PACKET_DROPPED | SEC_SC_HALTED,
-               SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK),
-/* 1*/ FLAG_ENTRY("DisallowedPacket",
-               SEC_PACKET_DROPPED | SEC_SC_HALTED,
-               SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK),
-/* 2*/ FLAG_ENTRY("WriteCrossesBoundary",
-               SEC_WRITE_DROPPED | SEC_SC_HALTED,
-               SEND_CTXT_ERR_STATUS_PIO_WRITE_CROSSES_BOUNDARY_ERR_SMASK),
-/* 3*/ FLAG_ENTRY("WriteOverflow",
-               SEC_WRITE_DROPPED | SEC_SC_HALTED,
-               SEND_CTXT_ERR_STATUS_PIO_WRITE_OVERFLOW_ERR_SMASK),
-/* 4*/ FLAG_ENTRY("WriteOutOfBounds",
-               SEC_WRITE_DROPPED | SEC_SC_HALTED,
-               SEND_CTXT_ERR_STATUS_PIO_WRITE_OUT_OF_BOUNDS_ERR_SMASK),
-/* 5-63 reserved*/
-};
-
-/*
- * RXE Receive Error flags
- *
- * RXES(name) expands to RCV_ERR_STATUS_RX_<name>_ERR_SMASK.  The table
- * lists entries 0 through 63 with no reserved gaps.
- */
-#define RXES(name) RCV_ERR_STATUS_RX_##name##_ERR_SMASK
-static struct flag_table rxe_err_status_flags[] = {
-/* 0*/ FLAG_ENTRY0("RxDmaCsrCorErr", RXES(DMA_CSR_COR)),
-/* 1*/ FLAG_ENTRY0("RxDcIntfParityErr", RXES(DC_INTF_PARITY)),
-/* 2*/ FLAG_ENTRY0("RxRcvHdrUncErr", RXES(RCV_HDR_UNC)),
-/* 3*/ FLAG_ENTRY0("RxRcvHdrCorErr", RXES(RCV_HDR_COR)),
-/* 4*/ FLAG_ENTRY0("RxRcvDataUncErr", RXES(RCV_DATA_UNC)),
-/* 5*/ FLAG_ENTRY0("RxRcvDataCorErr", RXES(RCV_DATA_COR)),
-/* 6*/ FLAG_ENTRY0("RxRcvQpMapTableUncErr", RXES(RCV_QP_MAP_TABLE_UNC)),
-/* 7*/ FLAG_ENTRY0("RxRcvQpMapTableCorErr", RXES(RCV_QP_MAP_TABLE_COR)),
-/* 8*/ FLAG_ENTRY0("RxRcvCsrParityErr", RXES(RCV_CSR_PARITY)),
-/* 9*/ FLAG_ENTRY0("RxDcSopEopParityErr", RXES(DC_SOP_EOP_PARITY)),
-/*10*/ FLAG_ENTRY0("RxDmaFlagUncErr", RXES(DMA_FLAG_UNC)),
-/*11*/ FLAG_ENTRY0("RxDmaFlagCorErr", RXES(DMA_FLAG_COR)),
-/*12*/ FLAG_ENTRY0("RxRcvFsmEncodingErr", RXES(RCV_FSM_ENCODING)),
-/*13*/ FLAG_ENTRY0("RxRbufFreeListUncErr", RXES(RBUF_FREE_LIST_UNC)),
-/*14*/ FLAG_ENTRY0("RxRbufFreeListCorErr", RXES(RBUF_FREE_LIST_COR)),
-/*15*/ FLAG_ENTRY0("RxRbufLookupDesRegUncErr", RXES(RBUF_LOOKUP_DES_REG_UNC)),
-/*16*/ FLAG_ENTRY0("RxRbufLookupDesRegUncCorErr",
-               RXES(RBUF_LOOKUP_DES_REG_UNC_COR)),
-/*17*/ FLAG_ENTRY0("RxRbufLookupDesUncErr", RXES(RBUF_LOOKUP_DES_UNC)),
-/*18*/ FLAG_ENTRY0("RxRbufLookupDesCorErr", RXES(RBUF_LOOKUP_DES_COR)),
-/*19*/ FLAG_ENTRY0("RxRbufBlockListReadUncErr",
-               RXES(RBUF_BLOCK_LIST_READ_UNC)),
-/*20*/ FLAG_ENTRY0("RxRbufBlockListReadCorErr",
-               RXES(RBUF_BLOCK_LIST_READ_COR)),
-/*21*/ FLAG_ENTRY0("RxRbufCsrQHeadBufNumParityErr",
-               RXES(RBUF_CSR_QHEAD_BUF_NUM_PARITY)),
-/*22*/ FLAG_ENTRY0("RxRbufCsrQEntCntParityErr",
-               RXES(RBUF_CSR_QENT_CNT_PARITY)),
-/*23*/ FLAG_ENTRY0("RxRbufCsrQNextBufParityErr",
-               RXES(RBUF_CSR_QNEXT_BUF_PARITY)),
-/*24*/ FLAG_ENTRY0("RxRbufCsrQVldBitParityErr",
-               RXES(RBUF_CSR_QVLD_BIT_PARITY)),
-/*25*/ FLAG_ENTRY0("RxRbufCsrQHdPtrParityErr", RXES(RBUF_CSR_QHD_PTR_PARITY)),
-/*26*/ FLAG_ENTRY0("RxRbufCsrQTlPtrParityErr", RXES(RBUF_CSR_QTL_PTR_PARITY)),
-/*27*/ FLAG_ENTRY0("RxRbufCsrQNumOfPktParityErr",
-               RXES(RBUF_CSR_QNUM_OF_PKT_PARITY)),
-/*28*/ FLAG_ENTRY0("RxRbufCsrQEOPDWParityErr", RXES(RBUF_CSR_QEOPDW_PARITY)),
-/*29*/ FLAG_ENTRY0("RxRbufCtxIdParityErr", RXES(RBUF_CTX_ID_PARITY)),
-/*30*/ FLAG_ENTRY0("RxRBufBadLookupErr", RXES(RBUF_BAD_LOOKUP)),
-/*31*/ FLAG_ENTRY0("RxRbufFullErr", RXES(RBUF_FULL)),
-/*32*/ FLAG_ENTRY0("RxRbufEmptyErr", RXES(RBUF_EMPTY)),
-/*33*/ FLAG_ENTRY0("RxRbufFlRdAddrParityErr", RXES(RBUF_FL_RD_ADDR_PARITY)),
-/*34*/ FLAG_ENTRY0("RxRbufFlWrAddrParityErr", RXES(RBUF_FL_WR_ADDR_PARITY)),
-/*35*/ FLAG_ENTRY0("RxRbufFlInitdoneParityErr",
-               RXES(RBUF_FL_INITDONE_PARITY)),
-/*36*/ FLAG_ENTRY0("RxRbufFlInitWrAddrParityErr",
-               RXES(RBUF_FL_INIT_WR_ADDR_PARITY)),
-/*37*/ FLAG_ENTRY0("RxRbufNextFreeBufUncErr", RXES(RBUF_NEXT_FREE_BUF_UNC)),
-/*38*/ FLAG_ENTRY0("RxRbufNextFreeBufCorErr", RXES(RBUF_NEXT_FREE_BUF_COR)),
-/*39*/ FLAG_ENTRY0("RxLookupDesPart1UncErr", RXES(LOOKUP_DES_PART1_UNC)),
-/*40*/ FLAG_ENTRY0("RxLookupDesPart1UncCorErr",
-               RXES(LOOKUP_DES_PART1_UNC_COR)),
-/*41*/ FLAG_ENTRY0("RxLookupDesPart2ParityErr",
-               RXES(LOOKUP_DES_PART2_PARITY)),
-/*42*/ FLAG_ENTRY0("RxLookupRcvArrayUncErr", RXES(LOOKUP_RCV_ARRAY_UNC)),
-/*43*/ FLAG_ENTRY0("RxLookupRcvArrayCorErr", RXES(LOOKUP_RCV_ARRAY_COR)),
-/*44*/ FLAG_ENTRY0("RxLookupCsrParityErr", RXES(LOOKUP_CSR_PARITY)),
-/*45*/ FLAG_ENTRY0("RxHqIntrCsrParityErr", RXES(HQ_INTR_CSR_PARITY)),
-/*46*/ FLAG_ENTRY0("RxHqIntrFsmErr", RXES(HQ_INTR_FSM)),
-/*47*/ FLAG_ENTRY0("RxRbufDescPart1UncErr", RXES(RBUF_DESC_PART1_UNC)),
-/*48*/ FLAG_ENTRY0("RxRbufDescPart1CorErr", RXES(RBUF_DESC_PART1_COR)),
-/*49*/ FLAG_ENTRY0("RxRbufDescPart2UncErr", RXES(RBUF_DESC_PART2_UNC)),
-/*50*/ FLAG_ENTRY0("RxRbufDescPart2CorErr", RXES(RBUF_DESC_PART2_COR)),
-/*51*/ FLAG_ENTRY0("RxDmaHdrFifoRdUncErr", RXES(DMA_HDR_FIFO_RD_UNC)),
-/*52*/ FLAG_ENTRY0("RxDmaHdrFifoRdCorErr", RXES(DMA_HDR_FIFO_RD_COR)),
-/*53*/ FLAG_ENTRY0("RxDmaDataFifoRdUncErr", RXES(DMA_DATA_FIFO_RD_UNC)),
-/*54*/ FLAG_ENTRY0("RxDmaDataFifoRdCorErr", RXES(DMA_DATA_FIFO_RD_COR)),
-/*55*/ FLAG_ENTRY0("RxRbufDataUncErr", RXES(RBUF_DATA_UNC)),
-/*56*/ FLAG_ENTRY0("RxRbufDataCorErr", RXES(RBUF_DATA_COR)),
-/*57*/ FLAG_ENTRY0("RxDmaCsrParityErr", RXES(DMA_CSR_PARITY)),
-/*58*/ FLAG_ENTRY0("RxDmaEqFsmEncodingErr", RXES(DMA_EQ_FSM_ENCODING)),
-/*59*/ FLAG_ENTRY0("RxDmaDqFsmEncodingErr", RXES(DMA_DQ_FSM_ENCODING)),
-/*60*/ FLAG_ENTRY0("RxDmaCsrUncErr", RXES(DMA_CSR_UNC)),
-/*61*/ FLAG_ENTRY0("RxCsrReadBadAddrErr", RXES(CSR_READ_BAD_ADDR)),
-/*62*/ FLAG_ENTRY0("RxCsrWriteBadAddrErr", RXES(CSR_WRITE_BAD_ADDR)),
-/*63*/ FLAG_ENTRY0("RxCsrParityErr", RXES(CSR_PARITY))
-};
-
-/*
- * RXE errors that will trigger an SPC freeze.
- *
- * Bitwise OR of the RCV_ERR_STATUS_RX_*_ERR_SMASK bits listed below.  The
- * mask is spelled out with full register names rather than RXES().
- * Note that it includes RBUF_DESC_PART1_COR but not RBUF_DESC_PART2_COR.
- */
-#define ALL_RXE_FREEZE_ERR  \
-       (RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RCV_CSR_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_DMA_FLAG_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RCV_FSM_ENCODING_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_FREE_LIST_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_CSR_QHEAD_BUF_NUM_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_CSR_QENT_CNT_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_CSR_QNEXT_BUF_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_CSR_QVLD_BIT_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_CSR_QHD_PTR_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_CSR_QTL_PTR_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_CSR_QNUM_OF_PKT_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_CSR_QEOPDW_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_CTX_ID_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_BAD_LOOKUP_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_FULL_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_EMPTY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_FL_RD_ADDR_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_FL_WR_ADDR_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_FL_INITDONE_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_COR_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_LOOKUP_DES_PART2_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_LOOKUP_CSR_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_HQ_INTR_CSR_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_HQ_INTR_FSM_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_DESC_PART1_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_DESC_PART1_COR_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_DESC_PART2_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_RBUF_DATA_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_DMA_CSR_PARITY_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_DMA_EQ_FSM_ENCODING_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_DMA_DQ_FSM_ENCODING_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK \
-       | RCV_ERR_STATUS_RX_CSR_PARITY_ERR_SMASK)
-/*
- * Three RXE DMA uncorrectable-error bits (CSR, header FIFO read and data
- * FIFO read).  Each of them is also part of ALL_RXE_FREEZE_ERR.
- *
- * NOTE(review): the name suggests these errors abort normal freeze
- * handling; the use site is not nearby, so confirm before relying on it.
- */
-#define RXE_FREEZE_ABORT_MASK \
-       (RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK | \
-       RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK | \
-       RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK)
-
-/*
- * DCC Error Flags
- *
- * DCCE(name) expands to DCC_ERR_FLG_<name>_SMASK.  Unlike the TXE/RXE
- * tables, entries here carry no index comments and use lower-case
- * printable names.
- */
-#define DCCE(name) DCC_ERR_FLG_##name##_SMASK
-static struct flag_table dcc_err_flags[] = {
-       FLAG_ENTRY0("bad_l2_err", DCCE(BAD_L2_ERR)),
-       FLAG_ENTRY0("bad_sc_err", DCCE(BAD_SC_ERR)),
-       FLAG_ENTRY0("bad_mid_tail_err", DCCE(BAD_MID_TAIL_ERR)),
-       FLAG_ENTRY0("bad_preemption_err", DCCE(BAD_PREEMPTION_ERR)),
-       FLAG_ENTRY0("preemption_err", DCCE(PREEMPTION_ERR)),
-       FLAG_ENTRY0("preemptionvl15_err", DCCE(PREEMPTIONVL15_ERR)),
-       FLAG_ENTRY0("bad_vl_marker_err", DCCE(BAD_VL_MARKER_ERR)),
-       FLAG_ENTRY0("bad_dlid_target_err", DCCE(BAD_DLID_TARGET_ERR)),
-       FLAG_ENTRY0("bad_lver_err", DCCE(BAD_LVER_ERR)),
-       FLAG_ENTRY0("uncorrectable_err", DCCE(UNCORRECTABLE_ERR)),
-       FLAG_ENTRY0("bad_crdt_ack_err", DCCE(BAD_CRDT_ACK_ERR)),
-       FLAG_ENTRY0("unsup_pkt_type", DCCE(UNSUP_PKT_TYPE)),
-       FLAG_ENTRY0("bad_ctrl_flit_err", DCCE(BAD_CTRL_FLIT_ERR)),
-       FLAG_ENTRY0("event_cntr_parity_err", DCCE(EVENT_CNTR_PARITY_ERR)),
-       FLAG_ENTRY0("event_cntr_rollover_err", DCCE(EVENT_CNTR_ROLLOVER_ERR)),
-       FLAG_ENTRY0("link_err", DCCE(LINK_ERR)),
-       FLAG_ENTRY0("misc_cntr_rollover_err", DCCE(MISC_CNTR_ROLLOVER_ERR)),
-       FLAG_ENTRY0("bad_ctrl_dist_err", DCCE(BAD_CTRL_DIST_ERR)),
-       FLAG_ENTRY0("bad_tail_dist_err", DCCE(BAD_TAIL_DIST_ERR)),
-       FLAG_ENTRY0("bad_head_dist_err", DCCE(BAD_HEAD_DIST_ERR)),
-       FLAG_ENTRY0("nonvl15_state_err", DCCE(NONVL15_STATE_ERR)),
-       FLAG_ENTRY0("vl15_multi_err", DCCE(VL15_MULTI_ERR)),
-       FLAG_ENTRY0("bad_pkt_length_err", DCCE(BAD_PKT_LENGTH_ERR)),
-       FLAG_ENTRY0("unsup_vl_err", DCCE(UNSUP_VL_ERR)),
-       FLAG_ENTRY0("perm_nvl15_err", DCCE(PERM_NVL15_ERR)),
-       FLAG_ENTRY0("slid_zero_err", DCCE(SLID_ZERO_ERR)),
-       FLAG_ENTRY0("dlid_zero_err", DCCE(DLID_ZERO_ERR)),
-       FLAG_ENTRY0("length_mtu_err", DCCE(LENGTH_MTU_ERR)),
-       FLAG_ENTRY0("rx_early_drop_err", DCCE(RX_EARLY_DROP_ERR)),
-       FLAG_ENTRY0("late_short_err", DCCE(LATE_SHORT_ERR)),
-       FLAG_ENTRY0("late_long_err", DCCE(LATE_LONG_ERR)),
-       FLAG_ENTRY0("late_ebp_err", DCCE(LATE_EBP_ERR)),
-       FLAG_ENTRY0("fpe_tx_fifo_ovflw_err", DCCE(FPE_TX_FIFO_OVFLW_ERR)),
-       FLAG_ENTRY0("fpe_tx_fifo_unflw_err", DCCE(FPE_TX_FIFO_UNFLW_ERR)),
-       FLAG_ENTRY0("csr_access_blocked_host", DCCE(CSR_ACCESS_BLOCKED_HOST)),
-       FLAG_ENTRY0("csr_access_blocked_uc", DCCE(CSR_ACCESS_BLOCKED_UC)),
-       FLAG_ENTRY0("tx_ctrl_parity_err", DCCE(TX_CTRL_PARITY_ERR)),
-       FLAG_ENTRY0("tx_ctrl_parity_mbe_err", DCCE(TX_CTRL_PARITY_MBE_ERR)),
-       FLAG_ENTRY0("tx_sc_parity_err", DCCE(TX_SC_PARITY_ERR)),
-       FLAG_ENTRY0("rx_ctrl_parity_mbe_err", DCCE(RX_CTRL_PARITY_MBE_ERR)),
-       FLAG_ENTRY0("csr_parity_err", DCCE(CSR_PARITY_ERR)),
-       FLAG_ENTRY0("csr_inval_addr", DCCE(CSR_INVAL_ADDR)),
-       FLAG_ENTRY0("tx_byte_shft_parity_err", DCCE(TX_BYTE_SHFT_PARITY_ERR)),
-       FLAG_ENTRY0("rx_byte_shft_parity_err", DCCE(RX_BYTE_SHFT_PARITY_ERR)),
-       FLAG_ENTRY0("fmconfig_err", DCCE(FMCONFIG_ERR)),
-       FLAG_ENTRY0("rcvport_err", DCCE(RCVPORT_ERR)),
-};
-
-/*
- * LCB error flags
- *
- * LCBE(name) expands to DC_LCB_ERR_FLG_<name>_SMASK.  The printable name
- * of every entry is the same text as the macro argument.  Entries are
- * numbered 0 through 29.
- */
-#define LCBE(name) DC_LCB_ERR_FLG_##name##_SMASK
-static struct flag_table lcb_err_flags[] = {
-/* 0*/ FLAG_ENTRY0("CSR_PARITY_ERR", LCBE(CSR_PARITY_ERR)),
-/* 1*/ FLAG_ENTRY0("INVALID_CSR_ADDR", LCBE(INVALID_CSR_ADDR)),
-/* 2*/ FLAG_ENTRY0("RST_FOR_FAILED_DESKEW", LCBE(RST_FOR_FAILED_DESKEW)),
-/* 3*/ FLAG_ENTRY0("ALL_LNS_FAILED_REINIT_TEST",
-               LCBE(ALL_LNS_FAILED_REINIT_TEST)),
-/* 4*/ FLAG_ENTRY0("LOST_REINIT_STALL_OR_TOS", LCBE(LOST_REINIT_STALL_OR_TOS)),
-/* 5*/ FLAG_ENTRY0("TX_LESS_THAN_FOUR_LNS", LCBE(TX_LESS_THAN_FOUR_LNS)),
-/* 6*/ FLAG_ENTRY0("RX_LESS_THAN_FOUR_LNS", LCBE(RX_LESS_THAN_FOUR_LNS)),
-/* 7*/ FLAG_ENTRY0("SEQ_CRC_ERR", LCBE(SEQ_CRC_ERR)),
-/* 8*/ FLAG_ENTRY0("REINIT_FROM_PEER", LCBE(REINIT_FROM_PEER)),
-/* 9*/ FLAG_ENTRY0("REINIT_FOR_LN_DEGRADE", LCBE(REINIT_FOR_LN_DEGRADE)),
-/*10*/ FLAG_ENTRY0("CRC_ERR_CNT_HIT_LIMIT", LCBE(CRC_ERR_CNT_HIT_LIMIT)),
-/*11*/ FLAG_ENTRY0("RCLK_STOPPED", LCBE(RCLK_STOPPED)),
-/*12*/ FLAG_ENTRY0("UNEXPECTED_REPLAY_MARKER", LCBE(UNEXPECTED_REPLAY_MARKER)),
-/*13*/ FLAG_ENTRY0("UNEXPECTED_ROUND_TRIP_MARKER",
-               LCBE(UNEXPECTED_ROUND_TRIP_MARKER)),
-/*14*/ FLAG_ENTRY0("ILLEGAL_NULL_LTP", LCBE(ILLEGAL_NULL_LTP)),
-/*15*/ FLAG_ENTRY0("ILLEGAL_FLIT_ENCODING", LCBE(ILLEGAL_FLIT_ENCODING)),
-/*16*/ FLAG_ENTRY0("FLIT_INPUT_BUF_OFLW", LCBE(FLIT_INPUT_BUF_OFLW)),
-/*17*/ FLAG_ENTRY0("VL_ACK_INPUT_BUF_OFLW", LCBE(VL_ACK_INPUT_BUF_OFLW)),
-/*18*/ FLAG_ENTRY0("VL_ACK_INPUT_PARITY_ERR", LCBE(VL_ACK_INPUT_PARITY_ERR)),
-/*19*/ FLAG_ENTRY0("VL_ACK_INPUT_WRONG_CRC_MODE",
-               LCBE(VL_ACK_INPUT_WRONG_CRC_MODE)),
-/*20*/ FLAG_ENTRY0("FLIT_INPUT_BUF_MBE", LCBE(FLIT_INPUT_BUF_MBE)),
-/*21*/ FLAG_ENTRY0("FLIT_INPUT_BUF_SBE", LCBE(FLIT_INPUT_BUF_SBE)),
-/*22*/ FLAG_ENTRY0("REPLAY_BUF_MBE", LCBE(REPLAY_BUF_MBE)),
-/*23*/ FLAG_ENTRY0("REPLAY_BUF_SBE", LCBE(REPLAY_BUF_SBE)),
-/*24*/ FLAG_ENTRY0("CREDIT_RETURN_FLIT_MBE", LCBE(CREDIT_RETURN_FLIT_MBE)),
-/*25*/ FLAG_ENTRY0("RST_FOR_LINK_TIMEOUT", LCBE(RST_FOR_LINK_TIMEOUT)),
-/*26*/ FLAG_ENTRY0("RST_FOR_INCOMPLT_RND_TRIP",
-               LCBE(RST_FOR_INCOMPLT_RND_TRIP)),
-/*27*/ FLAG_ENTRY0("HOLD_REINIT", LCBE(HOLD_REINIT)),
-/*28*/ FLAG_ENTRY0("NEG_EDGE_LINK_TRANSFER_ACTIVE",
-               LCBE(NEG_EDGE_LINK_TRANSFER_ACTIVE)),
-/*29*/ FLAG_ENTRY0("REDUNDANT_FLIT_PARITY_ERR",
-               LCBE(REDUNDANT_FLIT_PARITY_ERR))
-};
-
-/*
- * DC8051 Error Flags
- *
- * D8E(name) expands to DC_DC8051_ERR_FLG_<name>_SMASK.  The printable
- * name of every entry is the same text as the macro argument.
- */
-#define D8E(name) DC_DC8051_ERR_FLG_##name##_SMASK
-static struct flag_table dc8051_err_flags[] = {
-       FLAG_ENTRY0("SET_BY_8051", D8E(SET_BY_8051)),
-       FLAG_ENTRY0("LOST_8051_HEART_BEAT", D8E(LOST_8051_HEART_BEAT)),
-       FLAG_ENTRY0("CRAM_MBE", D8E(CRAM_MBE)),
-       FLAG_ENTRY0("CRAM_SBE", D8E(CRAM_SBE)),
-       FLAG_ENTRY0("DRAM_MBE", D8E(DRAM_MBE)),
-       FLAG_ENTRY0("DRAM_SBE", D8E(DRAM_SBE)),
-       FLAG_ENTRY0("IRAM_MBE", D8E(IRAM_MBE)),
-       FLAG_ENTRY0("IRAM_SBE", D8E(IRAM_SBE)),
-       FLAG_ENTRY0("UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES",
-                   D8E(UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES)),
-       FLAG_ENTRY0("INVALID_CSR_ADDR", D8E(INVALID_CSR_ADDR)),
-};
-
-/*
- * DC8051 Information Error flags
- *
- * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR field.
- *
- * The mask values here are bare constants (SPICO_ROM_FAILED, ...) defined
- * elsewhere, not *_SMASK register macros, and the printable names are
- * human-readable phrases rather than register field names.
- */
-static struct flag_table dc8051_info_err_flags[] = {
-       FLAG_ENTRY0("Spico ROM check failed",  SPICO_ROM_FAILED),
-       FLAG_ENTRY0("Unknown frame received",  UNKNOWN_FRAME),
-       FLAG_ENTRY0("Target BER not met",      TARGET_BER_NOT_MET),
-       FLAG_ENTRY0("Serdes internal loopback failure",
-                   FAILED_SERDES_INTERNAL_LOOPBACK),
-       FLAG_ENTRY0("Failed SerDes init",      FAILED_SERDES_INIT),
-       FLAG_ENTRY0("Failed LNI(Polling)",     FAILED_LNI_POLLING),
-       FLAG_ENTRY0("Failed LNI(Debounce)",    FAILED_LNI_DEBOUNCE),
-       FLAG_ENTRY0("Failed LNI(EstbComm)",    FAILED_LNI_ESTBCOMM),
-       FLAG_ENTRY0("Failed LNI(OptEq)",       FAILED_LNI_OPTEQ),
-       FLAG_ENTRY0("Failed LNI(VerifyCap_1)", FAILED_LNI_VERIFY_CAP1),
-       FLAG_ENTRY0("Failed LNI(VerifyCap_2)", FAILED_LNI_VERIFY_CAP2),
-       FLAG_ENTRY0("Failed LNI(ConfigLT)",    FAILED_LNI_CONFIGLT),
-       FLAG_ENTRY0("Host Handshake Timeout",  HOST_HANDSHAKE_TIMEOUT)
-};
-
-/*
- * DC8051 Information Host Information flags
- *
- * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG field.
- *
- * Masks are literal single-bit values, 0x0001 through 0x0100, one per
- * host message type.
- */
-static struct flag_table dc8051_info_host_msg_flags[] = {
-       FLAG_ENTRY0("Host request done", 0x0001),
-       FLAG_ENTRY0("BC SMA message", 0x0002),
-       FLAG_ENTRY0("BC PWR_MGM message", 0x0004),
-       FLAG_ENTRY0("BC Unknown message (BCC)", 0x0008),
-       FLAG_ENTRY0("BC Unknown message (LCB)", 0x0010),
-       FLAG_ENTRY0("External device config request", 0x0020),
-       FLAG_ENTRY0("VerifyCap all frames received", 0x0040),
-       FLAG_ENTRY0("LinkUp achieved", 0x0080),
-       FLAG_ENTRY0("Link going down", 0x0100),
-};
-/*
- * Forward declarations of file-local (static) helpers.
- *
- * The handle_*() routines are declared here so that the error interrupt
- * tables that follow (misc_errs, sdma_eng_err, various_err, dc_errs) can
- * reference them before their definitions appear.
- */
-static u32 encoded_size(u32 size);
-static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate);
-static int set_physical_link_state(struct hfi1_devdata *dd, u64 state);
-static void read_vc_remote_phy(struct hfi1_devdata *dd, u8 *power_management,
-                              u8 *continuous);
-static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z,
-                                 u8 *vcu, u16 *vl15buf, u8 *crc_sizes);
-static void read_vc_remote_link_width(struct hfi1_devdata *dd,
-                                     u8 *remote_tx_rate, u16 *link_widths);
-static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits,
-                                    u8 *flag_bits, u16 *link_widths);
-static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id,
-                                 u8 *device_rev);
-static void read_mgmt_allowed(struct hfi1_devdata *dd, u8 *mgmt_allowed);
-static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx);
-static int read_tx_settings(struct hfi1_devdata *dd, u8 *enable_lane_tx,
-                           u8 *tx_polarity_inversion,
-                           u8 *rx_polarity_inversion, u8 *max_rate);
-static void handle_sdma_eng_err(struct hfi1_devdata *dd,
-                               unsigned int context, u64 err_status);
-static void handle_qsfp_int(struct hfi1_devdata *dd, u32 source, u64 reg);
-static void handle_dcc_err(struct hfi1_devdata *dd,
-                          unsigned int context, u64 err_status);
-static void handle_lcb_err(struct hfi1_devdata *dd,
-                          unsigned int context, u64 err_status);
-static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg);
-static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
-static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
-static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
-static void handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
-static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
-static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
-static void handle_txe_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
-static void set_partition_keys(struct hfi1_pportdata *);
-static const char *link_state_name(u32 state);
-static const char *link_state_reason_name(struct hfi1_pportdata *ppd,
-                                         u32 state);
-static int do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data,
-                          u64 *out_data);
-static int read_idle_sma(struct hfi1_devdata *dd, u64 *data);
-static int thermal_init(struct hfi1_devdata *dd);
-
-static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
-                                 int msecs);
-static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc);
-static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr);
-static void handle_temp_err(struct hfi1_devdata *);
-static void dc_shutdown(struct hfi1_devdata *);
-static void dc_start(struct hfi1_devdata *);
-static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp,
-                          unsigned int *np);
-
-/*
- * Error interrupt table entry.  This is used as input to the interrupt
- * "clear down" routine used for all second tier error interrupt registers.
- * Second tier interrupt registers have a single bit representing them
- * in the top-level CceIntStatus.
- *
- * @status, @clear and @mask are CSR offsets for the same second tier
- * register group.  @handler is called with the device, a source number
- * and a 64-bit register value; @desc is a short printable name.  Reserved
- * table slots leave all fields zero/NULL.
- */
-struct err_reg_info {
-       u32 status;             /* status CSR offset */
-       u32 clear;              /* clear CSR offset */
-       u32 mask;               /* mask CSR offset */
-       void (*handler)(struct hfi1_devdata *dd, u32 source, u64 reg);
-       const char *desc;
-};
-/*
- * Sizes of the misc_errs, dc_errs and various_err tables, computed as
- * END - START of the matching IS_* interrupt source range.
- */
-#define NUM_MISC_ERRS (IS_GENERAL_ERR_END - IS_GENERAL_ERR_START)
-#define NUM_DC_ERRS (IS_DC_END - IS_DC_START)
-#define NUM_VARIOUS (IS_VARIOUS_END - IS_VARIOUS_START)
-
-/*
- * Helpers for building HFI and DC error interrupt table entries.  Different
- * helpers are needed because of inconsistent register names.
- *
- * Each helper produces a positional struct err_reg_info initializer by
- * pasting suffixes onto @reg:
- *   EE:     <reg>_STATUS, <reg>_CLEAR,   <reg>_MASK
- *   DC_EE1: <reg>_FLG,    <reg>_FLG_CLR, <reg>_FLG_EN
- *   DC_EE2: <reg>_FLG,    <reg>_CLR,     <reg>_EN
- * followed by @handler and @desc.
- */
-#define EE(reg, handler, desc) \
-       { reg##_STATUS, reg##_CLEAR, reg##_MASK, \
-               handler, desc }
-#define DC_EE1(reg, handler, desc) \
-       { reg##_FLG, reg##_FLG_CLR, reg##_FLG_EN, handler, desc }
-#define DC_EE2(reg, handler, desc) \
-       { reg##_FLG, reg##_CLR, reg##_EN, handler, desc }
-
-/*
- * Table of the "misc" grouping of error interrupts.  Each entry refers to
- * another register containing more information.
- *
- * The array has NUM_MISC_ERRS slots.  Slot 3 is reserved and explicitly
- * zeroed (NULL handler); any slots past index 7 are left to the implicit
- * zero initialization of the remaining array elements.
- */
-static const struct err_reg_info misc_errs[NUM_MISC_ERRS] = {
-/* 0*/ EE(CCE_ERR,             handle_cce_err,    "CceErr"),
-/* 1*/ EE(RCV_ERR,             handle_rxe_err,    "RxeErr"),
-/* 2*/ EE(MISC_ERR,    handle_misc_err,   "MiscErr"),
-/* 3*/ { 0, 0, 0, NULL }, /* reserved */
-/* 4*/ EE(SEND_PIO_ERR,    handle_pio_err,    "PioErr"),
-/* 5*/ EE(SEND_DMA_ERR,    handle_sdma_err,   "SDmaErr"),
-/* 6*/ EE(SEND_EGRESS_ERR, handle_egress_err, "EgressErr"),
-/* 7*/ EE(SEND_ERR,    handle_txe_err,    "TxeErr")
-       /* the rest are reserved */
-};
-
-/*
- * Index into the Various section of the interrupt sources
- * corresponding to the Critical Temperature interrupt.
- *
- * This matches slot 4 of the various_err table, which is the slot
- * commented as TCritInt.
- */
-#define TCRIT_INT_SOURCE 4
-
-/*
- * SDMA error interrupt entry - refers to another register containing more
- * information.
- *
- * A single err_reg_info (not a table) built with EE() from the
- * SEND_DMA_ENG_ERR register group and handled by handle_sdma_eng_err().
- */
-static const struct err_reg_info sdma_eng_err =
-       EE(SEND_DMA_ENG_ERR, handle_sdma_eng_err, "SDmaEngErr");
-
-static const struct err_reg_info various_err[NUM_VARIOUS] = {
-/* 0*/ { 0, 0, 0, NULL }, /* PbcInt */
-/* 1*/ { 0, 0, 0, NULL }, /* GpioAssertInt */
-/* 2*/ EE(ASIC_QSFP1,  handle_qsfp_int,        "QSFP1"),
-/* 3*/ EE(ASIC_QSFP2,  handle_qsfp_int,        "QSFP2"),
-/* 4*/ { 0, 0, 0, NULL }, /* TCritInt */
-       /* rest are reserved */
-};
-
-/*
- * The DC encoding of mtu_cap for 10K MTU in the DCC_CFG_PORT_CONFIG
- * register can not be derived from the MTU value because 10K is not
- * a power of 2. Therefore, we need a constant. Everything else can
- * be calculated.
- */
-#define DCC_CFG_PORT_MTU_CAP_10240 7
-
-/*
- * Table of the DC grouping of error interrupts.  Each entry refers to
- * another register containing more information.
- */
-static const struct err_reg_info dc_errs[NUM_DC_ERRS] = {
-/* 0*/ DC_EE1(DCC_ERR,         handle_dcc_err,        "DCC Err"),
-/* 1*/ DC_EE2(DC_LCB_ERR,      handle_lcb_err,        "LCB Err"),
-/* 2*/ DC_EE2(DC_DC8051_ERR,   handle_8051_interrupt, "DC8051 Interrupt"),
-/* 3*/ /* dc_lbm_int - special, see is_dc_int() */
-       /* the rest are reserved */
-};
-
-struct cntr_entry {
-       /*
-        * counter name
-        */
-       char *name;
-
-       /*
-        * csr to read for name (if applicable)
-        */
-       u64 csr;
-
-       /*
-        * offset into dd or ppd to store the counter's value
-        */
-       int offset;
-
-       /*
-        * flags
-        */
-       u8 flags;
-
-       /*
-        * accessor for stat element, context either dd or ppd
-        */
-       u64 (*rw_cntr)(const struct cntr_entry *, void *context, int vl,
-                      int mode, u64 data);
-};
-
-#define C_RCV_HDR_OVF_FIRST C_RCV_HDR_OVF_0
-#define C_RCV_HDR_OVF_LAST C_RCV_HDR_OVF_159
-
-#define CNTR_ELEM(name, csr, offset, flags, accessor) \
-{ \
-       name, \
-       csr, \
-       offset, \
-       flags, \
-       accessor \
-}
-
-/* 32bit RXE */
-#define RXE32_PORT_CNTR_ELEM(name, counter, flags) \
-CNTR_ELEM(#name, \
-         (counter * 8 + RCV_COUNTER_ARRAY32), \
-         0, flags | CNTR_32BIT, \
-         port_access_u32_csr)
-
-#define RXE32_DEV_CNTR_ELEM(name, counter, flags) \
-CNTR_ELEM(#name, \
-         (counter * 8 + RCV_COUNTER_ARRAY32), \
-         0, flags | CNTR_32BIT, \
-         dev_access_u32_csr)
-
-/* 64bit RXE */
-#define RXE64_PORT_CNTR_ELEM(name, counter, flags) \
-CNTR_ELEM(#name, \
-         (counter * 8 + RCV_COUNTER_ARRAY64), \
-         0, flags, \
-         port_access_u64_csr)
-
-#define RXE64_DEV_CNTR_ELEM(name, counter, flags) \
-CNTR_ELEM(#name, \
-         (counter * 8 + RCV_COUNTER_ARRAY64), \
-         0, flags, \
-         dev_access_u64_csr)
-
-#define OVR_LBL(ctx) C_RCV_HDR_OVF_ ## ctx
-#define OVR_ELM(ctx) \
-CNTR_ELEM("RcvHdrOvr" #ctx, \
-         (RCV_HDR_OVFL_CNT + ctx * 0x100), \
-         0, CNTR_NORMAL, port_access_u64_csr)
-
-/* 32bit TXE */
-#define TXE32_PORT_CNTR_ELEM(name, counter, flags) \
-CNTR_ELEM(#name, \
-         (counter * 8 + SEND_COUNTER_ARRAY32), \
-         0, flags | CNTR_32BIT, \
-         port_access_u32_csr)
-
-/* 64bit TXE */
-#define TXE64_PORT_CNTR_ELEM(name, counter, flags) \
-CNTR_ELEM(#name, \
-         (counter * 8 + SEND_COUNTER_ARRAY64), \
-         0, flags, \
-         port_access_u64_csr)
-
-# define TX64_DEV_CNTR_ELEM(name, counter, flags) \
-CNTR_ELEM(#name,\
-         counter * 8 + SEND_COUNTER_ARRAY64, \
-         0, \
-         flags, \
-         dev_access_u64_csr)
-
-/* CCE */
-#define CCE_PERF_DEV_CNTR_ELEM(name, counter, flags) \
-CNTR_ELEM(#name, \
-         (counter * 8 + CCE_COUNTER_ARRAY32), \
-         0, flags | CNTR_32BIT, \
-         dev_access_u32_csr)
-
-#define CCE_INT_DEV_CNTR_ELEM(name, counter, flags) \
-CNTR_ELEM(#name, \
-         (counter * 8 + CCE_INT_COUNTER_ARRAY32), \
-         0, flags | CNTR_32BIT, \
-         dev_access_u32_csr)
-
-/* DC */
-#define DC_PERF_CNTR(name, counter, flags) \
-CNTR_ELEM(#name, \
-         counter, \
-         0, \
-         flags, \
-         dev_access_u64_csr)
-
-#define DC_PERF_CNTR_LCB(name, counter, flags) \
-CNTR_ELEM(#name, \
-         counter, \
-         0, \
-         flags, \
-         dc_access_lcb_cntr)
-
-/* ibp counters */
-#define SW_IBP_CNTR(name, cntr) \
-CNTR_ELEM(#name, \
-         0, \
-         0, \
-         CNTR_SYNTH, \
-         access_ibp_##cntr)
-
-u64 read_csr(const struct hfi1_devdata *dd, u32 offset)
-{
-       if (dd->flags & HFI1_PRESENT) {
-               return readq((void __iomem *)dd->kregbase + offset);
-       }
-       return -1;
-}
-
-void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value)
-{
-       if (dd->flags & HFI1_PRESENT)
-               writeq(value, (void __iomem *)dd->kregbase + offset);
-}
-
-void __iomem *get_csr_addr(
-       struct hfi1_devdata *dd,
-       u32 offset)
-{
-       return (void __iomem *)dd->kregbase + offset;
-}
-
-static inline u64 read_write_csr(const struct hfi1_devdata *dd, u32 csr,
-                                int mode, u64 value)
-{
-       u64 ret;
-
-       if (mode == CNTR_MODE_R) {
-               ret = read_csr(dd, csr);
-       } else if (mode == CNTR_MODE_W) {
-               write_csr(dd, csr, value);
-               ret = value;
-       } else {
-               dd_dev_err(dd, "Invalid cntr register access mode");
-               return 0;
-       }
-
-       hfi1_cdbg(CNTR, "csr 0x%x val 0x%llx mode %d", csr, ret, mode);
-       return ret;
-}
-
-/* Dev Access */
-static u64 dev_access_u32_csr(const struct cntr_entry *entry,
-                             void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = context;
-       u64 csr = entry->csr;
-
-       if (entry->flags & CNTR_SDMA) {
-               if (vl == CNTR_INVALID_VL)
-                       return 0;
-               csr += 0x100 * vl;
-       } else {
-               if (vl != CNTR_INVALID_VL)
-                       return 0;
-       }
-       return read_write_csr(dd, csr, mode, data);
-}
-
-static u64 access_sde_err_cnt(const struct cntr_entry *entry,
-                             void *context, int idx, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       if (dd->per_sdma && idx < dd->num_sdma)
-               return dd->per_sdma[idx].err_cnt;
-       return 0;
-}
-
-static u64 access_sde_int_cnt(const struct cntr_entry *entry,
-                             void *context, int idx, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       if (dd->per_sdma && idx < dd->num_sdma)
-               return dd->per_sdma[idx].sdma_int_cnt;
-       return 0;
-}
-
-static u64 access_sde_idle_int_cnt(const struct cntr_entry *entry,
-                                  void *context, int idx, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       if (dd->per_sdma && idx < dd->num_sdma)
-               return dd->per_sdma[idx].idle_int_cnt;
-       return 0;
-}
-
-static u64 access_sde_progress_int_cnt(const struct cntr_entry *entry,
-                                      void *context, int idx, int mode,
-                                      u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       if (dd->per_sdma && idx < dd->num_sdma)
-               return dd->per_sdma[idx].progress_int_cnt;
-       return 0;
-}
-
-static u64 dev_access_u64_csr(const struct cntr_entry *entry, void *context,
-                             int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = context;
-
-       u64 val = 0;
-       u64 csr = entry->csr;
-
-       if (entry->flags & CNTR_VL) {
-               if (vl == CNTR_INVALID_VL)
-                       return 0;
-               csr += 8 * vl;
-       } else {
-               if (vl != CNTR_INVALID_VL)
-                       return 0;
-       }
-
-       val = read_write_csr(dd, csr, mode, data);
-       return val;
-}
-
-static u64 dc_access_lcb_cntr(const struct cntr_entry *entry, void *context,
-                             int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = context;
-       u32 csr = entry->csr;
-       int ret = 0;
-
-       if (vl != CNTR_INVALID_VL)
-               return 0;
-       if (mode == CNTR_MODE_R)
-               ret = read_lcb_csr(dd, csr, &data);
-       else if (mode == CNTR_MODE_W)
-               ret = write_lcb_csr(dd, csr, data);
-
-       if (ret) {
-               dd_dev_err(dd, "Could not acquire LCB for counter 0x%x", csr);
-               return 0;
-       }
-
-       hfi1_cdbg(CNTR, "csr 0x%x val 0x%llx mode %d", csr, data, mode);
-       return data;
-}
-
-/* Port Access */
-static u64 port_access_u32_csr(const struct cntr_entry *entry, void *context,
-                              int vl, int mode, u64 data)
-{
-       struct hfi1_pportdata *ppd = context;
-
-       if (vl != CNTR_INVALID_VL)
-               return 0;
-       return read_write_csr(ppd->dd, entry->csr, mode, data);
-}
-
-static u64 port_access_u64_csr(const struct cntr_entry *entry,
-                              void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_pportdata *ppd = context;
-       u64 val;
-       u64 csr = entry->csr;
-
-       if (entry->flags & CNTR_VL) {
-               if (vl == CNTR_INVALID_VL)
-                       return 0;
-               csr += 8 * vl;
-       } else {
-               if (vl != CNTR_INVALID_VL)
-                       return 0;
-       }
-       val = read_write_csr(ppd->dd, csr, mode, data);
-       return val;
-}
-
-/* Software defined */
-static inline u64 read_write_sw(struct hfi1_devdata *dd, u64 *cntr, int mode,
-                               u64 data)
-{
-       u64 ret;
-
-       if (mode == CNTR_MODE_R) {
-               ret = *cntr;
-       } else if (mode == CNTR_MODE_W) {
-               *cntr = data;
-               ret = data;
-       } else {
-               dd_dev_err(dd, "Invalid cntr sw access mode");
-               return 0;
-       }
-
-       hfi1_cdbg(CNTR, "val 0x%llx mode %d", ret, mode);
-
-       return ret;
-}
-
-static u64 access_sw_link_dn_cnt(const struct cntr_entry *entry, void *context,
-                                int vl, int mode, u64 data)
-{
-       struct hfi1_pportdata *ppd = context;
-
-       if (vl != CNTR_INVALID_VL)
-               return 0;
-       return read_write_sw(ppd->dd, &ppd->link_downed, mode, data);
-}
-
-static u64 access_sw_link_up_cnt(const struct cntr_entry *entry, void *context,
-                                int vl, int mode, u64 data)
-{
-       struct hfi1_pportdata *ppd = context;
-
-       if (vl != CNTR_INVALID_VL)
-               return 0;
-       return read_write_sw(ppd->dd, &ppd->link_up, mode, data);
-}
-
-static u64 access_sw_unknown_frame_cnt(const struct cntr_entry *entry,
-                                      void *context, int vl, int mode,
-                                      u64 data)
-{
-       struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
-
-       if (vl != CNTR_INVALID_VL)
-               return 0;
-       return read_write_sw(ppd->dd, &ppd->unknown_frame_count, mode, data);
-}
-
-static u64 access_sw_xmit_discards(const struct cntr_entry *entry,
-                                  void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
-       u64 zero = 0;
-       u64 *counter;
-
-       if (vl == CNTR_INVALID_VL)
-               counter = &ppd->port_xmit_discards;
-       else if (vl >= 0 && vl < C_VL_COUNT)
-               counter = &ppd->port_xmit_discards_vl[vl];
-       else
-               counter = &zero;
-
-       return read_write_sw(ppd->dd, counter, mode, data);
-}
-
-static u64 access_xmit_constraint_errs(const struct cntr_entry *entry,
-                                      void *context, int vl, int mode,
-                                      u64 data)
-{
-       struct hfi1_pportdata *ppd = context;
-
-       if (vl != CNTR_INVALID_VL)
-               return 0;
-
-       return read_write_sw(ppd->dd, &ppd->port_xmit_constraint_errors,
-                            mode, data);
-}
-
-static u64 access_rcv_constraint_errs(const struct cntr_entry *entry,
-                                     void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_pportdata *ppd = context;
-
-       if (vl != CNTR_INVALID_VL)
-               return 0;
-
-       return read_write_sw(ppd->dd, &ppd->port_rcv_constraint_errors,
-                            mode, data);
-}
-
-u64 get_all_cpu_total(u64 __percpu *cntr)
-{
-       int cpu;
-       u64 counter = 0;
-
-       for_each_possible_cpu(cpu)
-               counter += *per_cpu_ptr(cntr, cpu);
-       return counter;
-}
-
-static u64 read_write_cpu(struct hfi1_devdata *dd, u64 *z_val,
-                         u64 __percpu *cntr,
-                         int vl, int mode, u64 data)
-{
-       u64 ret = 0;
-
-       if (vl != CNTR_INVALID_VL)
-               return 0;
-
-       if (mode == CNTR_MODE_R) {
-               ret = get_all_cpu_total(cntr) - *z_val;
-       } else if (mode == CNTR_MODE_W) {
-               /* A write can only zero the counter */
-               if (data == 0)
-                       *z_val = get_all_cpu_total(cntr);
-               else
-                       dd_dev_err(dd, "Per CPU cntrs can only be zeroed");
-       } else {
-               dd_dev_err(dd, "Invalid cntr sw cpu access mode");
-               return 0;
-       }
-
-       return ret;
-}
-
-static u64 access_sw_cpu_intr(const struct cntr_entry *entry,
-                             void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = context;
-
-       return read_write_cpu(dd, &dd->z_int_counter, dd->int_counter, vl,
-                             mode, data);
-}
-
-static u64 access_sw_cpu_rcv_limit(const struct cntr_entry *entry,
-                                  void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = context;
-
-       return read_write_cpu(dd, &dd->z_rcv_limit, dd->rcv_limit, vl,
-                             mode, data);
-}
-
-static u64 access_sw_pio_wait(const struct cntr_entry *entry,
-                             void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = context;
-
-       return dd->verbs_dev.n_piowait;
-}
-
-static u64 access_sw_pio_drain(const struct cntr_entry *entry,
-                              void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->verbs_dev.n_piodrain;
-}
-
-static u64 access_sw_vtx_wait(const struct cntr_entry *entry,
-                             void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = context;
-
-       return dd->verbs_dev.n_txwait;
-}
-
-static u64 access_sw_kmem_wait(const struct cntr_entry *entry,
-                              void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = context;
-
-       return dd->verbs_dev.n_kmem_wait;
-}
-
-static u64 access_sw_send_schedule(const struct cntr_entry *entry,
-                                  void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return read_write_cpu(dd, &dd->z_send_schedule, dd->send_schedule, vl,
-                             mode, data);
-}
-
-/* Software counters for the error status bits within MISC_ERR_STATUS */
-static u64 access_misc_pll_lock_fail_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->misc_err_status_cnt[12];
-}
-
-static u64 access_misc_mbist_fail_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->misc_err_status_cnt[11];
-}
-
-static u64 access_misc_invalid_eep_cmd_err_cnt(const struct cntr_entry *entry,
-                                              void *context, int vl, int mode,
-                                              u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->misc_err_status_cnt[10];
-}
-
-static u64 access_misc_efuse_done_parity_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->misc_err_status_cnt[9];
-}
-
-static u64 access_misc_efuse_write_err_cnt(const struct cntr_entry *entry,
-                                          void *context, int vl, int mode,
-                                          u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->misc_err_status_cnt[8];
-}
-
-static u64 access_misc_efuse_read_bad_addr_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->misc_err_status_cnt[7];
-}
-
-static u64 access_misc_efuse_csr_parity_err_cnt(const struct cntr_entry *entry,
-                                               void *context, int vl,
-                                               int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->misc_err_status_cnt[6];
-}
-
-static u64 access_misc_fw_auth_failed_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->misc_err_status_cnt[5];
-}
-
-static u64 access_misc_key_mismatch_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->misc_err_status_cnt[4];
-}
-
-static u64 access_misc_sbus_write_failed_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->misc_err_status_cnt[3];
-}
-
-static u64 access_misc_csr_write_bad_addr_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->misc_err_status_cnt[2];
-}
-
-static u64 access_misc_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->misc_err_status_cnt[1];
-}
-
-static u64 access_misc_csr_parity_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->misc_err_status_cnt[0];
-}
-
-/*
- * Software counter for the aggregate of
- * individual CceErrStatus counters
- */
-static u64 access_sw_cce_err_status_aggregated_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_cce_err_status_aggregate;
-}
-
-/*
- * Software counters corresponding to each of the
- * error status bits within CceErrStatus
- */
-static u64 access_cce_msix_csr_parity_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[40];
-}
-
-static u64 access_cce_int_map_unc_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[39];
-}
-
-static u64 access_cce_int_map_cor_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[38];
-}
-
-static u64 access_cce_msix_table_unc_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[37];
-}
-
-static u64 access_cce_msix_table_cor_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[36];
-}
-
-static u64 access_cce_rxdma_conv_fifo_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[35];
-}
-
-static u64 access_cce_rcpl_async_fifo_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[34];
-}
-
-static u64 access_cce_seg_write_bad_addr_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[33];
-}
-
-static u64 access_cce_seg_read_bad_addr_err_cnt(const struct cntr_entry *entry,
-                                               void *context, int vl, int mode,
-                                               u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[32];
-}
-
-static u64 access_la_triggered_cnt(const struct cntr_entry *entry,
-                                  void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[31];
-}
-
-static u64 access_cce_trgt_cpl_timeout_err_cnt(const struct cntr_entry *entry,
-                                              void *context, int vl, int mode,
-                                              u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[30];
-}
-
-static u64 access_pcic_receive_parity_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[29];
-}
-
-static u64 access_pcic_transmit_back_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[28];
-}
-
-static u64 access_pcic_transmit_front_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[27];
-}
-
-static u64 access_pcic_cpl_dat_q_unc_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[26];
-}
-
-static u64 access_pcic_cpl_hd_q_unc_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[25];
-}
-
-static u64 access_pcic_post_dat_q_unc_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[24];
-}
-
-static u64 access_pcic_post_hd_q_unc_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[23];
-}
-
-static u64 access_pcic_retry_sot_mem_unc_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[22];
-}
-
-static u64 access_pcic_retry_mem_unc_err(const struct cntr_entry *entry,
-                                        void *context, int vl, int mode,
-                                        u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[21];
-}
-
-static u64 access_pcic_n_post_dat_q_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[20];
-}
-
-static u64 access_pcic_n_post_h_q_parity_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[19];
-}
-
-static u64 access_pcic_cpl_dat_q_cor_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[18];
-}
-
-static u64 access_pcic_cpl_hd_q_cor_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[17];
-}
-
-static u64 access_pcic_post_dat_q_cor_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[16];
-}
-
-static u64 access_pcic_post_hd_q_cor_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[15];
-}
-
-static u64 access_pcic_retry_sot_mem_cor_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[14];
-}
-
-static u64 access_pcic_retry_mem_cor_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[13];
-}
-
-static u64 access_cce_cli1_async_fifo_dbg_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[12];
-}
-
-static u64 access_cce_cli1_async_fifo_rxdma_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[11];
-}
-
-static u64 access_cce_cli1_async_fifo_sdma_hd_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[10];
-}
-
-static u64 access_cce_cl1_async_fifo_pio_crdt_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[9];
-}
-
-static u64 access_cce_cli2_async_fifo_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[8];
-}
-
-static u64 access_cce_csr_cfg_bus_parity_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[7];
-}
-
-static u64 access_cce_cli0_async_fifo_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[6];
-}
-
-static u64 access_cce_rspd_data_parity_err_cnt(const struct cntr_entry *entry,
-                                              void *context, int vl, int mode,
-                                              u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[5];
-}
-
-static u64 access_cce_trgt_access_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[4];
-}
-
-static u64 access_cce_trgt_async_fifo_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[3];
-}
-
-static u64 access_cce_csr_write_bad_addr_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[2];
-}
-
-static u64 access_cce_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry,
-                                               void *context, int vl,
-                                               int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[1];
-}
-
-static u64 access_ccs_csr_parity_err_cnt(const struct cntr_entry *entry,
-                                        void *context, int vl, int mode,
-                                        u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->cce_err_status_cnt[0];
-}
-
-/*
- * Software counters corresponding to each of the
- * error status bits within RcvErrStatus
- */
-static u64 access_rx_csr_parity_err_cnt(const struct cntr_entry *entry,
-                                       void *context, int vl, int mode,
-                                       u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[63];
-}
-
-static u64 access_rx_csr_write_bad_addr_err_cnt(const struct cntr_entry *entry,
-                                               void *context, int vl,
-                                               int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[62];
-}
-
-static u64 access_rx_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry,
-                                              void *context, int vl, int mode,
-                                              u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[61];
-}
-
-static u64 access_rx_dma_csr_unc_err_cnt(const struct cntr_entry *entry,
-                                        void *context, int vl, int mode,
-                                        u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[60];
-}
-
-static u64 access_rx_dma_dq_fsm_encoding_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[59];
-}
-
-static u64 access_rx_dma_eq_fsm_encoding_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[58];
-}
-
-static u64 access_rx_dma_csr_parity_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[57];
-}
-
-static u64 access_rx_rbuf_data_cor_err_cnt(const struct cntr_entry *entry,
-                                          void *context, int vl, int mode,
-                                          u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[56];
-}
-
-static u64 access_rx_rbuf_data_unc_err_cnt(const struct cntr_entry *entry,
-                                          void *context, int vl, int mode,
-                                          u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[55];
-}
-
-static u64 access_rx_dma_data_fifo_rd_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[54];
-}
-
-static u64 access_rx_dma_data_fifo_rd_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[53];
-}
-
-static u64 access_rx_dma_hdr_fifo_rd_cor_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[52];
-}
-
-static u64 access_rx_dma_hdr_fifo_rd_unc_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[51];
-}
-
-static u64 access_rx_rbuf_desc_part2_cor_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[50];
-}
-
-static u64 access_rx_rbuf_desc_part2_unc_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[49];
-}
-
-static u64 access_rx_rbuf_desc_part1_cor_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[48];
-}
-
-static u64 access_rx_rbuf_desc_part1_unc_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[47];
-}
-
-static u64 access_rx_hq_intr_fsm_err_cnt(const struct cntr_entry *entry,
-                                        void *context, int vl, int mode,
-                                        u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[46];
-}
-
-static u64 access_rx_hq_intr_csr_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[45];
-}
-
-static u64 access_rx_lookup_csr_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[44];
-}
-
-static u64 access_rx_lookup_rcv_array_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[43];
-}
-
-static u64 access_rx_lookup_rcv_array_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[42];
-}
-
-static u64 access_rx_lookup_des_part2_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[41];
-}
-
-static u64 access_rx_lookup_des_part1_unc_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[40];
-}
-
-static u64 access_rx_lookup_des_part1_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[39];
-}
-
-static u64 access_rx_rbuf_next_free_buf_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[38];
-}
-
-static u64 access_rx_rbuf_next_free_buf_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[37];
-}
-
-static u64 access_rbuf_fl_init_wr_addr_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[36];
-}
-
-static u64 access_rx_rbuf_fl_initdone_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[35];
-}
-
-static u64 access_rx_rbuf_fl_write_addr_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[34];
-}
-
-static u64 access_rx_rbuf_fl_rd_addr_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[33];
-}
-
-static u64 access_rx_rbuf_empty_err_cnt(const struct cntr_entry *entry,
-                                       void *context, int vl, int mode,
-                                       u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[32];
-}
-
-static u64 access_rx_rbuf_full_err_cnt(const struct cntr_entry *entry,
-                                      void *context, int vl, int mode,
-                                      u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[31];
-}
-
-static u64 access_rbuf_bad_lookup_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[30];
-}
-
-static u64 access_rbuf_ctx_id_parity_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[29];
-}
-
-static u64 access_rbuf_csr_qeopdw_parity_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[28];
-}
-
-static u64 access_rx_rbuf_csr_q_num_of_pkt_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[27];
-}
-
-static u64 access_rx_rbuf_csr_q_t1_ptr_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[26];
-}
-
-static u64 access_rx_rbuf_csr_q_hd_ptr_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[25];
-}
-
-static u64 access_rx_rbuf_csr_q_vld_bit_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[24];
-}
-
-static u64 access_rx_rbuf_csr_q_next_buf_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[23];
-}
-
-static u64 access_rx_rbuf_csr_q_ent_cnt_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[22];
-}
-
-static u64 access_rx_rbuf_csr_q_head_buf_num_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[21];
-}
-
-static u64 access_rx_rbuf_block_list_read_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[20];
-}
-
-static u64 access_rx_rbuf_block_list_read_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[19];
-}
-
-static u64 access_rx_rbuf_lookup_des_cor_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[18];
-}
-
-static u64 access_rx_rbuf_lookup_des_unc_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[17];
-}
-
-static u64 access_rx_rbuf_lookup_des_reg_unc_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[16];
-}
-
-static u64 access_rx_rbuf_lookup_des_reg_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[15];
-}
-
-static u64 access_rx_rbuf_free_list_cor_err_cnt(const struct cntr_entry *entry,
-                                               void *context, int vl,
-                                               int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[14];
-}
-
-static u64 access_rx_rbuf_free_list_unc_err_cnt(const struct cntr_entry *entry,
-                                               void *context, int vl,
-                                               int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[13];
-}
-
-static u64 access_rx_rcv_fsm_encoding_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[12];
-}
-
-static u64 access_rx_dma_flag_cor_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[11];
-}
-
-static u64 access_rx_dma_flag_unc_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[10];
-}
-
-static u64 access_rx_dc_sop_eop_parity_err_cnt(const struct cntr_entry *entry,
-                                              void *context, int vl, int mode,
-                                              u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[9];
-}
-
-static u64 access_rx_rcv_csr_parity_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[8];
-}
-
-static u64 access_rx_rcv_qp_map_table_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[7];
-}
-
-static u64 access_rx_rcv_qp_map_table_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[6];
-}
-
-static u64 access_rx_rcv_data_cor_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[5];
-}
-
-static u64 access_rx_rcv_data_unc_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[4];
-}
-
-static u64 access_rx_rcv_hdr_cor_err_cnt(const struct cntr_entry *entry,
-                                        void *context, int vl, int mode,
-                                        u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[3];
-}
-
-static u64 access_rx_rcv_hdr_unc_err_cnt(const struct cntr_entry *entry,
-                                        void *context, int vl, int mode,
-                                        u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[2];
-}
-
-static u64 access_rx_dc_intf_parity_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[1];
-}
-
-static u64 access_rx_dma_csr_cor_err_cnt(const struct cntr_entry *entry,
-                                        void *context, int vl, int mode,
-                                        u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->rcv_err_status_cnt[0];
-}
-
-/*
- * Software counters corresponding to each of the
- * error status bits within SendPioErrStatus
- */
-static u64 access_pio_pec_sop_head_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[35];
-}
-
-static u64 access_pio_pcc_sop_head_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[34];
-}
-
-static u64 access_pio_last_returned_cnt_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[33];
-}
-
-static u64 access_pio_current_free_cnt_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[32];
-}
-
-static u64 access_pio_reserved_31_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[31];
-}
-
-static u64 access_pio_reserved_30_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[30];
-}
-
-static u64 access_pio_ppmc_sop_len_err_cnt(const struct cntr_entry *entry,
-                                          void *context, int vl, int mode,
-                                          u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[29];
-}
-
-static u64 access_pio_ppmc_bqc_mem_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[28];
-}
-
-static u64 access_pio_vl_fifo_parity_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[27];
-}
-
-static u64 access_pio_vlf_sop_parity_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[26];
-}
-
-static u64 access_pio_vlf_v1_len_parity_err_cnt(const struct cntr_entry *entry,
-                                               void *context, int vl,
-                                               int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[25];
-}
-
-static u64 access_pio_block_qw_count_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[24];
-}
-
-static u64 access_pio_write_qw_valid_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[23];
-}
-
-static u64 access_pio_state_machine_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[22];
-}
-
-static u64 access_pio_write_data_parity_err_cnt(const struct cntr_entry *entry,
-                                               void *context, int vl,
-                                               int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[21];
-}
-
-static u64 access_pio_host_addr_mem_cor_err_cnt(const struct cntr_entry *entry,
-                                               void *context, int vl,
-                                               int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[20];
-}
-
-static u64 access_pio_host_addr_mem_unc_err_cnt(const struct cntr_entry *entry,
-                                               void *context, int vl,
-                                               int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[19];
-}
-
-static u64 access_pio_pkt_evict_sm_or_arb_sm_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[18];
-}
-
-static u64 access_pio_init_sm_in_err_cnt(const struct cntr_entry *entry,
-                                        void *context, int vl, int mode,
-                                        u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[17];
-}
-
-static u64 access_pio_ppmc_pbl_fifo_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[16];
-}
-
-static u64 access_pio_credit_ret_fifo_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[15];
-}
-
-static u64 access_pio_v1_len_mem_bank1_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[14];
-}
-
-static u64 access_pio_v1_len_mem_bank0_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[13];
-}
-
-static u64 access_pio_v1_len_mem_bank1_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[12];
-}
-
-static u64 access_pio_v1_len_mem_bank0_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[11];
-}
-
-static u64 access_pio_sm_pkt_reset_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[10];
-}
-
-static u64 access_pio_pkt_evict_fifo_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[9];
-}
-
-static u64 access_pio_sbrdctrl_crrel_fifo_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[8];
-}
-
-static u64 access_pio_sbrdctl_crrel_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[7];
-}
-
-static u64 access_pio_pec_fifo_parity_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[6];
-}
-
-static u64 access_pio_pcc_fifo_parity_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[5];
-}
-
-static u64 access_pio_sb_mem_fifo1_err_cnt(const struct cntr_entry *entry,
-                                          void *context, int vl, int mode,
-                                          u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[4];
-}
-
-static u64 access_pio_sb_mem_fifo0_err_cnt(const struct cntr_entry *entry,
-                                          void *context, int vl, int mode,
-                                          u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[3];
-}
-
-static u64 access_pio_csr_parity_err_cnt(const struct cntr_entry *entry,
-                                        void *context, int vl, int mode,
-                                        u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[2];
-}
-
-static u64 access_pio_write_addr_parity_err_cnt(const struct cntr_entry *entry,
-                                               void *context, int vl,
-                                               int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[1];
-}
-
-static u64 access_pio_write_bad_ctxt_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_pio_err_status_cnt[0];
-}
-
-/*
- * Software counters corresponding to each of the
- * error status bits within SendDmaErrStatus
- */
-static u64 access_sdma_pcie_req_tracking_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_dma_err_status_cnt[3];
-}
-
-static u64 access_sdma_pcie_req_tracking_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_dma_err_status_cnt[2];
-}
-
-static u64 access_sdma_csr_parity_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_dma_err_status_cnt[1];
-}
-
-static u64 access_sdma_rpy_tag_err_cnt(const struct cntr_entry *entry,
-                                      void *context, int vl, int mode,
-                                      u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_dma_err_status_cnt[0];
-}
-
-/*
- * Software counters corresponding to each of the
- * error status bits within SendEgressErrStatus
- */
-static u64 access_tx_read_pio_memory_csr_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[63];
-}
-
-static u64 access_tx_read_sdma_memory_csr_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[62];
-}
-
-static u64 access_tx_egress_fifo_cor_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[61];
-}
-
-static u64 access_tx_read_pio_memory_cor_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[60];
-}
-
-static u64 access_tx_read_sdma_memory_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[59];
-}
-
-static u64 access_tx_sb_hdr_cor_err_cnt(const struct cntr_entry *entry,
-                                       void *context, int vl, int mode,
-                                       u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[58];
-}
-
-static u64 access_tx_credit_overrun_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[57];
-}
-
-static u64 access_tx_launch_fifo8_cor_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[56];
-}
-
-static u64 access_tx_launch_fifo7_cor_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[55];
-}
-
-static u64 access_tx_launch_fifo6_cor_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[54];
-}
-
-static u64 access_tx_launch_fifo5_cor_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[53];
-}
-
-static u64 access_tx_launch_fifo4_cor_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[52];
-}
-
-static u64 access_tx_launch_fifo3_cor_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[51];
-}
-
-static u64 access_tx_launch_fifo2_cor_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[50];
-}
-
-static u64 access_tx_launch_fifo1_cor_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[49];
-}
-
-static u64 access_tx_launch_fifo0_cor_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[48];
-}
-
-static u64 access_tx_credit_return_vl_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[47];
-}
-
-static u64 access_tx_hcrc_insertion_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[46];
-}
-
-static u64 access_tx_egress_fifo_unc_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[45];
-}
-
-static u64 access_tx_read_pio_memory_unc_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[44];
-}
-
-static u64 access_tx_read_sdma_memory_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[43];
-}
-
-static u64 access_tx_sb_hdr_unc_err_cnt(const struct cntr_entry *entry,
-                                       void *context, int vl, int mode,
-                                       u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[42];
-}
-
-static u64 access_tx_credit_return_partiy_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[41];
-}
-
-static u64 access_tx_launch_fifo8_unc_or_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[40];
-}
-
-static u64 access_tx_launch_fifo7_unc_or_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[39];
-}
-
-static u64 access_tx_launch_fifo6_unc_or_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[38];
-}
-
-static u64 access_tx_launch_fifo5_unc_or_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[37];
-}
-
-static u64 access_tx_launch_fifo4_unc_or_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[36];
-}
-
-static u64 access_tx_launch_fifo3_unc_or_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[35];
-}
-
-static u64 access_tx_launch_fifo2_unc_or_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[34];
-}
-
-static u64 access_tx_launch_fifo1_unc_or_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[33];
-}
-
-static u64 access_tx_launch_fifo0_unc_or_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[32];
-}
-
-static u64 access_tx_sdma15_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[31];
-}
-
-static u64 access_tx_sdma14_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[30];
-}
-
-static u64 access_tx_sdma13_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[29];
-}
-
-static u64 access_tx_sdma12_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[28];
-}
-
-static u64 access_tx_sdma11_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[27];
-}
-
-static u64 access_tx_sdma10_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[26];
-}
-
-static u64 access_tx_sdma9_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[25];
-}
-
-static u64 access_tx_sdma8_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[24];
-}
-
-static u64 access_tx_sdma7_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[23];
-}
-
-static u64 access_tx_sdma6_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[22];
-}
-
-static u64 access_tx_sdma5_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[21];
-}
-
-static u64 access_tx_sdma4_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[20];
-}
-
-static u64 access_tx_sdma3_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[19];
-}
-
-static u64 access_tx_sdma2_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[18];
-}
-
-static u64 access_tx_sdma1_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[17];
-}
-
-static u64 access_tx_sdma0_disallowed_packet_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[16];
-}
-
-static u64 access_tx_config_parity_err_cnt(const struct cntr_entry *entry,
-                                          void *context, int vl, int mode,
-                                          u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[15];
-}
-
-static u64 access_tx_sbrd_ctl_csr_parity_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[14];
-}
-
-static u64 access_tx_launch_csr_parity_err_cnt(const struct cntr_entry *entry,
-                                              void *context, int vl, int mode,
-                                              u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[13];
-}
-
-static u64 access_tx_illegal_vl_err_cnt(const struct cntr_entry *entry,
-                                       void *context, int vl, int mode,
-                                       u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[12];
-}
-
-static u64 access_tx_sbrd_ctl_state_machine_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[11];
-}
-
-static u64 access_egress_reserved_10_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[10];
-}
-
-static u64 access_egress_reserved_9_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[9];
-}
-
-static u64 access_tx_sdma_launch_intf_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[8];
-}
-
-static u64 access_tx_pio_launch_intf_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[7];
-}
-
-static u64 access_egress_reserved_6_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[6];
-}
-
-static u64 access_tx_incorrect_link_state_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[5];
-}
-
-static u64 access_tx_linkdown_err_cnt(const struct cntr_entry *entry,
-                                     void *context, int vl, int mode,
-                                     u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[4];
-}
-
-static u64 access_tx_egress_fifi_underrun_or_parity_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[3];
-}
-
-static u64 access_egress_reserved_2_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[2];
-}
-
-static u64 access_tx_pkt_integrity_mem_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[1];
-}
-
-static u64 access_tx_pkt_integrity_mem_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_egress_err_status_cnt[0];
-}
-
-/*
- * Software counters corresponding to each of the
- * error status bits within SendErrStatus
- */
-static u64 access_send_csr_write_bad_addr_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_err_status_cnt[2];
-}
-
-static u64 access_send_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry,
-                                                void *context, int vl,
-                                                int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_err_status_cnt[1];
-}
-
-static u64 access_send_csr_parity_cnt(const struct cntr_entry *entry,
-                                     void *context, int vl, int mode,
-                                     u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->send_err_status_cnt[0];
-}
-
-/*
- * Software counters corresponding to each of the
- * error status bits within SendCtxtErrStatus
- */
-static u64 access_pio_write_out_of_bounds_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_ctxt_err_status_cnt[4];
-}
-
-static u64 access_pio_write_overflow_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_ctxt_err_status_cnt[3];
-}
-
-static u64 access_pio_write_crosses_boundary_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_ctxt_err_status_cnt[2];
-}
-
-static u64 access_pio_disallowed_packet_err_cnt(const struct cntr_entry *entry,
-                                               void *context, int vl,
-                                               int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_ctxt_err_status_cnt[1];
-}
-
-static u64 access_pio_inconsistent_sop_err_cnt(const struct cntr_entry *entry,
-                                              void *context, int vl, int mode,
-                                              u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_ctxt_err_status_cnt[0];
-}
-
-/*
- * Software counters corresponding to each of the
- * error status bits within SendDmaEngErrStatus
- */
-static u64 access_sdma_header_request_fifo_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[23];
-}
-
-static u64 access_sdma_header_storage_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[22];
-}
-
-static u64 access_sdma_packet_tracking_cor_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[21];
-}
-
-static u64 access_sdma_assembly_cor_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[20];
-}
-
-static u64 access_sdma_desc_table_cor_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[19];
-}
-
-static u64 access_sdma_header_request_fifo_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[18];
-}
-
-static u64 access_sdma_header_storage_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[17];
-}
-
-static u64 access_sdma_packet_tracking_unc_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[16];
-}
-
-static u64 access_sdma_assembly_unc_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[15];
-}
-
-static u64 access_sdma_desc_table_unc_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[14];
-}
-
-static u64 access_sdma_timeout_err_cnt(const struct cntr_entry *entry,
-                                      void *context, int vl, int mode,
-                                      u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[13];
-}
-
-static u64 access_sdma_header_length_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[12];
-}
-
-static u64 access_sdma_header_address_err_cnt(const struct cntr_entry *entry,
-                                             void *context, int vl, int mode,
-                                             u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-
-       return dd->sw_send_dma_eng_err_status_cnt[11];
-}
-
-static u64 access_sdma_header_select_err_cnt(const struct cntr_entry *entry,
-                                            void *context, int vl, int mode,
-                                            u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-       /* Read-only lookup of slot 10; entry, vl, mode and data go unused. */
-       return dd->sw_send_dma_eng_err_status_cnt[10];
-}
-
-static u64 access_sdma_reserved_9_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-       /* Read-only lookup of slot 9; entry, vl, mode and data go unused. */
-       return dd->sw_send_dma_eng_err_status_cnt[9];
-}
-
-static u64 access_sdma_packet_desc_overflow_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-       /* Read-only lookup of slot 8; entry, vl, mode and data go unused. */
-       return dd->sw_send_dma_eng_err_status_cnt[8];
-}
-
-static u64 access_sdma_length_mismatch_err_cnt(const struct cntr_entry *entry,
-                                              void *context, int vl,
-                                              int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-       /* Read-only lookup of slot 7; entry, vl, mode and data go unused. */
-       return dd->sw_send_dma_eng_err_status_cnt[7];
-}
-
-static u64 access_sdma_halt_err_cnt(const struct cntr_entry *entry,
-                                   void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-       /* Read-only lookup of slot 6; entry, vl, mode and data go unused. */
-       return dd->sw_send_dma_eng_err_status_cnt[6];
-}
-
-static u64 access_sdma_mem_read_err_cnt(const struct cntr_entry *entry,
-                                       void *context, int vl, int mode,
-                                       u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-       /* Read-only lookup of slot 5; entry, vl, mode and data go unused. */
-       return dd->sw_send_dma_eng_err_status_cnt[5];
-}
-
-static u64 access_sdma_first_desc_err_cnt(const struct cntr_entry *entry,
-                                         void *context, int vl, int mode,
-                                         u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-       /* Read-only lookup of slot 4; entry, vl, mode and data go unused. */
-       return dd->sw_send_dma_eng_err_status_cnt[4];
-}
-
-static u64 access_sdma_tail_out_of_bounds_err_cnt(
-                               const struct cntr_entry *entry,
-                               void *context, int vl, int mode, u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-       /* Read-only lookup of slot 3; entry, vl, mode and data go unused. */
-       return dd->sw_send_dma_eng_err_status_cnt[3];
-}
-
-static u64 access_sdma_too_long_err_cnt(const struct cntr_entry *entry,
-                                       void *context, int vl, int mode,
-                                       u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-       /* Read-only lookup of slot 2; entry, vl, mode and data go unused. */
-       return dd->sw_send_dma_eng_err_status_cnt[2];
-}
-
-static u64 access_sdma_gen_mismatch_err_cnt(const struct cntr_entry *entry,
-                                           void *context, int vl, int mode,
-                                           u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-       /* Read-only lookup of slot 1; entry, vl, mode and data go unused. */
-       return dd->sw_send_dma_eng_err_status_cnt[1];
-}
-
-static u64 access_sdma_wrong_dw_err_cnt(const struct cntr_entry *entry,
-                                       void *context, int vl, int mode,
-                                       u64 data)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
-       /* Read-only lookup of slot 0; entry, vl, mode and data go unused. */
-       return dd->sw_send_dma_eng_err_status_cnt[0];
-}
-
-#define def_access_sw_cpu(cntr) /* emits access_sw_cpu_<cntr>() */ \
-static u64 access_sw_cpu_##cntr(const struct cntr_entry *entry,                      \
-                             void *context, int vl, int mode, u64 data)      \
-{       /* thin wrapper around read_write_cpu() */                           \
-       struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;        \
-       return read_write_cpu(ppd->dd, &ppd->ibport_data.rvp.z_ ##cntr,       \
-                             ppd->ibport_data.rvp.cntr, vl,                  \
-                             mode, data);                                    \
-}
-
-def_access_sw_cpu(rc_acks);         /* emits access_sw_cpu_rc_acks() */
-def_access_sw_cpu(rc_qacks);        /* emits access_sw_cpu_rc_qacks() */
-def_access_sw_cpu(rc_delayed_comp); /* emits access_sw_cpu_rc_delayed_comp() */
-
-#define def_access_ibp_counter(cntr) /* emits access_ibp_<cntr>() */ \
-static u64 access_ibp_##cntr(const struct cntr_entry *entry,                 \
-                               void *context, int vl, int mode, u64 data)    \
-{                                                                            \
-       struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;        \
-       /* any vl other than CNTR_INVALID_VL just returns 0 */                \
-       if (vl != CNTR_INVALID_VL)                                            \
-               return 0;                                                     \
-       /* otherwise hand off to read_write_sw() on rvp.n_<cntr> */           \
-       return read_write_sw(ppd->dd, &ppd->ibport_data.rvp.n_ ##cntr,        \
-                            mode, data);                                     \
-}
-
-def_access_ibp_counter(loop_pkts);      /* emits access_ibp_loop_pkts() */
-def_access_ibp_counter(rc_resends);     /* emits access_ibp_rc_resends() */
-def_access_ibp_counter(rnr_naks);       /* emits access_ibp_rnr_naks() */
-def_access_ibp_counter(other_naks);     /* emits access_ibp_other_naks() */
-def_access_ibp_counter(rc_timeouts);    /* emits access_ibp_rc_timeouts() */
-def_access_ibp_counter(pkt_drops);      /* emits access_ibp_pkt_drops() */
-def_access_ibp_counter(dmawait);        /* emits access_ibp_dmawait() */
-def_access_ibp_counter(rc_seqnak);      /* emits access_ibp_rc_seqnak() */
-def_access_ibp_counter(rc_dupreq);      /* emits access_ibp_rc_dupreq() */
-def_access_ibp_counter(rdma_seq);       /* emits access_ibp_rdma_seq() */
-def_access_ibp_counter(unaligned);      /* emits access_ibp_unaligned() */
-def_access_ibp_counter(seq_naks);       /* emits access_ibp_seq_naks() */
-
-static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = {
-[C_RCV_OVF] = RXE32_DEV_CNTR_ELEM(RcvOverflow, RCV_BUF_OVFL_CNT, CNTR_SYNTH),
-[C_RX_TID_FULL] = RXE32_DEV_CNTR_ELEM(RxTIDFullEr, RCV_TID_FULL_ERR_CNT,
-                       CNTR_NORMAL),
-[C_RX_TID_INVALID] = RXE32_DEV_CNTR_ELEM(RxTIDInvalid, RCV_TID_VALID_ERR_CNT,
-                       CNTR_NORMAL),
-[C_RX_TID_FLGMS] = RXE32_DEV_CNTR_ELEM(RxTidFLGMs,
-                       RCV_TID_FLOW_GEN_MISMATCH_CNT,
-                       CNTR_NORMAL),
-[C_RX_CTX_EGRS] = RXE32_DEV_CNTR_ELEM(RxCtxEgrS, RCV_CONTEXT_EGR_STALL,
-                       CNTR_NORMAL),
-[C_RCV_TID_FLSMS] = RXE32_DEV_CNTR_ELEM(RxTidFLSMs,
-                       RCV_TID_FLOW_SEQ_MISMATCH_CNT, CNTR_NORMAL),
-[C_CCE_PCI_CR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePciCrSt,
-                       CCE_PCIE_POSTED_CRDT_STALL_CNT, CNTR_NORMAL),
-[C_CCE_PCI_TR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePciTrSt, CCE_PCIE_TRGT_STALL_CNT,
-                       CNTR_NORMAL),
-[C_CCE_PIO_WR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePioWrSt, CCE_PIO_WR_STALL_CNT,
-                       CNTR_NORMAL),
-[C_CCE_ERR_INT] = CCE_INT_DEV_CNTR_ELEM(CceErrInt, CCE_ERR_INT_CNT,
-                       CNTR_NORMAL),
-[C_CCE_SDMA_INT] = CCE_INT_DEV_CNTR_ELEM(CceSdmaInt, CCE_SDMA_INT_CNT,
-                       CNTR_NORMAL),
-[C_CCE_MISC_INT] = CCE_INT_DEV_CNTR_ELEM(CceMiscInt, CCE_MISC_INT_CNT,
-                       CNTR_NORMAL),
-[C_CCE_RCV_AV_INT] = CCE_INT_DEV_CNTR_ELEM(CceRcvAvInt, CCE_RCV_AVAIL_INT_CNT,
-                       CNTR_NORMAL),
-[C_CCE_RCV_URG_INT] = CCE_INT_DEV_CNTR_ELEM(CceRcvUrgInt,
-                       CCE_RCV_URGENT_INT_CNT, CNTR_NORMAL),
-[C_CCE_SEND_CR_INT] = CCE_INT_DEV_CNTR_ELEM(CceSndCrInt,
-                       CCE_SEND_CREDIT_INT_CNT, CNTR_NORMAL),
-[C_DC_UNC_ERR] = DC_PERF_CNTR(DcUnctblErr, DCC_ERR_UNCORRECTABLE_CNT,
-                             CNTR_SYNTH),
-[C_DC_RCV_ERR] = DC_PERF_CNTR(DcRecvErr, DCC_ERR_PORTRCV_ERR_CNT, CNTR_SYNTH),
-[C_DC_FM_CFG_ERR] = DC_PERF_CNTR(DcFmCfgErr, DCC_ERR_FMCONFIG_ERR_CNT,
-                                CNTR_SYNTH),
-[C_DC_RMT_PHY_ERR] = DC_PERF_CNTR(DcRmtPhyErr, DCC_ERR_RCVREMOTE_PHY_ERR_CNT,
-                                 CNTR_SYNTH),
-[C_DC_DROPPED_PKT] = DC_PERF_CNTR(DcDroppedPkt, DCC_ERR_DROPPED_PKT_CNT,
-                                 CNTR_SYNTH),
-[C_DC_MC_XMIT_PKTS] = DC_PERF_CNTR(DcMcXmitPkts,
-                                  DCC_PRF_PORT_XMIT_MULTICAST_CNT, CNTR_SYNTH),
-[C_DC_MC_RCV_PKTS] = DC_PERF_CNTR(DcMcRcvPkts,
-                                 DCC_PRF_PORT_RCV_MULTICAST_PKT_CNT,
-                                 CNTR_SYNTH),
-[C_DC_XMIT_CERR] = DC_PERF_CNTR(DcXmitCorr,
-                               DCC_PRF_PORT_XMIT_CORRECTABLE_CNT, CNTR_SYNTH),
-[C_DC_RCV_CERR] = DC_PERF_CNTR(DcRcvCorrCnt, DCC_PRF_PORT_RCV_CORRECTABLE_CNT,
-                              CNTR_SYNTH),
-[C_DC_RCV_FCC] = DC_PERF_CNTR(DcRxFCntl, DCC_PRF_RX_FLOW_CRTL_CNT,
-                             CNTR_SYNTH),
-[C_DC_XMIT_FCC] = DC_PERF_CNTR(DcXmitFCntl, DCC_PRF_TX_FLOW_CRTL_CNT,
-                              CNTR_SYNTH),
-[C_DC_XMIT_FLITS] = DC_PERF_CNTR(DcXmitFlits, DCC_PRF_PORT_XMIT_DATA_CNT,
-                                CNTR_SYNTH),
-[C_DC_RCV_FLITS] = DC_PERF_CNTR(DcRcvFlits, DCC_PRF_PORT_RCV_DATA_CNT,
-                               CNTR_SYNTH),
-[C_DC_XMIT_PKTS] = DC_PERF_CNTR(DcXmitPkts, DCC_PRF_PORT_XMIT_PKTS_CNT,
-                               CNTR_SYNTH),
-[C_DC_RCV_PKTS] = DC_PERF_CNTR(DcRcvPkts, DCC_PRF_PORT_RCV_PKTS_CNT,
-                              CNTR_SYNTH),
-[C_DC_RX_FLIT_VL] = DC_PERF_CNTR(DcRxFlitVl, DCC_PRF_PORT_VL_RCV_DATA_CNT,
-                                CNTR_SYNTH | CNTR_VL),
-[C_DC_RX_PKT_VL] = DC_PERF_CNTR(DcRxPktVl, DCC_PRF_PORT_VL_RCV_PKTS_CNT,
-                               CNTR_SYNTH | CNTR_VL),
-[C_DC_RCV_FCN] = DC_PERF_CNTR(DcRcvFcn, DCC_PRF_PORT_RCV_FECN_CNT, CNTR_SYNTH),
-[C_DC_RCV_FCN_VL] = DC_PERF_CNTR(DcRcvFcnVl, DCC_PRF_PORT_VL_RCV_FECN_CNT,
-                                CNTR_SYNTH | CNTR_VL),
-[C_DC_RCV_BCN] = DC_PERF_CNTR(DcRcvBcn, DCC_PRF_PORT_RCV_BECN_CNT, CNTR_SYNTH),
-[C_DC_RCV_BCN_VL] = DC_PERF_CNTR(DcRcvBcnVl, DCC_PRF_PORT_VL_RCV_BECN_CNT,
-                                CNTR_SYNTH | CNTR_VL),
-[C_DC_RCV_BBL] = DC_PERF_CNTR(DcRcvBbl, DCC_PRF_PORT_RCV_BUBBLE_CNT,
-                             CNTR_SYNTH),
-[C_DC_RCV_BBL_VL] = DC_PERF_CNTR(DcRcvBblVl, DCC_PRF_PORT_VL_RCV_BUBBLE_CNT,
-                                CNTR_SYNTH | CNTR_VL),
-[C_DC_MARK_FECN] = DC_PERF_CNTR(DcMarkFcn, DCC_PRF_PORT_MARK_FECN_CNT,
-                               CNTR_SYNTH),
-[C_DC_MARK_FECN_VL] = DC_PERF_CNTR(DcMarkFcnVl, DCC_PRF_PORT_VL_MARK_FECN_CNT,
-                                  CNTR_SYNTH | CNTR_VL),
-[C_DC_TOTAL_CRC] =
-       DC_PERF_CNTR_LCB(DcTotCrc, DC_LCB_ERR_INFO_TOTAL_CRC_ERR,
-                        CNTR_SYNTH),
-[C_DC_CRC_LN0] = DC_PERF_CNTR_LCB(DcCrcLn0, DC_LCB_ERR_INFO_CRC_ERR_LN0,
-                                 CNTR_SYNTH),
-[C_DC_CRC_LN1] = DC_PERF_CNTR_LCB(DcCrcLn1, DC_LCB_ERR_INFO_CRC_ERR_LN1,
-                                 CNTR_SYNTH),
-[C_DC_CRC_LN2] = DC_PERF_CNTR_LCB(DcCrcLn2, DC_LCB_ERR_INFO_CRC_ERR_LN2,
-                                 CNTR_SYNTH),
-[C_DC_CRC_LN3] = DC_PERF_CNTR_LCB(DcCrcLn3, DC_LCB_ERR_INFO_CRC_ERR_LN3,
-                                 CNTR_SYNTH),
-[C_DC_CRC_MULT_LN] =
-       DC_PERF_CNTR_LCB(DcMultLn, DC_LCB_ERR_INFO_CRC_ERR_MULTI_LN,
-                        CNTR_SYNTH),
-[C_DC_TX_REPLAY] = DC_PERF_CNTR_LCB(DcTxReplay, DC_LCB_ERR_INFO_TX_REPLAY_CNT,
-                                   CNTR_SYNTH),
-[C_DC_RX_REPLAY] = DC_PERF_CNTR_LCB(DcRxReplay, DC_LCB_ERR_INFO_RX_REPLAY_CNT,
-                                   CNTR_SYNTH),
-[C_DC_SEQ_CRC_CNT] =
-       DC_PERF_CNTR_LCB(DcLinkSeqCrc, DC_LCB_ERR_INFO_SEQ_CRC_CNT,
-                        CNTR_SYNTH),
-[C_DC_ESC0_ONLY_CNT] =
-       DC_PERF_CNTR_LCB(DcEsc0, DC_LCB_ERR_INFO_ESCAPE_0_ONLY_CNT,
-                        CNTR_SYNTH),
-[C_DC_ESC0_PLUS1_CNT] =
-       DC_PERF_CNTR_LCB(DcEsc1, DC_LCB_ERR_INFO_ESCAPE_0_PLUS1_CNT,
-                        CNTR_SYNTH),
-[C_DC_ESC0_PLUS2_CNT] =
-       DC_PERF_CNTR_LCB(DcEsc0Plus2, DC_LCB_ERR_INFO_ESCAPE_0_PLUS2_CNT,
-                        CNTR_SYNTH),
-[C_DC_REINIT_FROM_PEER_CNT] =
-       DC_PERF_CNTR_LCB(DcReinitPeer, DC_LCB_ERR_INFO_REINIT_FROM_PEER_CNT,
-                        CNTR_SYNTH),
-[C_DC_SBE_CNT] = DC_PERF_CNTR_LCB(DcSbe, DC_LCB_ERR_INFO_SBE_CNT,
-                                 CNTR_SYNTH),
-[C_DC_MISC_FLG_CNT] =
-       DC_PERF_CNTR_LCB(DcMiscFlg, DC_LCB_ERR_INFO_MISC_FLG_CNT,
-                        CNTR_SYNTH),
-[C_DC_PRF_GOOD_LTP_CNT] =
-       DC_PERF_CNTR_LCB(DcGoodLTP, DC_LCB_PRF_GOOD_LTP_CNT, CNTR_SYNTH),
-[C_DC_PRF_ACCEPTED_LTP_CNT] =
-       DC_PERF_CNTR_LCB(DcAccLTP, DC_LCB_PRF_ACCEPTED_LTP_CNT,
-                        CNTR_SYNTH),
-[C_DC_PRF_RX_FLIT_CNT] =
-       DC_PERF_CNTR_LCB(DcPrfRxFlit, DC_LCB_PRF_RX_FLIT_CNT, CNTR_SYNTH),
-[C_DC_PRF_TX_FLIT_CNT] =
-       DC_PERF_CNTR_LCB(DcPrfTxFlit, DC_LCB_PRF_TX_FLIT_CNT, CNTR_SYNTH),
-[C_DC_PRF_CLK_CNTR] =
-       DC_PERF_CNTR_LCB(DcPrfClk, DC_LCB_PRF_CLK_CNTR, CNTR_SYNTH),
-[C_DC_PG_DBG_FLIT_CRDTS_CNT] =
-       DC_PERF_CNTR_LCB(DcFltCrdts, DC_LCB_PG_DBG_FLIT_CRDTS_CNT, CNTR_SYNTH),
-[C_DC_PG_STS_PAUSE_COMPLETE_CNT] =
-       DC_PERF_CNTR_LCB(DcPauseComp, DC_LCB_PG_STS_PAUSE_COMPLETE_CNT,
-                        CNTR_SYNTH),
-[C_DC_PG_STS_TX_SBE_CNT] =
-       DC_PERF_CNTR_LCB(DcStsTxSbe, DC_LCB_PG_STS_TX_SBE_CNT, CNTR_SYNTH),
-[C_DC_PG_STS_TX_MBE_CNT] =
-       DC_PERF_CNTR_LCB(DcStsTxMbe, DC_LCB_PG_STS_TX_MBE_CNT,
-                        CNTR_SYNTH),
-[C_SW_CPU_INTR] = CNTR_ELEM("Intr", 0, 0, CNTR_NORMAL,
-                           access_sw_cpu_intr),
-[C_SW_CPU_RCV_LIM] = CNTR_ELEM("RcvLimit", 0, 0, CNTR_NORMAL,
-                           access_sw_cpu_rcv_limit),
-[C_SW_VTX_WAIT] = CNTR_ELEM("vTxWait", 0, 0, CNTR_NORMAL,
-                           access_sw_vtx_wait),
-[C_SW_PIO_WAIT] = CNTR_ELEM("PioWait", 0, 0, CNTR_NORMAL,
-                           access_sw_pio_wait),
-[C_SW_PIO_DRAIN] = CNTR_ELEM("PioDrain", 0, 0, CNTR_NORMAL,
-                           access_sw_pio_drain),
-[C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL,
-                           access_sw_kmem_wait),
-[C_SW_SEND_SCHED] = CNTR_ELEM("SendSched", 0, 0, CNTR_NORMAL,
-                           access_sw_send_schedule),
-[C_SDMA_DESC_FETCHED_CNT] = CNTR_ELEM("SDEDscFdCn",
-                                     SEND_DMA_DESC_FETCHED_CNT, 0,
-                                     CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
-                                     dev_access_u32_csr),
-[C_SDMA_INT_CNT] = CNTR_ELEM("SDMAInt", 0, 0,
-                            CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
-                            access_sde_int_cnt),
-[C_SDMA_ERR_CNT] = CNTR_ELEM("SDMAErrCt", 0, 0,
-                            CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
-                            access_sde_err_cnt),
-[C_SDMA_IDLE_INT_CNT] = CNTR_ELEM("SDMAIdInt", 0, 0,
-                                 CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
-                                 access_sde_idle_int_cnt),
-[C_SDMA_PROGRESS_INT_CNT] = CNTR_ELEM("SDMAPrIntCn", 0, 0,
-                                     CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
-                                     access_sde_progress_int_cnt),
-/* MISC_ERR_STATUS */
-[C_MISC_PLL_LOCK_FAIL_ERR] = CNTR_ELEM("MISC_PLL_LOCK_FAIL_ERR", 0, 0,
-                               CNTR_NORMAL,
-                               access_misc_pll_lock_fail_err_cnt),
-[C_MISC_MBIST_FAIL_ERR] = CNTR_ELEM("MISC_MBIST_FAIL_ERR", 0, 0,
-                               CNTR_NORMAL,
-                               access_misc_mbist_fail_err_cnt),
-[C_MISC_INVALID_EEP_CMD_ERR] = CNTR_ELEM("MISC_INVALID_EEP_CMD_ERR", 0, 0,
-                               CNTR_NORMAL,
-                               access_misc_invalid_eep_cmd_err_cnt),
-[C_MISC_EFUSE_DONE_PARITY_ERR] = CNTR_ELEM("MISC_EFUSE_DONE_PARITY_ERR", 0, 0,
-                               CNTR_NORMAL,
-                               access_misc_efuse_done_parity_err_cnt),
-[C_MISC_EFUSE_WRITE_ERR] = CNTR_ELEM("MISC_EFUSE_WRITE_ERR", 0, 0,
-                               CNTR_NORMAL,
-                               access_misc_efuse_write_err_cnt),
-[C_MISC_EFUSE_READ_BAD_ADDR_ERR] = CNTR_ELEM("MISC_EFUSE_READ_BAD_ADDR_ERR", 0,
-                               0, CNTR_NORMAL,
-                               access_misc_efuse_read_bad_addr_err_cnt),
-[C_MISC_EFUSE_CSR_PARITY_ERR] = CNTR_ELEM("MISC_EFUSE_CSR_PARITY_ERR", 0, 0,
-                               CNTR_NORMAL,
-                               access_misc_efuse_csr_parity_err_cnt),
-[C_MISC_FW_AUTH_FAILED_ERR] = CNTR_ELEM("MISC_FW_AUTH_FAILED_ERR", 0, 0,
-                               CNTR_NORMAL,
-                               access_misc_fw_auth_failed_err_cnt),
-[C_MISC_KEY_MISMATCH_ERR] = CNTR_ELEM("MISC_KEY_MISMATCH_ERR", 0, 0,
-                               CNTR_NORMAL,
-                               access_misc_key_mismatch_err_cnt),
-[C_MISC_SBUS_WRITE_FAILED_ERR] = CNTR_ELEM("MISC_SBUS_WRITE_FAILED_ERR", 0, 0,
-                               CNTR_NORMAL,
-                               access_misc_sbus_write_failed_err_cnt),
-[C_MISC_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("MISC_CSR_WRITE_BAD_ADDR_ERR", 0, 0,
-                               CNTR_NORMAL,
-                               access_misc_csr_write_bad_addr_err_cnt),
-[C_MISC_CSR_READ_BAD_ADDR_ERR] = CNTR_ELEM("MISC_CSR_READ_BAD_ADDR_ERR", 0, 0,
-                               CNTR_NORMAL,
-                               access_misc_csr_read_bad_addr_err_cnt),
-[C_MISC_CSR_PARITY_ERR] = CNTR_ELEM("MISC_CSR_PARITY_ERR", 0, 0,
-                               CNTR_NORMAL,
-                               access_misc_csr_parity_err_cnt),
-/* CceErrStatus */
-[C_CCE_ERR_STATUS_AGGREGATED_CNT] = CNTR_ELEM("CceErrStatusAggregatedCnt", 0, 0,
-                               CNTR_NORMAL,
-                               access_sw_cce_err_status_aggregated_cnt),
-[C_CCE_MSIX_CSR_PARITY_ERR] = CNTR_ELEM("CceMsixCsrParityErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_cce_msix_csr_parity_err_cnt),
-[C_CCE_INT_MAP_UNC_ERR] = CNTR_ELEM("CceIntMapUncErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_cce_int_map_unc_err_cnt),
-[C_CCE_INT_MAP_COR_ERR] = CNTR_ELEM("CceIntMapCorErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_cce_int_map_cor_err_cnt),
-[C_CCE_MSIX_TABLE_UNC_ERR] = CNTR_ELEM("CceMsixTableUncErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_cce_msix_table_unc_err_cnt),
-[C_CCE_MSIX_TABLE_COR_ERR] = CNTR_ELEM("CceMsixTableCorErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_cce_msix_table_cor_err_cnt),
-[C_CCE_RXDMA_CONV_FIFO_PARITY_ERR] = CNTR_ELEM("CceRxdmaConvFifoParityErr", 0,
-                               0, CNTR_NORMAL,
-                               access_cce_rxdma_conv_fifo_parity_err_cnt),
-[C_CCE_RCPL_ASYNC_FIFO_PARITY_ERR] = CNTR_ELEM("CceRcplAsyncFifoParityErr", 0,
-                               0, CNTR_NORMAL,
-                               access_cce_rcpl_async_fifo_parity_err_cnt),
-[C_CCE_SEG_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("CceSegWriteBadAddrErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_cce_seg_write_bad_addr_err_cnt),
-[C_CCE_SEG_READ_BAD_ADDR_ERR] = CNTR_ELEM("CceSegReadBadAddrErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_cce_seg_read_bad_addr_err_cnt),
-[C_LA_TRIGGERED] = CNTR_ELEM("Cce LATriggered", 0, 0,
-                               CNTR_NORMAL,
-                               access_la_triggered_cnt),
-[C_CCE_TRGT_CPL_TIMEOUT_ERR] = CNTR_ELEM("CceTrgtCplTimeoutErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_cce_trgt_cpl_timeout_err_cnt),
-[C_PCIC_RECEIVE_PARITY_ERR] = CNTR_ELEM("PcicReceiveParityErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_receive_parity_err_cnt),
-[C_PCIC_TRANSMIT_BACK_PARITY_ERR] = CNTR_ELEM("PcicTransmitBackParityErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_transmit_back_parity_err_cnt),
-[C_PCIC_TRANSMIT_FRONT_PARITY_ERR] = CNTR_ELEM("PcicTransmitFrontParityErr", 0,
-                               0, CNTR_NORMAL,
-                               access_pcic_transmit_front_parity_err_cnt),
-[C_PCIC_CPL_DAT_Q_UNC_ERR] = CNTR_ELEM("PcicCplDatQUncErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_cpl_dat_q_unc_err_cnt),
-[C_PCIC_CPL_HD_Q_UNC_ERR] = CNTR_ELEM("PcicCplHdQUncErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_cpl_hd_q_unc_err_cnt),
-[C_PCIC_POST_DAT_Q_UNC_ERR] = CNTR_ELEM("PcicPostDatQUncErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_post_dat_q_unc_err_cnt),
-[C_PCIC_POST_HD_Q_UNC_ERR] = CNTR_ELEM("PcicPostHdQUncErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_post_hd_q_unc_err_cnt),
-[C_PCIC_RETRY_SOT_MEM_UNC_ERR] = CNTR_ELEM("PcicRetrySotMemUncErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_retry_sot_mem_unc_err_cnt),
-[C_PCIC_RETRY_MEM_UNC_ERR] = CNTR_ELEM("PcicRetryMemUncErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_retry_mem_unc_err),
-[C_PCIC_N_POST_DAT_Q_PARITY_ERR] = CNTR_ELEM("PcicNPostDatQParityErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_n_post_dat_q_parity_err_cnt),
-[C_PCIC_N_POST_H_Q_PARITY_ERR] = CNTR_ELEM("PcicNPostHQParityErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_n_post_h_q_parity_err_cnt),
-[C_PCIC_CPL_DAT_Q_COR_ERR] = CNTR_ELEM("PcicCplDatQCorErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_cpl_dat_q_cor_err_cnt),
-[C_PCIC_CPL_HD_Q_COR_ERR] = CNTR_ELEM("PcicCplHdQCorErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_cpl_hd_q_cor_err_cnt),
-[C_PCIC_POST_DAT_Q_COR_ERR] = CNTR_ELEM("PcicPostDatQCorErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_post_dat_q_cor_err_cnt),
-[C_PCIC_POST_HD_Q_COR_ERR] = CNTR_ELEM("PcicPostHdQCorErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_post_hd_q_cor_err_cnt),
-[C_PCIC_RETRY_SOT_MEM_COR_ERR] = CNTR_ELEM("PcicRetrySotMemCorErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_retry_sot_mem_cor_err_cnt),
-[C_PCIC_RETRY_MEM_COR_ERR] = CNTR_ELEM("PcicRetryMemCorErr", 0, 0,
-                               CNTR_NORMAL,
-                               access_pcic_retry_mem_cor_err_cnt),
-[C_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERR] = CNTR_ELEM(
-                               "CceCli1AsyncFifoDbgParityError", 0, 0,
-                               CNTR_NORMAL,
-                               access_cce_cli1_async_fifo_dbg_parity_err_cnt),
-[C_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERR] = CNTR_ELEM(
-                               "CceCli1AsyncFifoRxdmaParityError", 0, 0,
-                               CNTR_NORMAL,
-                               access_cce_cli1_async_fifo_rxdma_parity_err_cnt
-                               ),
-[C_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR] = CNTR_ELEM(
-                       "CceCli1AsyncFifoSdmaHdParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_cce_cli1_async_fifo_sdma_hd_parity_err_cnt),
-[C_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR] = CNTR_ELEM(
-                       "CceCli1AsyncFifoPioCrdtParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_cce_cl1_async_fifo_pio_crdt_parity_err_cnt),
-[C_CCE_CLI2_ASYNC_FIFO_PARITY_ERR] = CNTR_ELEM("CceCli2AsyncFifoParityErr", 0,
-                       0, CNTR_NORMAL,
-                       access_cce_cli2_async_fifo_parity_err_cnt),
-[C_CCE_CSR_CFG_BUS_PARITY_ERR] = CNTR_ELEM("CceCsrCfgBusParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_cce_csr_cfg_bus_parity_err_cnt),
-[C_CCE_CLI0_ASYNC_FIFO_PARTIY_ERR] = CNTR_ELEM("CceCli0AsyncFifoParityErr", 0,
-                       0, CNTR_NORMAL,
-                       access_cce_cli0_async_fifo_parity_err_cnt),
-[C_CCE_RSPD_DATA_PARITY_ERR] = CNTR_ELEM("CceRspdDataParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_cce_rspd_data_parity_err_cnt),
-[C_CCE_TRGT_ACCESS_ERR] = CNTR_ELEM("CceTrgtAccessErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_cce_trgt_access_err_cnt),
-[C_CCE_TRGT_ASYNC_FIFO_PARITY_ERR] = CNTR_ELEM("CceTrgtAsyncFifoParityErr", 0,
-                       0, CNTR_NORMAL,
-                       access_cce_trgt_async_fifo_parity_err_cnt),
-[C_CCE_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("CceCsrWriteBadAddrErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_cce_csr_write_bad_addr_err_cnt),
-[C_CCE_CSR_READ_BAD_ADDR_ERR] = CNTR_ELEM("CceCsrReadBadAddrErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_cce_csr_read_bad_addr_err_cnt),
-[C_CCE_CSR_PARITY_ERR] = CNTR_ELEM("CceCsrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_ccs_csr_parity_err_cnt),
-
-/* RcvErrStatus */
-[C_RX_CSR_PARITY_ERR] = CNTR_ELEM("RxCsrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_csr_parity_err_cnt),
-[C_RX_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("RxCsrWriteBadAddrErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_csr_write_bad_addr_err_cnt),
-[C_RX_CSR_READ_BAD_ADDR_ERR] = CNTR_ELEM("RxCsrReadBadAddrErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_csr_read_bad_addr_err_cnt),
-[C_RX_DMA_CSR_UNC_ERR] = CNTR_ELEM("RxDmaCsrUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_dma_csr_unc_err_cnt),
-[C_RX_DMA_DQ_FSM_ENCODING_ERR] = CNTR_ELEM("RxDmaDqFsmEncodingErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_dma_dq_fsm_encoding_err_cnt),
-[C_RX_DMA_EQ_FSM_ENCODING_ERR] = CNTR_ELEM("RxDmaEqFsmEncodingErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_dma_eq_fsm_encoding_err_cnt),
-[C_RX_DMA_CSR_PARITY_ERR] = CNTR_ELEM("RxDmaCsrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_dma_csr_parity_err_cnt),
-[C_RX_RBUF_DATA_COR_ERR] = CNTR_ELEM("RxRbufDataCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_data_cor_err_cnt),
-[C_RX_RBUF_DATA_UNC_ERR] = CNTR_ELEM("RxRbufDataUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_data_unc_err_cnt),
-[C_RX_DMA_DATA_FIFO_RD_COR_ERR] = CNTR_ELEM("RxDmaDataFifoRdCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_dma_data_fifo_rd_cor_err_cnt),
-[C_RX_DMA_DATA_FIFO_RD_UNC_ERR] = CNTR_ELEM("RxDmaDataFifoRdUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_dma_data_fifo_rd_unc_err_cnt),
-[C_RX_DMA_HDR_FIFO_RD_COR_ERR] = CNTR_ELEM("RxDmaHdrFifoRdCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_dma_hdr_fifo_rd_cor_err_cnt),
-[C_RX_DMA_HDR_FIFO_RD_UNC_ERR] = CNTR_ELEM("RxDmaHdrFifoRdUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_dma_hdr_fifo_rd_unc_err_cnt),
-[C_RX_RBUF_DESC_PART2_COR_ERR] = CNTR_ELEM("RxRbufDescPart2CorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_desc_part2_cor_err_cnt),
-[C_RX_RBUF_DESC_PART2_UNC_ERR] = CNTR_ELEM("RxRbufDescPart2UncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_desc_part2_unc_err_cnt),
-[C_RX_RBUF_DESC_PART1_COR_ERR] = CNTR_ELEM("RxRbufDescPart1CorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_desc_part1_cor_err_cnt),
-[C_RX_RBUF_DESC_PART1_UNC_ERR] = CNTR_ELEM("RxRbufDescPart1UncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_desc_part1_unc_err_cnt),
-[C_RX_HQ_INTR_FSM_ERR] = CNTR_ELEM("RxHqIntrFsmErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_hq_intr_fsm_err_cnt),
-[C_RX_HQ_INTR_CSR_PARITY_ERR] = CNTR_ELEM("RxHqIntrCsrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_hq_intr_csr_parity_err_cnt),
-[C_RX_LOOKUP_CSR_PARITY_ERR] = CNTR_ELEM("RxLookupCsrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_lookup_csr_parity_err_cnt),
-[C_RX_LOOKUP_RCV_ARRAY_COR_ERR] = CNTR_ELEM("RxLookupRcvArrayCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_lookup_rcv_array_cor_err_cnt),
-[C_RX_LOOKUP_RCV_ARRAY_UNC_ERR] = CNTR_ELEM("RxLookupRcvArrayUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_lookup_rcv_array_unc_err_cnt),
-[C_RX_LOOKUP_DES_PART2_PARITY_ERR] = CNTR_ELEM("RxLookupDesPart2ParityErr", 0,
-                       0, CNTR_NORMAL,
-                       access_rx_lookup_des_part2_parity_err_cnt),
-[C_RX_LOOKUP_DES_PART1_UNC_COR_ERR] = CNTR_ELEM("RxLookupDesPart1UncCorErr", 0,
-                       0, CNTR_NORMAL,
-                       access_rx_lookup_des_part1_unc_cor_err_cnt),
-[C_RX_LOOKUP_DES_PART1_UNC_ERR] = CNTR_ELEM("RxLookupDesPart1UncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_lookup_des_part1_unc_err_cnt),
-[C_RX_RBUF_NEXT_FREE_BUF_COR_ERR] = CNTR_ELEM("RxRbufNextFreeBufCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_next_free_buf_cor_err_cnt),
-[C_RX_RBUF_NEXT_FREE_BUF_UNC_ERR] = CNTR_ELEM("RxRbufNextFreeBufUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_next_free_buf_unc_err_cnt),
-[C_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR] = CNTR_ELEM(
-                       "RxRbufFlInitWrAddrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rbuf_fl_init_wr_addr_parity_err_cnt),
-[C_RX_RBUF_FL_INITDONE_PARITY_ERR] = CNTR_ELEM("RxRbufFlInitdoneParityErr", 0,
-                       0, CNTR_NORMAL,
-                       access_rx_rbuf_fl_initdone_parity_err_cnt),
-[C_RX_RBUF_FL_WRITE_ADDR_PARITY_ERR] = CNTR_ELEM("RxRbufFlWrAddrParityErr", 0,
-                       0, CNTR_NORMAL,
-                       access_rx_rbuf_fl_write_addr_parity_err_cnt),
-[C_RX_RBUF_FL_RD_ADDR_PARITY_ERR] = CNTR_ELEM("RxRbufFlRdAddrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_fl_rd_addr_parity_err_cnt),
-[C_RX_RBUF_EMPTY_ERR] = CNTR_ELEM("RxRbufEmptyErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_empty_err_cnt),
-[C_RX_RBUF_FULL_ERR] = CNTR_ELEM("RxRbufFullErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_full_err_cnt),
-[C_RX_RBUF_BAD_LOOKUP_ERR] = CNTR_ELEM("RxRBufBadLookupErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rbuf_bad_lookup_err_cnt),
-[C_RX_RBUF_CTX_ID_PARITY_ERR] = CNTR_ELEM("RxRbufCtxIdParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rbuf_ctx_id_parity_err_cnt),
-[C_RX_RBUF_CSR_QEOPDW_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQEOPDWParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rbuf_csr_qeopdw_parity_err_cnt),
-[C_RX_RBUF_CSR_Q_NUM_OF_PKT_PARITY_ERR] = CNTR_ELEM(
-                       "RxRbufCsrQNumOfPktParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_csr_q_num_of_pkt_parity_err_cnt),
-[C_RX_RBUF_CSR_Q_T1_PTR_PARITY_ERR] = CNTR_ELEM(
-                       "RxRbufCsrQTlPtrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_csr_q_t1_ptr_parity_err_cnt),
-[C_RX_RBUF_CSR_Q_HD_PTR_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQHdPtrParityErr", 0,
-                       0, CNTR_NORMAL,
-                       access_rx_rbuf_csr_q_hd_ptr_parity_err_cnt),
-[C_RX_RBUF_CSR_Q_VLD_BIT_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQVldBitParityErr", 0,
-                       0, CNTR_NORMAL,
-                       access_rx_rbuf_csr_q_vld_bit_parity_err_cnt),
-[C_RX_RBUF_CSR_Q_NEXT_BUF_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQNextBufParityErr",
-                       0, 0, CNTR_NORMAL,
-                       access_rx_rbuf_csr_q_next_buf_parity_err_cnt),
-[C_RX_RBUF_CSR_Q_ENT_CNT_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQEntCntParityErr", 0,
-                       0, CNTR_NORMAL,
-                       access_rx_rbuf_csr_q_ent_cnt_parity_err_cnt),
-[C_RX_RBUF_CSR_Q_HEAD_BUF_NUM_PARITY_ERR] = CNTR_ELEM(
-                       "RxRbufCsrQHeadBufNumParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_csr_q_head_buf_num_parity_err_cnt),
-[C_RX_RBUF_BLOCK_LIST_READ_COR_ERR] = CNTR_ELEM("RxRbufBlockListReadCorErr", 0,
-                       0, CNTR_NORMAL,
-                       access_rx_rbuf_block_list_read_cor_err_cnt),
-[C_RX_RBUF_BLOCK_LIST_READ_UNC_ERR] = CNTR_ELEM("RxRbufBlockListReadUncErr", 0,
-                       0, CNTR_NORMAL,
-                       access_rx_rbuf_block_list_read_unc_err_cnt),
-[C_RX_RBUF_LOOKUP_DES_COR_ERR] = CNTR_ELEM("RxRbufLookupDesCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_lookup_des_cor_err_cnt),
-[C_RX_RBUF_LOOKUP_DES_UNC_ERR] = CNTR_ELEM("RxRbufLookupDesUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_lookup_des_unc_err_cnt),
-[C_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR] = CNTR_ELEM(
-                       "RxRbufLookupDesRegUncCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_lookup_des_reg_unc_cor_err_cnt),
-[C_RX_RBUF_LOOKUP_DES_REG_UNC_ERR] = CNTR_ELEM("RxRbufLookupDesRegUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_lookup_des_reg_unc_err_cnt),
-[C_RX_RBUF_FREE_LIST_COR_ERR] = CNTR_ELEM("RxRbufFreeListCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_free_list_cor_err_cnt),
-[C_RX_RBUF_FREE_LIST_UNC_ERR] = CNTR_ELEM("RxRbufFreeListUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rbuf_free_list_unc_err_cnt),
-[C_RX_RCV_FSM_ENCODING_ERR] = CNTR_ELEM("RxRcvFsmEncodingErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rcv_fsm_encoding_err_cnt),
-[C_RX_DMA_FLAG_COR_ERR] = CNTR_ELEM("RxDmaFlagCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_dma_flag_cor_err_cnt),
-[C_RX_DMA_FLAG_UNC_ERR] = CNTR_ELEM("RxDmaFlagUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_dma_flag_unc_err_cnt),
-[C_RX_DC_SOP_EOP_PARITY_ERR] = CNTR_ELEM("RxDcSopEopParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_dc_sop_eop_parity_err_cnt),
-[C_RX_RCV_CSR_PARITY_ERR] = CNTR_ELEM("RxRcvCsrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rcv_csr_parity_err_cnt),
-[C_RX_RCV_QP_MAP_TABLE_COR_ERR] = CNTR_ELEM("RxRcvQpMapTableCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rcv_qp_map_table_cor_err_cnt),
-[C_RX_RCV_QP_MAP_TABLE_UNC_ERR] = CNTR_ELEM("RxRcvQpMapTableUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rcv_qp_map_table_unc_err_cnt),
-[C_RX_RCV_DATA_COR_ERR] = CNTR_ELEM("RxRcvDataCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rcv_data_cor_err_cnt),
-[C_RX_RCV_DATA_UNC_ERR] = CNTR_ELEM("RxRcvDataUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rcv_data_unc_err_cnt),
-[C_RX_RCV_HDR_COR_ERR] = CNTR_ELEM("RxRcvHdrCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rcv_hdr_cor_err_cnt),
-[C_RX_RCV_HDR_UNC_ERR] = CNTR_ELEM("RxRcvHdrUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_rcv_hdr_unc_err_cnt),
-[C_RX_DC_INTF_PARITY_ERR] = CNTR_ELEM("RxDcIntfParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_dc_intf_parity_err_cnt),
-[C_RX_DMA_CSR_COR_ERR] = CNTR_ELEM("RxDmaCsrCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_rx_dma_csr_cor_err_cnt),
-/* SendPioErrStatus */
-[C_PIO_PEC_SOP_HEAD_PARITY_ERR] = CNTR_ELEM("PioPecSopHeadParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_pec_sop_head_parity_err_cnt),
-[C_PIO_PCC_SOP_HEAD_PARITY_ERR] = CNTR_ELEM("PioPccSopHeadParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_pcc_sop_head_parity_err_cnt),
-[C_PIO_LAST_RETURNED_CNT_PARITY_ERR] = CNTR_ELEM("PioLastReturnedCntParityErr",
-                       0, 0, CNTR_NORMAL,
-                       access_pio_last_returned_cnt_parity_err_cnt),
-[C_PIO_CURRENT_FREE_CNT_PARITY_ERR] = CNTR_ELEM("PioCurrentFreeCntParityErr", 0,
-                       0, CNTR_NORMAL,
-                       access_pio_current_free_cnt_parity_err_cnt),
-[C_PIO_RSVD_31_ERR] = CNTR_ELEM("Pio Reserved 31", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_reserved_31_err_cnt),
-[C_PIO_RSVD_30_ERR] = CNTR_ELEM("Pio Reserved 30", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_reserved_30_err_cnt),
-[C_PIO_PPMC_SOP_LEN_ERR] = CNTR_ELEM("PioPpmcSopLenErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_ppmc_sop_len_err_cnt),
-[C_PIO_PPMC_BQC_MEM_PARITY_ERR] = CNTR_ELEM("PioPpmcBqcMemParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_ppmc_bqc_mem_parity_err_cnt),
-[C_PIO_VL_FIFO_PARITY_ERR] = CNTR_ELEM("PioVlFifoParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_vl_fifo_parity_err_cnt),
-[C_PIO_VLF_SOP_PARITY_ERR] = CNTR_ELEM("PioVlfSopParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_vlf_sop_parity_err_cnt),
-[C_PIO_VLF_V1_LEN_PARITY_ERR] = CNTR_ELEM("PioVlfVlLenParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_vlf_v1_len_parity_err_cnt),
-[C_PIO_BLOCK_QW_COUNT_PARITY_ERR] = CNTR_ELEM("PioBlockQwCountParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_block_qw_count_parity_err_cnt),
-[C_PIO_WRITE_QW_VALID_PARITY_ERR] = CNTR_ELEM("PioWriteQwValidParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_write_qw_valid_parity_err_cnt),
-[C_PIO_STATE_MACHINE_ERR] = CNTR_ELEM("PioStateMachineErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_state_machine_err_cnt),
-[C_PIO_WRITE_DATA_PARITY_ERR] = CNTR_ELEM("PioWriteDataParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_write_data_parity_err_cnt),
-[C_PIO_HOST_ADDR_MEM_COR_ERR] = CNTR_ELEM("PioHostAddrMemCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_host_addr_mem_cor_err_cnt),
-[C_PIO_HOST_ADDR_MEM_UNC_ERR] = CNTR_ELEM("PioHostAddrMemUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_host_addr_mem_unc_err_cnt),
-[C_PIO_PKT_EVICT_SM_OR_ARM_SM_ERR] = CNTR_ELEM("PioPktEvictSmOrArbSmErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_pkt_evict_sm_or_arb_sm_err_cnt),
-[C_PIO_INIT_SM_IN_ERR] = CNTR_ELEM("PioInitSmInErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_init_sm_in_err_cnt),
-[C_PIO_PPMC_PBL_FIFO_ERR] = CNTR_ELEM("PioPpmcPblFifoErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_ppmc_pbl_fifo_err_cnt),
-[C_PIO_CREDIT_RET_FIFO_PARITY_ERR] = CNTR_ELEM("PioCreditRetFifoParityErr", 0,
-                       0, CNTR_NORMAL,
-                       access_pio_credit_ret_fifo_parity_err_cnt),
-[C_PIO_V1_LEN_MEM_BANK1_COR_ERR] = CNTR_ELEM("PioVlLenMemBank1CorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_v1_len_mem_bank1_cor_err_cnt),
-[C_PIO_V1_LEN_MEM_BANK0_COR_ERR] = CNTR_ELEM("PioVlLenMemBank0CorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_v1_len_mem_bank0_cor_err_cnt),
-[C_PIO_V1_LEN_MEM_BANK1_UNC_ERR] = CNTR_ELEM("PioVlLenMemBank1UncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_v1_len_mem_bank1_unc_err_cnt),
-[C_PIO_V1_LEN_MEM_BANK0_UNC_ERR] = CNTR_ELEM("PioVlLenMemBank0UncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_v1_len_mem_bank0_unc_err_cnt),
-[C_PIO_SM_PKT_RESET_PARITY_ERR] = CNTR_ELEM("PioSmPktResetParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_sm_pkt_reset_parity_err_cnt),
-[C_PIO_PKT_EVICT_FIFO_PARITY_ERR] = CNTR_ELEM("PioPktEvictFifoParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_pkt_evict_fifo_parity_err_cnt),
-[C_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR] = CNTR_ELEM(
-                       "PioSbrdctrlCrrelFifoParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_sbrdctrl_crrel_fifo_parity_err_cnt),
-[C_PIO_SBRDCTL_CRREL_PARITY_ERR] = CNTR_ELEM("PioSbrdctlCrrelParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_sbrdctl_crrel_parity_err_cnt),
-[C_PIO_PEC_FIFO_PARITY_ERR] = CNTR_ELEM("PioPecFifoParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_pec_fifo_parity_err_cnt),
-[C_PIO_PCC_FIFO_PARITY_ERR] = CNTR_ELEM("PioPccFifoParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_pcc_fifo_parity_err_cnt),
-[C_PIO_SB_MEM_FIFO1_ERR] = CNTR_ELEM("PioSbMemFifo1Err", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_sb_mem_fifo1_err_cnt),
-[C_PIO_SB_MEM_FIFO0_ERR] = CNTR_ELEM("PioSbMemFifo0Err", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_sb_mem_fifo0_err_cnt),
-[C_PIO_CSR_PARITY_ERR] = CNTR_ELEM("PioCsrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_csr_parity_err_cnt),
-[C_PIO_WRITE_ADDR_PARITY_ERR] = CNTR_ELEM("PioWriteAddrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_write_addr_parity_err_cnt),
-[C_PIO_WRITE_BAD_CTXT_ERR] = CNTR_ELEM("PioWriteBadCtxtErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_write_bad_ctxt_err_cnt),
-/* SendDmaErrStatus */
-[C_SDMA_PCIE_REQ_TRACKING_COR_ERR] = CNTR_ELEM("SDmaPcieReqTrackingCorErr", 0,
-                       0, CNTR_NORMAL,
-                       access_sdma_pcie_req_tracking_cor_err_cnt),
-[C_SDMA_PCIE_REQ_TRACKING_UNC_ERR] = CNTR_ELEM("SDmaPcieReqTrackingUncErr", 0,
-                       0, CNTR_NORMAL,
-                       access_sdma_pcie_req_tracking_unc_err_cnt),
-[C_SDMA_CSR_PARITY_ERR] = CNTR_ELEM("SDmaCsrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_csr_parity_err_cnt),
-[C_SDMA_RPY_TAG_ERR] = CNTR_ELEM("SDmaRpyTagErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_rpy_tag_err_cnt),
-/* SendEgressErrStatus */
-[C_TX_READ_PIO_MEMORY_CSR_UNC_ERR] = CNTR_ELEM("TxReadPioMemoryCsrUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_read_pio_memory_csr_unc_err_cnt),
-[C_TX_READ_SDMA_MEMORY_CSR_UNC_ERR] = CNTR_ELEM("TxReadSdmaMemoryCsrUncErr", 0,
-                       0, CNTR_NORMAL,
-                       access_tx_read_sdma_memory_csr_err_cnt),
-[C_TX_EGRESS_FIFO_COR_ERR] = CNTR_ELEM("TxEgressFifoCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_egress_fifo_cor_err_cnt),
-[C_TX_READ_PIO_MEMORY_COR_ERR] = CNTR_ELEM("TxReadPioMemoryCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_read_pio_memory_cor_err_cnt),
-[C_TX_READ_SDMA_MEMORY_COR_ERR] = CNTR_ELEM("TxReadSdmaMemoryCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_read_sdma_memory_cor_err_cnt),
-[C_TX_SB_HDR_COR_ERR] = CNTR_ELEM("TxSbHdrCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_sb_hdr_cor_err_cnt),
-[C_TX_CREDIT_OVERRUN_ERR] = CNTR_ELEM("TxCreditOverrunErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_credit_overrun_err_cnt),
-[C_TX_LAUNCH_FIFO8_COR_ERR] = CNTR_ELEM("TxLaunchFifo8CorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_launch_fifo8_cor_err_cnt),
-[C_TX_LAUNCH_FIFO7_COR_ERR] = CNTR_ELEM("TxLaunchFifo7CorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_launch_fifo7_cor_err_cnt),
-[C_TX_LAUNCH_FIFO6_COR_ERR] = CNTR_ELEM("TxLaunchFifo6CorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_launch_fifo6_cor_err_cnt),
-[C_TX_LAUNCH_FIFO5_COR_ERR] = CNTR_ELEM("TxLaunchFifo5CorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_launch_fifo5_cor_err_cnt),
-[C_TX_LAUNCH_FIFO4_COR_ERR] = CNTR_ELEM("TxLaunchFifo4CorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_launch_fifo4_cor_err_cnt),
-[C_TX_LAUNCH_FIFO3_COR_ERR] = CNTR_ELEM("TxLaunchFifo3CorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_launch_fifo3_cor_err_cnt),
-[C_TX_LAUNCH_FIFO2_COR_ERR] = CNTR_ELEM("TxLaunchFifo2CorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_launch_fifo2_cor_err_cnt),
-[C_TX_LAUNCH_FIFO1_COR_ERR] = CNTR_ELEM("TxLaunchFifo1CorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_launch_fifo1_cor_err_cnt),
-[C_TX_LAUNCH_FIFO0_COR_ERR] = CNTR_ELEM("TxLaunchFifo0CorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_launch_fifo0_cor_err_cnt),
-[C_TX_CREDIT_RETURN_VL_ERR] = CNTR_ELEM("TxCreditReturnVLErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_credit_return_vl_err_cnt),
-[C_TX_HCRC_INSERTION_ERR] = CNTR_ELEM("TxHcrcInsertionErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_hcrc_insertion_err_cnt),
-[C_TX_EGRESS_FIFI_UNC_ERR] = CNTR_ELEM("TxEgressFifoUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_egress_fifo_unc_err_cnt),
-[C_TX_READ_PIO_MEMORY_UNC_ERR] = CNTR_ELEM("TxReadPioMemoryUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_read_pio_memory_unc_err_cnt),
-[C_TX_READ_SDMA_MEMORY_UNC_ERR] = CNTR_ELEM("TxReadSdmaMemoryUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_read_sdma_memory_unc_err_cnt),
-[C_TX_SB_HDR_UNC_ERR] = CNTR_ELEM("TxSbHdrUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_sb_hdr_unc_err_cnt),
-[C_TX_CREDIT_RETURN_PARITY_ERR] = CNTR_ELEM("TxCreditReturnParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_credit_return_partiy_err_cnt),
-[C_TX_LAUNCH_FIFO8_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo8UncOrParityErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_launch_fifo8_unc_or_parity_err_cnt),
-[C_TX_LAUNCH_FIFO7_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo7UncOrParityErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_launch_fifo7_unc_or_parity_err_cnt),
-[C_TX_LAUNCH_FIFO6_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo6UncOrParityErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_launch_fifo6_unc_or_parity_err_cnt),
-[C_TX_LAUNCH_FIFO5_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo5UncOrParityErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_launch_fifo5_unc_or_parity_err_cnt),
-[C_TX_LAUNCH_FIFO4_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo4UncOrParityErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_launch_fifo4_unc_or_parity_err_cnt),
-[C_TX_LAUNCH_FIFO3_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo3UncOrParityErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_launch_fifo3_unc_or_parity_err_cnt),
-[C_TX_LAUNCH_FIFO2_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo2UncOrParityErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_launch_fifo2_unc_or_parity_err_cnt),
-[C_TX_LAUNCH_FIFO1_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo1UncOrParityErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_launch_fifo1_unc_or_parity_err_cnt),
-[C_TX_LAUNCH_FIFO0_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo0UncOrParityErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_launch_fifo0_unc_or_parity_err_cnt),
-[C_TX_SDMA15_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma15DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma15_disallowed_packet_err_cnt),
-[C_TX_SDMA14_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma14DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma14_disallowed_packet_err_cnt),
-[C_TX_SDMA13_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma13DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma13_disallowed_packet_err_cnt),
-[C_TX_SDMA12_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma12DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma12_disallowed_packet_err_cnt),
-[C_TX_SDMA11_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma11DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma11_disallowed_packet_err_cnt),
-[C_TX_SDMA10_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma10DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma10_disallowed_packet_err_cnt),
-[C_TX_SDMA9_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma9DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma9_disallowed_packet_err_cnt),
-[C_TX_SDMA8_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma8DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma8_disallowed_packet_err_cnt),
-[C_TX_SDMA7_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma7DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma7_disallowed_packet_err_cnt),
-[C_TX_SDMA6_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma6DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma6_disallowed_packet_err_cnt),
-[C_TX_SDMA5_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma5DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma5_disallowed_packet_err_cnt),
-[C_TX_SDMA4_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma4DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma4_disallowed_packet_err_cnt),
-[C_TX_SDMA3_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma3DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma3_disallowed_packet_err_cnt),
-[C_TX_SDMA2_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma2DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma2_disallowed_packet_err_cnt),
-[C_TX_SDMA1_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma1DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma1_disallowed_packet_err_cnt),
-[C_TX_SDMA0_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma0DisallowedPacketErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma0_disallowed_packet_err_cnt),
-[C_TX_CONFIG_PARITY_ERR] = CNTR_ELEM("TxConfigParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_config_parity_err_cnt),
-[C_TX_SBRD_CTL_CSR_PARITY_ERR] = CNTR_ELEM("TxSbrdCtlCsrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_sbrd_ctl_csr_parity_err_cnt),
-[C_TX_LAUNCH_CSR_PARITY_ERR] = CNTR_ELEM("TxLaunchCsrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_launch_csr_parity_err_cnt),
-[C_TX_ILLEGAL_CL_ERR] = CNTR_ELEM("TxIllegalVLErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_illegal_vl_err_cnt),
-[C_TX_SBRD_CTL_STATE_MACHINE_PARITY_ERR] = CNTR_ELEM(
-                       "TxSbrdCtlStateMachineParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_sbrd_ctl_state_machine_parity_err_cnt),
-[C_TX_RESERVED_10] = CNTR_ELEM("Tx Egress Reserved 10", 0, 0,
-                       CNTR_NORMAL,
-                       access_egress_reserved_10_err_cnt),
-[C_TX_RESERVED_9] = CNTR_ELEM("Tx Egress Reserved 9", 0, 0,
-                       CNTR_NORMAL,
-                       access_egress_reserved_9_err_cnt),
-[C_TX_SDMA_LAUNCH_INTF_PARITY_ERR] = CNTR_ELEM("TxSdmaLaunchIntfParityErr",
-                       0, 0, CNTR_NORMAL,
-                       access_tx_sdma_launch_intf_parity_err_cnt),
-[C_TX_PIO_LAUNCH_INTF_PARITY_ERR] = CNTR_ELEM("TxPioLaunchIntfParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_pio_launch_intf_parity_err_cnt),
-[C_TX_RESERVED_6] = CNTR_ELEM("Tx Egress Reserved 6", 0, 0,
-                       CNTR_NORMAL,
-                       access_egress_reserved_6_err_cnt),
-[C_TX_INCORRECT_LINK_STATE_ERR] = CNTR_ELEM("TxIncorrectLinkStateErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_incorrect_link_state_err_cnt),
-[C_TX_LINK_DOWN_ERR] = CNTR_ELEM("TxLinkdownErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_linkdown_err_cnt),
-[C_TX_EGRESS_FIFO_UNDERRUN_OR_PARITY_ERR] = CNTR_ELEM(
-                       "EgressFifoUnderrunOrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_egress_fifi_underrun_or_parity_err_cnt),
-[C_TX_RESERVED_2] = CNTR_ELEM("Tx Egress Reserved 2", 0, 0,
-                       CNTR_NORMAL,
-                       access_egress_reserved_2_err_cnt),
-[C_TX_PKT_INTEGRITY_MEM_UNC_ERR] = CNTR_ELEM("TxPktIntegrityMemUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_pkt_integrity_mem_unc_err_cnt),
-[C_TX_PKT_INTEGRITY_MEM_COR_ERR] = CNTR_ELEM("TxPktIntegrityMemCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_tx_pkt_integrity_mem_cor_err_cnt),
-/* SendErrStatus */
-[C_SEND_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("SendCsrWriteBadAddrErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_send_csr_write_bad_addr_err_cnt),
-[C_SEND_CSR_READ_BAD_ADD_ERR] = CNTR_ELEM("SendCsrReadBadAddrErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_send_csr_read_bad_addr_err_cnt),
-[C_SEND_CSR_PARITY_ERR] = CNTR_ELEM("SendCsrParityErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_send_csr_parity_cnt),
-/* SendCtxtErrStatus */
-[C_PIO_WRITE_OUT_OF_BOUNDS_ERR] = CNTR_ELEM("PioWriteOutOfBoundsErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_write_out_of_bounds_err_cnt),
-[C_PIO_WRITE_OVERFLOW_ERR] = CNTR_ELEM("PioWriteOverflowErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_write_overflow_err_cnt),
-[C_PIO_WRITE_CROSSES_BOUNDARY_ERR] = CNTR_ELEM("PioWriteCrossesBoundaryErr",
-                       0, 0, CNTR_NORMAL,
-                       access_pio_write_crosses_boundary_err_cnt),
-[C_PIO_DISALLOWED_PACKET_ERR] = CNTR_ELEM("PioDisallowedPacketErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_disallowed_packet_err_cnt),
-[C_PIO_INCONSISTENT_SOP_ERR] = CNTR_ELEM("PioInconsistentSopErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_pio_inconsistent_sop_err_cnt),
-/* SendDmaEngErrStatus */
-[C_SDMA_HEADER_REQUEST_FIFO_COR_ERR] = CNTR_ELEM("SDmaHeaderRequestFifoCorErr",
-                       0, 0, CNTR_NORMAL,
-                       access_sdma_header_request_fifo_cor_err_cnt),
-[C_SDMA_HEADER_STORAGE_COR_ERR] = CNTR_ELEM("SDmaHeaderStorageCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_header_storage_cor_err_cnt),
-[C_SDMA_PACKET_TRACKING_COR_ERR] = CNTR_ELEM("SDmaPacketTrackingCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_packet_tracking_cor_err_cnt),
-[C_SDMA_ASSEMBLY_COR_ERR] = CNTR_ELEM("SDmaAssemblyCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_assembly_cor_err_cnt),
-[C_SDMA_DESC_TABLE_COR_ERR] = CNTR_ELEM("SDmaDescTableCorErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_desc_table_cor_err_cnt),
-[C_SDMA_HEADER_REQUEST_FIFO_UNC_ERR] = CNTR_ELEM("SDmaHeaderRequestFifoUncErr",
-                       0, 0, CNTR_NORMAL,
-                       access_sdma_header_request_fifo_unc_err_cnt),
-[C_SDMA_HEADER_STORAGE_UNC_ERR] = CNTR_ELEM("SDmaHeaderStorageUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_header_storage_unc_err_cnt),
-[C_SDMA_PACKET_TRACKING_UNC_ERR] = CNTR_ELEM("SDmaPacketTrackingUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_packet_tracking_unc_err_cnt),
-[C_SDMA_ASSEMBLY_UNC_ERR] = CNTR_ELEM("SDmaAssemblyUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_assembly_unc_err_cnt),
-[C_SDMA_DESC_TABLE_UNC_ERR] = CNTR_ELEM("SDmaDescTableUncErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_desc_table_unc_err_cnt),
-[C_SDMA_TIMEOUT_ERR] = CNTR_ELEM("SDmaTimeoutErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_timeout_err_cnt),
-[C_SDMA_HEADER_LENGTH_ERR] = CNTR_ELEM("SDmaHeaderLengthErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_header_length_err_cnt),
-[C_SDMA_HEADER_ADDRESS_ERR] = CNTR_ELEM("SDmaHeaderAddressErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_header_address_err_cnt),
-[C_SDMA_HEADER_SELECT_ERR] = CNTR_ELEM("SDmaHeaderSelectErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_header_select_err_cnt),
-[C_SMDA_RESERVED_9] = CNTR_ELEM("SDma Reserved 9", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_reserved_9_err_cnt),
-[C_SDMA_PACKET_DESC_OVERFLOW_ERR] = CNTR_ELEM("SDmaPacketDescOverflowErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_packet_desc_overflow_err_cnt),
-[C_SDMA_LENGTH_MISMATCH_ERR] = CNTR_ELEM("SDmaLengthMismatchErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_length_mismatch_err_cnt),
-[C_SDMA_HALT_ERR] = CNTR_ELEM("SDmaHaltErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_halt_err_cnt),
-[C_SDMA_MEM_READ_ERR] = CNTR_ELEM("SDmaMemReadErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_mem_read_err_cnt),
-[C_SDMA_FIRST_DESC_ERR] = CNTR_ELEM("SDmaFirstDescErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_first_desc_err_cnt),
-[C_SDMA_TAIL_OUT_OF_BOUNDS_ERR] = CNTR_ELEM("SDmaTailOutOfBoundsErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_tail_out_of_bounds_err_cnt),
-[C_SDMA_TOO_LONG_ERR] = CNTR_ELEM("SDmaTooLongErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_too_long_err_cnt),
-[C_SDMA_GEN_MISMATCH_ERR] = CNTR_ELEM("SDmaGenMismatchErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_gen_mismatch_err_cnt),
-[C_SDMA_WRONG_DW_ERR] = CNTR_ELEM("SDmaWrongDwErr", 0, 0,
-                       CNTR_NORMAL,
-                       access_sdma_wrong_dw_err_cnt),
-};
-
-static struct cntr_entry port_cntrs[PORT_CNTR_LAST] = {
-[C_TX_UNSUP_VL] = TXE32_PORT_CNTR_ELEM(TxUnVLErr, SEND_UNSUP_VL_ERR_CNT,
-                       CNTR_NORMAL),
-[C_TX_INVAL_LEN] = TXE32_PORT_CNTR_ELEM(TxInvalLen, SEND_LEN_ERR_CNT,
-                       CNTR_NORMAL),
-[C_TX_MM_LEN_ERR] = TXE32_PORT_CNTR_ELEM(TxMMLenErr, SEND_MAX_MIN_LEN_ERR_CNT,
-                       CNTR_NORMAL),
-[C_TX_UNDERRUN] = TXE32_PORT_CNTR_ELEM(TxUnderrun, SEND_UNDERRUN_CNT,
-                       CNTR_NORMAL),
-[C_TX_FLOW_STALL] = TXE32_PORT_CNTR_ELEM(TxFlowStall, SEND_FLOW_STALL_CNT,
-                       CNTR_NORMAL),
-[C_TX_DROPPED] = TXE32_PORT_CNTR_ELEM(TxDropped, SEND_DROPPED_PKT_CNT,
-                       CNTR_NORMAL),
-[C_TX_HDR_ERR] = TXE32_PORT_CNTR_ELEM(TxHdrErr, SEND_HEADERS_ERR_CNT,
-                       CNTR_NORMAL),
-[C_TX_PKT] = TXE64_PORT_CNTR_ELEM(TxPkt, SEND_DATA_PKT_CNT, CNTR_NORMAL),
-[C_TX_WORDS] = TXE64_PORT_CNTR_ELEM(TxWords, SEND_DWORD_CNT, CNTR_NORMAL),
-[C_TX_WAIT] = TXE64_PORT_CNTR_ELEM(TxWait, SEND_WAIT_CNT, CNTR_SYNTH),
-[C_TX_FLIT_VL] = TXE64_PORT_CNTR_ELEM(TxFlitVL, SEND_DATA_VL0_CNT,
-                                     CNTR_SYNTH | CNTR_VL),
-[C_TX_PKT_VL] = TXE64_PORT_CNTR_ELEM(TxPktVL, SEND_DATA_PKT_VL0_CNT,
-                                    CNTR_SYNTH | CNTR_VL),
-[C_TX_WAIT_VL] = TXE64_PORT_CNTR_ELEM(TxWaitVL, SEND_WAIT_VL0_CNT,
-                                     CNTR_SYNTH | CNTR_VL),
-[C_RX_PKT] = RXE64_PORT_CNTR_ELEM(RxPkt, RCV_DATA_PKT_CNT, CNTR_NORMAL),
-[C_RX_WORDS] = RXE64_PORT_CNTR_ELEM(RxWords, RCV_DWORD_CNT, CNTR_NORMAL),
-[C_SW_LINK_DOWN] = CNTR_ELEM("SwLinkDown", 0, 0, CNTR_SYNTH | CNTR_32BIT,
-                            access_sw_link_dn_cnt),
-[C_SW_LINK_UP] = CNTR_ELEM("SwLinkUp", 0, 0, CNTR_SYNTH | CNTR_32BIT,
-                          access_sw_link_up_cnt),
-[C_SW_UNKNOWN_FRAME] = CNTR_ELEM("UnknownFrame", 0, 0, CNTR_NORMAL,
-                                access_sw_unknown_frame_cnt),
-[C_SW_XMIT_DSCD] = CNTR_ELEM("XmitDscd", 0, 0, CNTR_SYNTH | CNTR_32BIT,
-                            access_sw_xmit_discards),
-[C_SW_XMIT_DSCD_VL] = CNTR_ELEM("XmitDscdVl", 0, 0,
-                               CNTR_SYNTH | CNTR_32BIT | CNTR_VL,
-                               access_sw_xmit_discards),
-[C_SW_XMIT_CSTR_ERR] = CNTR_ELEM("XmitCstrErr", 0, 0, CNTR_SYNTH,
-                                access_xmit_constraint_errs),
-[C_SW_RCV_CSTR_ERR] = CNTR_ELEM("RcvCstrErr", 0, 0, CNTR_SYNTH,
-                               access_rcv_constraint_errs),
-[C_SW_IBP_LOOP_PKTS] = SW_IBP_CNTR(LoopPkts, loop_pkts),
-[C_SW_IBP_RC_RESENDS] = SW_IBP_CNTR(RcResend, rc_resends),
-[C_SW_IBP_RNR_NAKS] = SW_IBP_CNTR(RnrNak, rnr_naks),
-[C_SW_IBP_OTHER_NAKS] = SW_IBP_CNTR(OtherNak, other_naks),
-[C_SW_IBP_RC_TIMEOUTS] = SW_IBP_CNTR(RcTimeOut, rc_timeouts),
-[C_SW_IBP_PKT_DROPS] = SW_IBP_CNTR(PktDrop, pkt_drops),
-[C_SW_IBP_DMA_WAIT] = SW_IBP_CNTR(DmaWait, dmawait),
-[C_SW_IBP_RC_SEQNAK] = SW_IBP_CNTR(RcSeqNak, rc_seqnak),
-[C_SW_IBP_RC_DUPREQ] = SW_IBP_CNTR(RcDupRew, rc_dupreq),
-[C_SW_IBP_RDMA_SEQ] = SW_IBP_CNTR(RdmaSeq, rdma_seq),
-[C_SW_IBP_UNALIGNED] = SW_IBP_CNTR(Unaligned, unaligned),
-[C_SW_IBP_SEQ_NAK] = SW_IBP_CNTR(SeqNak, seq_naks),
-[C_SW_CPU_RC_ACKS] = CNTR_ELEM("RcAcks", 0, 0, CNTR_NORMAL,
-                              access_sw_cpu_rc_acks),
-[C_SW_CPU_RC_QACKS] = CNTR_ELEM("RcQacks", 0, 0, CNTR_NORMAL,
-                               access_sw_cpu_rc_qacks),
-[C_SW_CPU_RC_DELAYED_COMP] = CNTR_ELEM("RcDelayComp", 0, 0, CNTR_NORMAL,
-                                      access_sw_cpu_rc_delayed_comp),
-[OVR_LBL(0)] = OVR_ELM(0), [OVR_LBL(1)] = OVR_ELM(1),
-[OVR_LBL(2)] = OVR_ELM(2), [OVR_LBL(3)] = OVR_ELM(3),
-[OVR_LBL(4)] = OVR_ELM(4), [OVR_LBL(5)] = OVR_ELM(5),
-[OVR_LBL(6)] = OVR_ELM(6), [OVR_LBL(7)] = OVR_ELM(7),
-[OVR_LBL(8)] = OVR_ELM(8), [OVR_LBL(9)] = OVR_ELM(9),
-[OVR_LBL(10)] = OVR_ELM(10), [OVR_LBL(11)] = OVR_ELM(11),
-[OVR_LBL(12)] = OVR_ELM(12), [OVR_LBL(13)] = OVR_ELM(13),
-[OVR_LBL(14)] = OVR_ELM(14), [OVR_LBL(15)] = OVR_ELM(15),
-[OVR_LBL(16)] = OVR_ELM(16), [OVR_LBL(17)] = OVR_ELM(17),
-[OVR_LBL(18)] = OVR_ELM(18), [OVR_LBL(19)] = OVR_ELM(19),
-[OVR_LBL(20)] = OVR_ELM(20), [OVR_LBL(21)] = OVR_ELM(21),
-[OVR_LBL(22)] = OVR_ELM(22), [OVR_LBL(23)] = OVR_ELM(23),
-[OVR_LBL(24)] = OVR_ELM(24), [OVR_LBL(25)] = OVR_ELM(25),
-[OVR_LBL(26)] = OVR_ELM(26), [OVR_LBL(27)] = OVR_ELM(27),
-[OVR_LBL(28)] = OVR_ELM(28), [OVR_LBL(29)] = OVR_ELM(29),
-[OVR_LBL(30)] = OVR_ELM(30), [OVR_LBL(31)] = OVR_ELM(31),
-[OVR_LBL(32)] = OVR_ELM(32), [OVR_LBL(33)] = OVR_ELM(33),
-[OVR_LBL(34)] = OVR_ELM(34), [OVR_LBL(35)] = OVR_ELM(35),
-[OVR_LBL(36)] = OVR_ELM(36), [OVR_LBL(37)] = OVR_ELM(37),
-[OVR_LBL(38)] = OVR_ELM(38), [OVR_LBL(39)] = OVR_ELM(39),
-[OVR_LBL(40)] = OVR_ELM(40), [OVR_LBL(41)] = OVR_ELM(41),
-[OVR_LBL(42)] = OVR_ELM(42), [OVR_LBL(43)] = OVR_ELM(43),
-[OVR_LBL(44)] = OVR_ELM(44), [OVR_LBL(45)] = OVR_ELM(45),
-[OVR_LBL(46)] = OVR_ELM(46), [OVR_LBL(47)] = OVR_ELM(47),
-[OVR_LBL(48)] = OVR_ELM(48), [OVR_LBL(49)] = OVR_ELM(49),
-[OVR_LBL(50)] = OVR_ELM(50), [OVR_LBL(51)] = OVR_ELM(51),
-[OVR_LBL(52)] = OVR_ELM(52), [OVR_LBL(53)] = OVR_ELM(53),
-[OVR_LBL(54)] = OVR_ELM(54), [OVR_LBL(55)] = OVR_ELM(55),
-[OVR_LBL(56)] = OVR_ELM(56), [OVR_LBL(57)] = OVR_ELM(57),
-[OVR_LBL(58)] = OVR_ELM(58), [OVR_LBL(59)] = OVR_ELM(59),
-[OVR_LBL(60)] = OVR_ELM(60), [OVR_LBL(61)] = OVR_ELM(61),
-[OVR_LBL(62)] = OVR_ELM(62), [OVR_LBL(63)] = OVR_ELM(63),
-[OVR_LBL(64)] = OVR_ELM(64), [OVR_LBL(65)] = OVR_ELM(65),
-[OVR_LBL(66)] = OVR_ELM(66), [OVR_LBL(67)] = OVR_ELM(67),
-[OVR_LBL(68)] = OVR_ELM(68), [OVR_LBL(69)] = OVR_ELM(69),
-[OVR_LBL(70)] = OVR_ELM(70), [OVR_LBL(71)] = OVR_ELM(71),
-[OVR_LBL(72)] = OVR_ELM(72), [OVR_LBL(73)] = OVR_ELM(73),
-[OVR_LBL(74)] = OVR_ELM(74), [OVR_LBL(75)] = OVR_ELM(75),
-[OVR_LBL(76)] = OVR_ELM(76), [OVR_LBL(77)] = OVR_ELM(77),
-[OVR_LBL(78)] = OVR_ELM(78), [OVR_LBL(79)] = OVR_ELM(79),
-[OVR_LBL(80)] = OVR_ELM(80), [OVR_LBL(81)] = OVR_ELM(81),
-[OVR_LBL(82)] = OVR_ELM(82), [OVR_LBL(83)] = OVR_ELM(83),
-[OVR_LBL(84)] = OVR_ELM(84), [OVR_LBL(85)] = OVR_ELM(85),
-[OVR_LBL(86)] = OVR_ELM(86), [OVR_LBL(87)] = OVR_ELM(87),
-[OVR_LBL(88)] = OVR_ELM(88), [OVR_LBL(89)] = OVR_ELM(89),
-[OVR_LBL(90)] = OVR_ELM(90), [OVR_LBL(91)] = OVR_ELM(91),
-[OVR_LBL(92)] = OVR_ELM(92), [OVR_LBL(93)] = OVR_ELM(93),
-[OVR_LBL(94)] = OVR_ELM(94), [OVR_LBL(95)] = OVR_ELM(95),
-[OVR_LBL(96)] = OVR_ELM(96), [OVR_LBL(97)] = OVR_ELM(97),
-[OVR_LBL(98)] = OVR_ELM(98), [OVR_LBL(99)] = OVR_ELM(99),
-[OVR_LBL(100)] = OVR_ELM(100), [OVR_LBL(101)] = OVR_ELM(101),
-[OVR_LBL(102)] = OVR_ELM(102), [OVR_LBL(103)] = OVR_ELM(103),
-[OVR_LBL(104)] = OVR_ELM(104), [OVR_LBL(105)] = OVR_ELM(105),
-[OVR_LBL(106)] = OVR_ELM(106), [OVR_LBL(107)] = OVR_ELM(107),
-[OVR_LBL(108)] = OVR_ELM(108), [OVR_LBL(109)] = OVR_ELM(109),
-[OVR_LBL(110)] = OVR_ELM(110), [OVR_LBL(111)] = OVR_ELM(111),
-[OVR_LBL(112)] = OVR_ELM(112), [OVR_LBL(113)] = OVR_ELM(113),
-[OVR_LBL(114)] = OVR_ELM(114), [OVR_LBL(115)] = OVR_ELM(115),
-[OVR_LBL(116)] = OVR_ELM(116), [OVR_LBL(117)] = OVR_ELM(117),
-[OVR_LBL(118)] = OVR_ELM(118), [OVR_LBL(119)] = OVR_ELM(119),
-[OVR_LBL(120)] = OVR_ELM(120), [OVR_LBL(121)] = OVR_ELM(121),
-[OVR_LBL(122)] = OVR_ELM(122), [OVR_LBL(123)] = OVR_ELM(123),
-[OVR_LBL(124)] = OVR_ELM(124), [OVR_LBL(125)] = OVR_ELM(125),
-[OVR_LBL(126)] = OVR_ELM(126), [OVR_LBL(127)] = OVR_ELM(127),
-[OVR_LBL(128)] = OVR_ELM(128), [OVR_LBL(129)] = OVR_ELM(129),
-[OVR_LBL(130)] = OVR_ELM(130), [OVR_LBL(131)] = OVR_ELM(131),
-[OVR_LBL(132)] = OVR_ELM(132), [OVR_LBL(133)] = OVR_ELM(133),
-[OVR_LBL(134)] = OVR_ELM(134), [OVR_LBL(135)] = OVR_ELM(135),
-[OVR_LBL(136)] = OVR_ELM(136), [OVR_LBL(137)] = OVR_ELM(137),
-[OVR_LBL(138)] = OVR_ELM(138), [OVR_LBL(139)] = OVR_ELM(139),
-[OVR_LBL(140)] = OVR_ELM(140), [OVR_LBL(141)] = OVR_ELM(141),
-[OVR_LBL(142)] = OVR_ELM(142), [OVR_LBL(143)] = OVR_ELM(143),
-[OVR_LBL(144)] = OVR_ELM(144), [OVR_LBL(145)] = OVR_ELM(145),
-[OVR_LBL(146)] = OVR_ELM(146), [OVR_LBL(147)] = OVR_ELM(147),
-[OVR_LBL(148)] = OVR_ELM(148), [OVR_LBL(149)] = OVR_ELM(149),
-[OVR_LBL(150)] = OVR_ELM(150), [OVR_LBL(151)] = OVR_ELM(151),
-[OVR_LBL(152)] = OVR_ELM(152), [OVR_LBL(153)] = OVR_ELM(153),
-[OVR_LBL(154)] = OVR_ELM(154), [OVR_LBL(155)] = OVR_ELM(155),
-[OVR_LBL(156)] = OVR_ELM(156), [OVR_LBL(157)] = OVR_ELM(157),
-[OVR_LBL(158)] = OVR_ELM(158), [OVR_LBL(159)] = OVR_ELM(159),
-};
-
-/* ======================================================================== */
-
-/*
- * Return non-zero if this is chip revision a, i.e. the upper nibble of the
- * minor chip revision field in dd->revision is zero.
- */
-int is_ax(struct hfi1_devdata *dd)
-{
-       u8 minor = (dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT) &
-                  CCE_REVISION_CHIP_REV_MINOR_MASK;
-
-       return !(minor & 0xf0);
-}
-
-/*
- * Return non-zero if this is chip revision b, i.e. the upper nibble of the
- * minor chip revision field in dd->revision is 1.
- */
-int is_bx(struct hfi1_devdata *dd)
-{
-       u8 minor = (dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT) &
-                  CCE_REVISION_CHIP_REV_MINOR_MASK;
-
-       return (minor & 0xf0) == 0x10;
-}
-
-/*
- * Append string s to buffer buf, preceded by a ',' separator unless the
- * buffer is still empty.
- *
- * *curp is the current write position and *lenp the remaining length; both
- * are updated on return, also when the string did not fit.  Characters
- * copied before running out of room are left in the buffer.
- *
- * return 0 on success, 1 on out of room
- */
-static int append_str(char *buf, char **curp, int *lenp, const char *s)
-{
-       char *out = *curp;
-       int room = *lenp;
-       int full = 0;
-
-       /* separator, unless this is the first entry in the buffer */
-       if (out != buf) {
-               if (room == 0) {
-                       full = 1;
-               } else {
-                       *out++ = ',';
-                       room--;
-               }
-       }
-
-       /* copy the string, stopping as soon as the room is used up */
-       while (!full && *s != 0) {
-               if (room == 0) {
-                       full = 1;
-                       break;
-               }
-               *out++ = *s++;
-               room--;
-       }
-
-       /* write return values */
-       *curp = out;
-       *lenp = room;
-
-       return full;
-}
-
-/*
- * Using the given flag table, print a comma separated string into
- * the buffer.  End in '*' if the buffer is too short.
- *
- * buf, buf_len:      output buffer and its total size in bytes
- * flags:             bit set to decode
- * table, table_size: flag/name pairs and the number of entries
- *
- * Bits of flags that match no table entry are reported as one trailing
- * "bits 0x<hex>" item.  Returns buf, or "" (buf untouched) when buf_len is
- * too small to hold even "*" plus the terminating nul.
- */
-static char *flag_string(char *buf, int buf_len, u64 flags,
-                        struct flag_table *table, int table_size)
-{
-       char extra[32];
-       char *p = buf;
-       int len = buf_len;
-       int no_room = 0;
-       int i;
-
-       /* make sure there is at least 2 so we can form "*" */
-       if (len < 2)
-               return "";
-
-       len--;  /* leave room for a nul */
-       for (i = 0; i < table_size; i++) {
-               if (flags & table[i].flag) {
-                       no_room = append_str(buf, &p, &len, table[i].str);
-                       if (no_room)
-                               break;
-                       /* named - whatever is left at the end is unknown */
-                       flags &= ~table[i].flag;
-               }
-       }
-
-       /* any undocumented bits left? */
-       if (!no_room && flags) {
-               snprintf(extra, sizeof(extra), "bits 0x%llx", flags);
-               no_room = append_str(buf, &p, &len, extra);
-       }
-
-       /* add * if ran out of room */
-       if (no_room) {
-               /*
-                * may need to back up to add space for a '*' - this
-                * overwrites the last character that was copied
-                */
-               if (len == 0)
-                       --p;
-               *p++ = '*';
-       }
-
-       /* add final nul - space already allocated above */
-       *p = 0;
-       return buf;
-}
-
-/*
- * first 8 CCE error interrupt source names, indexed by source number;
- * looked up by is_misc_err_name()
- */
-static const char * const cce_misc_names[] = {
-       "CceErrInt",            /* 0 */
-       "RxeErrInt",            /* 1 */
-       "MiscErrInt",           /* 2 */
-       "Reserved3",            /* 3 */
-       "PioErrInt",            /* 4 */
-       "SDmaErrInt",           /* 5 */
-       "EgressErrInt",         /* 6 */
-       "TxeErrInt"             /* 7 */
-};
-
-/*
- * Return the miscellaneous error interrupt name.
- *
- * Sources covered by cce_misc_names[] use the table entry; any other source
- * is reported as "Reserved<n>" with n = source + IS_GENERAL_ERR_START.
- * The name is written to buf (at most bsize bytes, truncated if needed, and
- * always nul terminated when bsize is non-zero) and buf is returned.
- */
-static char *is_misc_err_name(char *buf, size_t bsize, unsigned int source)
-{
-       /*
-        * Use snprintf() rather than strncpy(): strncpy() leaves buf without
-        * a terminating nul when the name does not fit.
-        */
-       if (source < ARRAY_SIZE(cce_misc_names))
-               snprintf(buf, bsize, "%s", cce_misc_names[source]);
-       else
-               snprintf(buf, bsize, "Reserved%u",
-                        source + IS_GENERAL_ERR_START);
-
-       return buf;
-}
-
-/*
- * Return the SDMA engine error interrupt name.
- *
- * Formats "SDmaEngErrInt<source>" into buf (at most bsize bytes) and
- * returns buf.
- */
-static char *is_sdma_eng_err_name(char *buf, size_t bsize, unsigned int source)
-{
-       snprintf(buf, bsize, "SDmaEngErrInt%u", source);
-
-       return buf;
-}
-
-/*
- * Return the send context error interrupt name.
- *
- * Formats "SendCtxtErrInt<source>" into buf (at most bsize bytes) and
- * returns buf.
- */
-static char *is_sendctxt_err_name(char *buf, size_t bsize, unsigned int source)
-{
-       snprintf(buf, bsize, "SendCtxtErrInt%u", source);
-
-       return buf;
-}
-
-/*
- * "various" interrupt source names, indexed by source number; looked up
- * by is_various_name()
- */
-static const char * const various_names[] = {
-       "PbcInt",
-       "GpioAssertInt",
-       "Qsfp1Int",
-       "Qsfp2Int",
-       "TCritInt"
-};
-
-/*
- * Return the various interrupt name.
- *
- * Sources covered by various_names[] use the table entry; any other source
- * is reported as "Reserved<n>" with n = source + IS_VARIOUS_START.
- * The name is written to buf (at most bsize bytes, truncated if needed, and
- * always nul terminated when bsize is non-zero) and buf is returned.
- */
-static char *is_various_name(char *buf, size_t bsize, unsigned int source)
-{
-       /*
-        * Use snprintf() rather than strncpy(): strncpy() leaves buf without
-        * a terminating nul when the name does not fit.
-        */
-       if (source < ARRAY_SIZE(various_names))
-               snprintf(buf, bsize, "%s", various_names[source]);
-       else
-               snprintf(buf, bsize, "Reserved%u", source + IS_VARIOUS_START);
-       return buf;
-}
-
-/*
- * Return the DC interrupt name.
- *
- * The first four sources are named "dc_<block>_int"; anything beyond the
- * table is reported as "DCInt<source>".  Returns buf.
- */
-static char *is_dc_name(char *buf, size_t bsize, unsigned int source)
-{
-       static const char * const dc_int_names[] = {
-               "common",
-               "lcb",
-               "8051",
-               "lbm"   /* local block merge */
-       };
-
-       if (source >= ARRAY_SIZE(dc_int_names)) {
-               snprintf(buf, bsize, "DCInt%u", source);
-               return buf;
-       }
-
-       snprintf(buf, bsize, "dc_%s_int", dc_int_names[source]);
-       return buf;
-}
-
-/*
- * SDMA engine interrupt type names; is_sdma_eng_name() indexes this with
- * source / TXE_NUM_SDMA_ENGINES
- */
-static const char * const sdma_int_names[] = {
-       "SDmaInt",
-       "SdmaIdleInt",
-       "SdmaProgressInt",
-};
-
-/*
- * Return the SDMA engine interrupt name.
- *
- * source encodes both the interrupt type and the engine: the type is
- * source / TXE_NUM_SDMA_ENGINES and the engine is the remainder.  A type
- * with no entry in sdma_int_names[] yields "Invalid SDMA interrupt <source>".
- * Returns buf.
- */
-static char *is_sdma_eng_name(char *buf, size_t bsize, unsigned int source)
-{
-       /* what interrupt */
-       unsigned int what  = source / TXE_NUM_SDMA_ENGINES;
-       /* which engine */
-       unsigned int which = source % TXE_NUM_SDMA_ENGINES;
-
-       /* bound by the table itself so the two cannot get out of step */
-       if (likely(what < ARRAY_SIZE(sdma_int_names)))
-               snprintf(buf, bsize, "%s%u", sdma_int_names[what], which);
-       else
-               snprintf(buf, bsize, "Invalid SDMA interrupt %u", source);
-       return buf;
-}
-
-/*
- * Return the receive available interrupt name.
- *
- * Formats "RcvAvailInt<source>" into buf (at most bsize bytes) and
- * returns buf.
- */
-static char *is_rcv_avail_name(char *buf, size_t bsize, unsigned int source)
-{
-       snprintf(buf, bsize, "RcvAvailInt%u", source);
-
-       return buf;
-}
-
-/*
- * Return the receive urgent interrupt name.
- *
- * Formats "RcvUrgentInt<source>" into buf (at most bsize bytes) and
- * returns buf.
- */
-static char *is_rcv_urgent_name(char *buf, size_t bsize, unsigned int source)
-{
-       snprintf(buf, bsize, "RcvUrgentInt%u", source);
-
-       return buf;
-}
-
-/*
- * Return the send credit interrupt name.
- *
- * Formats "SendCreditInt<source>" into buf (at most bsize bytes) and
- * returns buf.
- */
-static char *is_send_credit_name(char *buf, size_t bsize, unsigned int source)
-{
-       snprintf(buf, bsize, "SendCreditInt%u", source);
-
-       return buf;
-}
-
-/*
- * Return the reserved interrupt name.
- *
- * Formats "Reserved<n>" into buf (at most bsize bytes), where n is source
- * offset by IS_RESERVED_START, and returns buf.
- */
-static char *is_reserved_name(char *buf, size_t bsize, unsigned int source)
-{
-       unsigned int number = source + IS_RESERVED_START;
-
-       snprintf(buf, bsize, "Reserved%u", number);
-       return buf;
-}
-
-/* Format flags into buf using the cce_err_status_flags table. */
-static char *cce_err_status_string(char *buf, int buf_len, u64 flags)
-{
-       return flag_string(buf, buf_len, flags, cce_err_status_flags,
-                          ARRAY_SIZE(cce_err_status_flags));
-}
-
-/* Format flags into buf using the rxe_err_status_flags table. */
-static char *rxe_err_status_string(char *buf, int buf_len, u64 flags)
-{
-       return flag_string(buf, buf_len, flags, rxe_err_status_flags,
-                          ARRAY_SIZE(rxe_err_status_flags));
-}
-
-/* Format flags into buf using the misc_err_status_flags table. */
-static char *misc_err_status_string(char *buf, int buf_len, u64 flags)
-{
-       return flag_string(buf, buf_len, flags,
-                          misc_err_status_flags,
-                          ARRAY_SIZE(misc_err_status_flags));
-}
-
-/* Format flags into buf using the pio_err_status_flags table. */
-static char *pio_err_status_string(char *buf, int buf_len, u64 flags)
-{
-       return flag_string(buf, buf_len, flags, pio_err_status_flags,
-                          ARRAY_SIZE(pio_err_status_flags));
-}
-
-/* Format flags into buf using the sdma_err_status_flags table. */
-static char *sdma_err_status_string(char *buf, int buf_len, u64 flags)
-{
-       return flag_string(buf, buf_len, flags, sdma_err_status_flags,
-                          ARRAY_SIZE(sdma_err_status_flags));
-}
-
-/* Format flags into buf using the egress_err_status_flags table. */
-static char *egress_err_status_string(char *buf, int buf_len, u64 flags)
-{
-       return flag_string(buf, buf_len, flags, egress_err_status_flags,
-                          ARRAY_SIZE(egress_err_status_flags));
-}
-
-/* Format flags into buf using the egress_err_info_flags table. */
-static char *egress_err_info_string(char *buf, int buf_len, u64 flags)
-{
-       return flag_string(buf, buf_len, flags, egress_err_info_flags,
-                          ARRAY_SIZE(egress_err_info_flags));
-}
-
-/* Format flags into buf using the send_err_status_flags table. */
-static char *send_err_status_string(char *buf, int buf_len, u64 flags)
-{
-       return flag_string(buf, buf_len, flags, send_err_status_flags,
-                          ARRAY_SIZE(send_err_status_flags));
-}
-
-/*
- * CCE error handler: log the decoded status in reg, start freeze handling
- * for the CLI2 async FIFO parity error where that is required, and bump the
- * software counter of every status bit that is set.  The u32 argument is
- * not used.
- */
-static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
-{
-       char msg[96];
-       int bit;
-
-       /*
-        * For most these errors, there is nothing that can be done except
-        * report or record it.
-        */
-       dd_dev_info(dd, "CCE Error: %s\n",
-                   cce_err_status_string(msg, sizeof(msg), reg));
-
-       /*
-        * this error requires a manual drop into SPC freeze mode
-        * then a fix up
-        */
-       if ((reg & CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK) &&
-           is_ax(dd) && (dd->icode != ICODE_FUNCTIONAL_SIMULATOR))
-               start_freeze_handling(dd->pport, FREEZE_SELF);
-
-       for (bit = 0; bit < NUM_CCE_ERR_STATUS_COUNTERS; bit++) {
-               if (!(reg & (1ull << bit)))
-                       continue;
-
-               incr_cntr64(&dd->cce_err_status_cnt[bit]);
-               /* maintain a counter over all cce_err_status errors */
-               incr_cntr64(&dd->sw_cce_err_status_aggregate);
-       }
-}
-
-/*
- * Check counters for receive errors that do not have an interrupt
- * associated with them.
- *
- * Timer callback: opaque is the hfi1_devdata pointer.  When the receive
- * overflow counter has grown since the previous check and the port error
- * action asks for it, the link is bounced.  The timer then re-arms itself.
- */
-#define RCVERR_CHECK_TIME 10   /* multiplied by HZ for the timer period */
-static void update_rcverr_timer(unsigned long opaque)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)opaque;
-       struct hfi1_pportdata *ppd = dd->pport;
-       u32 ovfl_now = read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL);
-
-       if ((dd->rcv_ovfl_cnt < ovfl_now) &&
-           (ppd->port_error_action & OPA_PI_MASK_EX_BUFFER_OVERRUN)) {
-               dd_dev_info(dd, "%s: PortErrorAction bounce\n", __func__);
-               set_link_down_reason(ppd,
-                                    OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN,
-                                    0,
-                                    OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN);
-               queue_work(ppd->hfi1_wq, &ppd->link_bounce_work);
-       }
-
-       /* remember the count for the next comparison */
-       dd->rcv_ovfl_cnt = ovfl_now;
-
-       mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME);
-}
-
-/*
- * Set up and start the receive error check timer.  Returns the result of
- * mod_timer().
- */
-static int init_rcverr(struct hfi1_devdata *dd)
-{
-       /* Assume the hardware counter has been reset */
-       dd->rcv_ovfl_cnt = 0;
-
-       setup_timer(&dd->rcverr_timer, update_rcverr_timer, (unsigned long)dd);
-
-       return mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME);
-}
-
-/*
- * Stop the receive error check timer.  A non-zero timer data field is used
- * as the "timer was set up" marker and is cleared afterwards.
- */
-static void free_rcverr(struct hfi1_devdata *dd)
-{
-       struct timer_list *timer = &dd->rcverr_timer;
-
-       if (timer->data)
-               del_timer_sync(timer);
-       timer->data = 0;
-}
-
-/*
- * Receive error handler: log the decoded status in reg, start freeze
- * handling for freeze-class errors, and bump the software counter of every
- * status bit that is set.  The u32 argument is not used.
- */
-static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
-{
-       char msg[96];
-       int bit;
-
-       dd_dev_info(dd, "Receive Error: %s\n",
-                   rxe_err_status_string(msg, sizeof(msg), reg));
-
-       if (reg & ALL_RXE_FREEZE_ERR) {
-               /*
-                * Freeze mode recovery is disabled for the errors
-                * in RXE_FREEZE_ABORT_MASK
-                */
-               int flags = (is_ax(dd) && (reg & RXE_FREEZE_ABORT_MASK)) ?
-                           FREEZE_ABORT : 0;
-
-               start_freeze_handling(dd->pport, flags);
-       }
-
-       for (bit = 0; bit < NUM_RCV_ERR_STATUS_COUNTERS; bit++) {
-               if (!(reg & (1ull << bit)))
-                       continue;
-               incr_cntr64(&dd->rcv_err_status_cnt[bit]);
-       }
-}
-
-/*
- * Misc error handler: log the decoded status in reg and bump the software
- * counter of every status bit that is set.  The u32 argument is not used.
- */
-static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
-{
-       char buf[96];
-       int i = 0;
-
-       /* terminate the message with a newline like the other handlers */
-       dd_dev_info(dd, "Misc Error: %s\n",
-                   misc_err_status_string(buf, sizeof(buf), reg));
-       for (i = 0; i < NUM_MISC_ERR_STATUS_COUNTERS; i++) {
-               if (reg & (1ull << i))
-                       incr_cntr64(&dd->misc_err_status_cnt[i]);
-       }
-}
-
-/*
- * PIO error handler: log the decoded status in reg, start freeze handling
- * for freeze-class errors, and bump the software counter of every status
- * bit that is set.  The u32 argument is not used.
- */
-static void handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
-{
-       char msg[96];
-       int bit;
-
-       dd_dev_info(dd, "PIO Error: %s\n",
-                   pio_err_status_string(msg, sizeof(msg), reg));
-
-       if (reg & ALL_PIO_FREEZE_ERR)
-               start_freeze_handling(dd->pport, 0);
-
-       for (bit = 0; bit < NUM_SEND_PIO_ERR_STATUS_COUNTERS; bit++) {
-               if (!(reg & (1ull << bit)))
-                       continue;
-               incr_cntr64(&dd->send_pio_err_status_cnt[bit]);
-       }
-}
-
-/*
- * SDMA error handler: log the decoded status in reg, start freeze handling
- * for freeze-class errors, and bump the software counter of every status
- * bit that is set.  The u32 argument is not used.
- */
-static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
-{
-       char msg[96];
-       int bit;
-
-       dd_dev_info(dd, "SDMA Error: %s\n",
-                   sdma_err_status_string(msg, sizeof(msg), reg));
-
-       if (reg & ALL_SDMA_FREEZE_ERR)
-               start_freeze_handling(dd->pport, 0);
-
-       for (bit = 0; bit < NUM_SEND_DMA_ERR_STATUS_COUNTERS; bit++) {
-               if (!(reg & (1ull << bit)))
-                       continue;
-               incr_cntr64(&dd->send_dma_err_status_cnt[bit]);
-       }
-}
-
-/* Count one transmit discard against the port. */
-static inline void __count_port_discards(struct hfi1_pportdata *ppd)
-{
-       u64 *discards = &ppd->port_xmit_discards;
-
-       incr_cntr64(discards);
-}
-
-/* A 'port inactive' egress error is counted as a discard on dd->pport. */
-static void count_port_inactive(struct hfi1_devdata *dd)
-{
-       struct hfi1_pportdata *ppd = dd->pport;
-
-       __count_port_discards(ppd);
-}
-
-/*
- * We have had a "disallowed packet" error during egress. Determine the
- * integrity check which failed, and update relevant error counter, etc.
- *
- * Note that the SEND_EGRESS_ERR_INFO register has only a single
- * bit of state per integrity check, and so we can miss the reason for an
- * egress error if more than one packet fails the same integrity check
- * since we cleared the corresponding bit in SEND_EGRESS_ERR_INFO.
- *
- * vl selects the per-VL discard counter to charge: a data VL index or 15.
- * For any other value (the translation helpers in this file return -1 when
- * the VL is unknown) only the port-wide discard counter is updated.
- */
-static void handle_send_egress_err_info(struct hfi1_devdata *dd,
-                                       int vl)
-{
-       struct hfi1_pportdata *ppd = dd->pport;
-       u64 src = read_csr(dd, SEND_EGRESS_ERR_SOURCE); /* read first */
-       u64 info = read_csr(dd, SEND_EGRESS_ERR_INFO);
-       char buf[96];
-
-       /* clear down all observed info as quickly as possible after read */
-       write_csr(dd, SEND_EGRESS_ERR_INFO, info);
-
-       dd_dev_info(dd,
-                   "Egress Error Info: 0x%llx, %s Egress Error Src 0x%llx\n",
-                   info, egress_err_info_string(buf, sizeof(buf), info), src);
-
-       /* Eventually add other counters for each bit */
-       if (info & PORT_DISCARD_EGRESS_ERRS) {
-               int weight, i;
-
-               /*
-                * Count all applicable bits as individual errors and
-                * attribute them to the packet that triggered this handler.
-                * This may not be completely accurate due to limitations
-                * on the available hardware error information.  There is
-                * a single information register and any number of error
-                * packets may have occurred and contributed to it before
-                * this routine is called.  This means that:
-                * a) If multiple packets with the same error occur before
-                *    this routine is called, earlier packets are missed.
-                *    There is only a single bit for each error type.
-                * b) Errors may not be attributed to the correct VL.
-                *    The driver is attributing all bits in the info register
-                *    to the packet that triggered this call, but bits
-                *    could be an accumulation of different packets with
-                *    different VLs.
-                * c) A single error packet may have multiple counts attached
-                *    to it.  There is no way for the driver to know if
-                *    multiple bits set in the info register are due to a
-                *    single packet or multiple packets.  The driver assumes
-                *    multiple packets.
-                */
-               weight = hweight64(info & PORT_DISCARD_EGRESS_ERRS);
-               /* one port discard (and one per-VL discard) per set bit */
-               for (i = 0; i < weight; i++) {
-                       __count_port_discards(ppd);
-                       if (vl >= 0 && vl < TXE_NUM_DATA_VL)
-                               incr_cntr64(&ppd->port_xmit_discards_vl[vl]);
-                       else if (vl == 15)
-                               incr_cntr64(&ppd->port_xmit_discards_vl
-                                           [C_VL_15]);
-               }
-       }
-}
-
-/*
- * Does bit position posn of the SEND_EGRESS_ERR_STATUS register represent
- * a 'port inactive' error?  Those bits form the contiguous range from
- * TX_LINKDOWN up to and including TX_INCORRECT_LINK_STATE.
- */
-static inline int port_inactive_err(u64 posn)
-{
-       if (posn < SEES(TX_LINKDOWN))
-               return 0;
-
-       return posn <= SEES(TX_INCORRECT_LINK_STATE);
-}
-
-/*
- * Does bit position posn of the SEND_EGRESS_ERR_STATUS register represent
- * a 'disallowed packet' error?  Those bits form the contiguous range from
- * TX_SDMA0_DISALLOWED_PACKET up to and including
- * TX_SDMA15_DISALLOWED_PACKET.
- */
-static inline int disallowed_pkt_err(int posn)
-{
-       if (posn < SEES(TX_SDMA0_DISALLOWED_PACKET))
-               return 0;
-
-       return posn <= SEES(TX_SDMA15_DISALLOWED_PACKET);
-}
-
-/*
- * Map the bit position of an SDMA engine disallowed packet error to the
- * engine number, counted from the SDMA0 bit.  Only meaningful for
- * positions accepted by disallowed_pkt_err().
- */
-static inline int disallowed_pkt_engine(int posn)
-{
-       int first = SEES(TX_SDMA0_DISALLOWED_PACKET);
-
-       return posn - first;
-}
-
-/*
- * Translate an SDMA engine to a VL.  Return -1 if the translation cannot
- * be done, i.e. the engine number is out of range.
- *
- * The engine-to-VL map is read under rcu_read_lock().
- */
-static int engine_to_vl(struct hfi1_devdata *dd, int engine)
-{
-       struct sdma_vl_map *map;
-       int vl;
-
-       /* range check */
-       if (!(engine >= 0 && engine < TXE_NUM_SDMA_ENGINES))
-               return -1;
-
-       rcu_read_lock();
-       map = rcu_dereference(dd->sdma_map);
-       vl = map->engine_to_vl[engine];
-       rcu_read_unlock();
-
-       return vl;
-}
-
-/*
- * Translate the send context (software index) into a VL.  Return -1 if the
- * translation cannot be done.
- *
- * VL15 is checked first, then the first num_vls entries of dd->vld[].
- */
-static int sc_to_vl(struct hfi1_devdata *dd, int sw_index)
-{
-       struct send_context_info *sci = &dd->send_contexts[sw_index];
-       struct send_context *sc;
-       int vl;
-
-       /* there is no information for user (PSM) and ack contexts */
-       if (!(sci->type == SC_KERNEL || sci->type == SC_VL15))
-               return -1;
-
-       sc = sci->sc;
-       if (!sc)
-               return -1;
-
-       if (dd->vld[15].sc == sc)
-               return 15;
-
-       for (vl = 0; vl < num_vls; vl++) {
-               if (dd->vld[vl].sc == sc)
-                       return vl;
-       }
-
-       return -1;
-}
-
-/*
- * Egress error handler.  Starts freeze handling where the status in reg
- * calls for it, then walks the set bits from the highest one down:
- * 'port inactive' and SDMA 'disallowed packet' bits are accounted as port
- * discards and dropped from the value that is logged and counted per bit
- * at the end.  The u32 argument is not used.
- */
-static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
-{
-       u64 reg_copy = reg, handled = 0;
-       char buf[96];
-       int i = 0;
-
-       if (reg & ALL_TXE_EGRESS_FREEZE_ERR)
-               start_freeze_handling(dd->pport, 0);
-       else if (is_ax(dd) &&
-                (reg & SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK) &&
-                (dd->icode != ICODE_FUNCTIONAL_SIMULATOR))
-               start_freeze_handling(dd->pport, 0);
-
-       /* consume reg_copy one bit at a time, highest set bit first */
-       while (reg_copy) {
-               int posn = fls64(reg_copy);
-               /* fls64() returns a 1-based offset, we want it zero based */
-               int shift = posn - 1;
-               u64 mask = 1ULL << shift;
-
-               if (port_inactive_err(shift)) {
-                       count_port_inactive(dd);
-                       handled |= mask;
-               } else if (disallowed_pkt_err(shift)) {
-                       int vl = engine_to_vl(dd, disallowed_pkt_engine(shift));
-
-                       handle_send_egress_err_info(dd, vl);
-                       handled |= mask;
-               }
-               reg_copy &= ~mask;
-       }
-
-       /* bits dealt with above are neither logged nor counted below */
-       reg &= ~handled;
-
-       if (reg)
-               dd_dev_info(dd, "Egress Error: %s\n",
-                           egress_err_status_string(buf, sizeof(buf), reg));
-
-       for (i = 0; i < NUM_SEND_EGRESS_ERR_STATUS_COUNTERS; i++) {
-               if (reg & (1ull << i))
-                       incr_cntr64(&dd->send_egress_err_status_cnt[i]);
-       }
-}
-
-/*
- * Send error handler: log the decoded status in reg and bump the software
- * counter of every status bit that is set.  The u32 argument is not used.
- */
-static void handle_txe_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
-{
-       char msg[96];
-       int bit;
-
-       dd_dev_info(dd, "Send Error: %s\n",
-                   send_err_status_string(msg, sizeof(msg), reg));
-
-       for (bit = 0; bit < NUM_SEND_ERR_STATUS_COUNTERS; bit++) {
-               if (!(reg & (1ull << bit)))
-                       continue;
-               incr_cntr64(&dd->send_err_status_cnt[bit]);
-       }
-}
-
-/*
- * The maximum number of times the error clear down will loop before
- * blocking a repeating error.  This value is arbitrary.
- */
-#define MAX_CLEAR_COUNT 20
-
-/*
- * Clear and handle an error register.  All error interrupts are funneled
- * through here to have a central location to correctly handle single-
- * or multi-shot errors.
- *
- * For non per-context registers, call this routine with a context value
- * of 0 so the per-context offset is zero.
- *
- * If the handler loops too many times, assume that something is wrong
- * and can't be fixed, so mask the error bits.
- *
- * eri supplies the status, clear and mask CSRs, the description used in
- * the "repeating" message, and the optional handler, which is called with
- * the status value that was just read and cleared.
- */
-static void interrupt_clear_down(struct hfi1_devdata *dd,
-                                u32 context,
-                                const struct err_reg_info *eri)
-{
-       u64 reg;
-       u32 count;
-
-       /* read in a loop until no more errors are seen */
-       count = 0;
-       while (1) {
-               reg = read_kctxt_csr(dd, context, eri->status);
-               if (reg == 0)
-                       break;
-               /* clear what was read before handling it */
-               write_kctxt_csr(dd, context, eri->clear, reg);
-               if (likely(eri->handler))
-                       eri->handler(dd, context, reg);
-               count++;
-               if (count > MAX_CLEAR_COUNT) {
-                       u64 mask;
-
-                       dd_dev_err(dd, "Repeating %s bits 0x%llx - masking\n",
-                                  eri->desc, reg);
-                       /*
-                        * Read-modify-write so any other masked bits
-                        * remain masked.
-                        */
-                       mask = read_kctxt_csr(dd, context, eri->mask);
-                       mask &= ~reg;
-                       write_kctxt_csr(dd, context, eri->mask, mask);
-                       break;
-               }
-       }
-}
-
-/*
- * CCE block "misc" interrupt.  Source is < 16.
- *
- * Sources whose misc_errs[] entry has a handler are cleared down through
- * interrupt_clear_down(); the rest are reported as unexpected.
- */
-static void is_misc_err_int(struct hfi1_devdata *dd, unsigned int source)
-{
-       const struct err_reg_info *eri = &misc_errs[source];
-
-       if (!eri->handler) {
-               dd_dev_err(dd, "Unexpected misc interrupt (%u) - reserved\n",
-                          source);
-               return;
-       }
-
-       interrupt_clear_down(dd, 0, eri);
-}
-
-/* Format flags into buf using the sc_err_status_flags table. */
-static char *send_context_err_status_string(char *buf, int buf_len, u64 flags)
-{
-       return flag_string(buf, buf_len, flags, sc_err_status_flags,
-                          ARRAY_SIZE(sc_err_status_flags));
-}
-
-/*
- * Send context error interrupt.  Source (hw_context) is < 160.
- *
- * All send context errors cause the send context to halt.  The normal
- * clear-down mechanism cannot be used because we cannot clear the
- * error bits until several other long-running items are done first.
- * This is OK because with the context halted, nothing else is going
- * to happen on it anyway.
- *
- * The error status is read and logged here but not cleared.  Nothing
- * further is done when the hardware context does not map to an allocated
- * software send context.
- */
-static void is_sendctxt_err_int(struct hfi1_devdata *dd,
-                               unsigned int hw_context)
-{
-       struct send_context_info *sci;
-       struct send_context *sc;
-       char flags[96];
-       u64 status;
-       u32 sw_index;
-       int i = 0;
-
-       /* translate the hardware context into the software index */
-       sw_index = dd->hw_to_sw[hw_context];
-       if (sw_index >= dd->num_send_contexts) {
-               dd_dev_err(dd,
-                          "out of range sw index %u for send context %u\n",
-                          sw_index, hw_context);
-               return;
-       }
-       sci = &dd->send_contexts[sw_index];
-       sc = sci->sc;
-       if (!sc) {
-               dd_dev_err(dd, "%s: context %u(%u): no sc?\n", __func__,
-                          sw_index, hw_context);
-               return;
-       }
-
-       /* tell the software that a halt has begun */
-       sc_stop(sc, SCF_HALTED);
-
-       status = read_kctxt_csr(dd, hw_context, SEND_CTXT_ERR_STATUS);
-
-       dd_dev_info(dd, "Send Context %u(%u) Error: %s\n", sw_index, hw_context,
-                   send_context_err_status_string(flags, sizeof(flags),
-                                                  status));
-
-       /* a disallowed packet also has egress error info to account for */
-       if (status & SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK)
-               handle_send_egress_err_info(dd, sc_to_vl(dd, sw_index));
-
-       /*
-        * Automatically restart halted kernel contexts out of interrupt
-        * context.  User contexts must ask the driver to restart the context.
-        */
-       if (sc->type != SC_USER)
-               queue_work(dd->pport->hfi1_wq, &sc->halt_work);
-
-       /*
-        * Update the counters for the corresponding status bits.
-        * Note that these particular counters are aggregated over all
-        * 160 contexts.
-        */
-       for (i = 0; i < NUM_SEND_CTXT_ERR_STATUS_COUNTERS; i++) {
-               if (status & (1ull << i))
-                       incr_cntr64(&dd->sw_ctxt_err_status_cnt[i]);
-       }
-}
-
-/*
- * SDMA engine error handler.  source selects the engine in dd->per_sdma[]
- * and status is the engine's error status: the engine error count is
- * bumped, the error is passed to sdma_engine_error(), and the software
- * counter of every status bit that is set is incremented.
- */
-static void handle_sdma_eng_err(struct hfi1_devdata *dd,
-                               unsigned int source, u64 status)
-{
-       struct sdma_engine *sde = &dd->per_sdma[source];
-       int bit;
-
-#ifdef CONFIG_SDMA_VERBOSITY
-       dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
-                  slashstrip(__FILE__), __LINE__, __func__);
-       dd_dev_err(sde->dd, "CONFIG SDMA(%u) source: %u status 0x%llx\n",
-                  sde->this_idx, source, (unsigned long long)status);
-#endif
-       sde->err_cnt++;
-       sdma_engine_error(sde, status);
-
-       /*
-        * Update the counters for the corresponding status bits.
-        * Note that these particular counters are aggregated over
-        * all 16 DMA engines.
-        */
-       for (bit = 0; bit < NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS; bit++) {
-               if (!(status & (1ull << bit)))
-                       continue;
-               incr_cntr64(&dd->sw_send_dma_eng_err_status_cnt[bit]);
-       }
-}
-
-/*
- * CCE block SDMA error interrupt.  Source is < 16.
- *
- * The engine number doubles as the per-context index handed to
- * interrupt_clear_down().  With CONFIG_SDMA_VERBOSITY the engine state is
- * dumped first.
- */
-static void is_sdma_eng_err_int(struct hfi1_devdata *dd, unsigned int source)
-{
-#ifdef CONFIG_SDMA_VERBOSITY
-       struct sdma_engine *sde = &dd->per_sdma[source];
-
-       dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
-                  slashstrip(__FILE__), __LINE__, __func__);
-       dd_dev_err(dd, "CONFIG SDMA(%u) source: %u\n", sde->this_idx,
-                  source);
-       sdma_dumpstate(sde);
-#endif
-
-       interrupt_clear_down(dd, source, &sdma_eng_err);
-}
-
-/*
- * CCE block "various" interrupt.  Source is < 8.
- *
- * @dd: device data
- * @source: interrupt source; used as the index into various_err[]
- *
- * Dispatches to the temperature handler for TCRIT_INT_SOURCE, to
- * interrupt_clear_down() for sources that have a handler registered in
- * various_err[], and logs anything else as unimplemented/reserved.
- */
-static void is_various_int(struct hfi1_devdata *dd, unsigned int source)
-{
-       const struct err_reg_info *eri = &various_err[source];
-
-       /*
-        * TCritInt cannot go through interrupt_clear_down()
-        * because it is not a second tier interrupt. The handler
-        * should be called directly.
-        */
-       if (source == TCRIT_INT_SOURCE)
-               handle_temp_err(dd);
-       else if (eri->handler)
-               interrupt_clear_down(dd, 0, eri);
-       else
-               /* source is unsigned int, so it is printed with %u */
-               dd_dev_info(dd,
-                           "%s: Unimplemented/reserved interrupt %u\n",
-                           __func__, source);
-}
-
-/*
- * Handle a QSFP interrupt: module presence change and/or module interrupt.
- *
- * @dd: device data; the port used is dd->pport
- * @src_ctx: not used in the body (always zero, per the comment below)
- * @reg: status bits; QSFP_HFI0_MODPRST_N and QSFP_HFI0_INT_N are examined
- *
- * On a presence change the cached QSFP state is reset under qsfp_lock and
- * the ModPresent inversion is reprogrammed so the opposite transition
- * (plug-in after removal, removal after plug-in) is detected next.  On a
- * module interrupt, check_interrupt_flags is set.  The QSFP work item is
- * queued at the end only if a module is present.
- */
-static void handle_qsfp_int(struct hfi1_devdata *dd, u32 src_ctx, u64 reg)
-{
-       /* src_ctx is always zero */
-       struct hfi1_pportdata *ppd = dd->pport;
-       unsigned long flags;
-       u64 qsfp_int_mgmt = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N);
-
-       if (reg & QSFP_HFI0_MODPRST_N) {
-               if (!qsfp_mod_present(ppd)) {
-                       dd_dev_info(dd, "%s: QSFP module removed\n",
-                                   __func__);
-
-                       ppd->driver_link_ready = 0;
-                       /*
-                        * Cable removed, reset all our information about the
-                        * cache and cable capabilities
-                        */
-
-                       spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
-                       /*
-                        * We don't set cache_refresh_required here as we expect
-                        * an interrupt when a cable is inserted
-                        */
-                       ppd->qsfp_info.cache_valid = 0;
-                       ppd->qsfp_info.reset_needed = 0;
-                       ppd->qsfp_info.limiting_active = 0;
-                       spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
-                                              flags);
-                       /* Invert the ModPresent pin now to detect plug-in */
-                       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_INVERT :
-                                 ASIC_QSFP1_INVERT, qsfp_int_mgmt);
-
-                       /*
-                        * Record "media not installed" as the offline reason
-                        * when no reason is set yet, or when the current
-                        * reason compares greater than it.
-                        */
-                       if ((ppd->offline_disabled_reason >
-                         HFI1_ODR_MASK(
-                         OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED)) ||
-                         (ppd->offline_disabled_reason ==
-                         HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE)))
-                               ppd->offline_disabled_reason =
-                               HFI1_ODR_MASK(
-                               OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED);
-
-                       if (ppd->host_link_state == HLS_DN_POLL) {
-                               /*
-                                * The link is still in POLL. This means
-                                * that the normal link down processing
-                                * will not happen. We have to do it here
-                                * before turning the DC off.
-                                */
-                               queue_work(ppd->hfi1_wq, &ppd->link_down_work);
-                       }
-               } else {
-                       dd_dev_info(dd, "%s: QSFP module inserted\n",
-                                   __func__);
-
-                       /* the cache must be re-read for the new module */
-                       spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
-                       ppd->qsfp_info.cache_valid = 0;
-                       ppd->qsfp_info.cache_refresh_required = 1;
-                       spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
-                                              flags);
-
-                       /*
-                        * Stop inversion of ModPresent pin to detect
-                        * removal of the cable
-                        */
-                       qsfp_int_mgmt &= ~(u64)QSFP_HFI0_MODPRST_N;
-                       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_INVERT :
-                                 ASIC_QSFP1_INVERT, qsfp_int_mgmt);
-
-                       ppd->offline_disabled_reason =
-                               HFI1_ODR_MASK(OPA_LINKDOWN_REASON_TRANSIENT);
-               }
-       }
-
-       if (reg & QSFP_HFI0_INT_N) {
-               dd_dev_info(dd, "%s: Interrupt received from QSFP module\n",
-                           __func__);
-               spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
-               ppd->qsfp_info.check_interrupt_flags = 1;
-               spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags);
-       }
-
-       /* Schedule the QSFP work only if there is a cable attached. */
-       if (qsfp_mod_present(ppd))
-               queue_work(ppd->hfi1_wq, &ppd->qsfp_info.qsfp_work);
-}
-
-/*
- * Issue the HCMD_MISC / HCMD_MISC_REQUEST_LCB_ACCESS command to the 8051.
- *
- * Returns 0 when the command reports HCMD_SUCCESS, -EBUSY otherwise (the
- * raw command result is logged in that case).
- */
-static int request_host_lcb_access(struct hfi1_devdata *dd)
-{
-       int rc;
-
-       rc = do_8051_command(dd, HCMD_MISC,
-                            (u64)HCMD_MISC_REQUEST_LCB_ACCESS <<
-                            LOAD_DATA_FIELD_ID_SHIFT, NULL);
-       if (rc == HCMD_SUCCESS)
-               return 0;
-
-       dd_dev_err(dd, "%s: command failed with error %d\n", __func__, rc);
-       return -EBUSY;
-}
-
-/*
- * Issue the HCMD_MISC / HCMD_MISC_GRANT_LCB_ACCESS command to the 8051.
- *
- * Returns 0 when the command reports HCMD_SUCCESS, -EBUSY otherwise (the
- * raw command result is logged in that case).
- */
-static int request_8051_lcb_access(struct hfi1_devdata *dd)
-{
-       int rc;
-
-       rc = do_8051_command(dd, HCMD_MISC,
-                            (u64)HCMD_MISC_GRANT_LCB_ACCESS <<
-                            LOAD_DATA_FIELD_ID_SHIFT, NULL);
-       if (rc == HCMD_SUCCESS)
-               return 0;
-
-       dd_dev_err(dd, "%s: command failed with error %d\n", __func__, rc);
-       return -EBUSY;
-}
-
-/*
- * Allow host access to the LCB by setting the LCB selector.  The DCC
- * selector bit is written as set as well, since the DCC selector always
- * points to the host.
- */
-static inline void set_host_lcb_access(struct hfi1_devdata *dd)
-{
-       const u64 sel = DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK |
-                       DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK;
-
-       write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL, sel);
-}
-
-/*
- * Clear the LCB selector - allow 8051 access.  The DCC selector always
- * points to the host.
- *
- * Only the DCC select bit is written, so the LCB select bit ends up clear.
- */
-static inline void set_8051_lcb_access(struct hfi1_devdata *dd)
-{
-       write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL,
-                 DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK);
-}
-
-/*
- * Take a reference on host LCB access.
- *
- * The first reference asks the 8051 to hand over access and switches the
- * LCB selector to the host; later references only bump the use count.
- *
- * @dd: device data
- * @sleep_ok: nonzero if the caller may sleep on the link state mutex;
- *            otherwise the mutex is polled with mutex_trylock()/udelay()
- *
- * Returns:
- *     0 on success
- *     -EBUSY if the link is not up (the 8051 has control and cannot be
- *            disturbed)
- *     -errno if unable to acquire access from the 8051
- */
-int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok)
-{
-       struct hfi1_pportdata *port = dd->pport;
-       int rc;
-
-       /*
-        * The host link state lock makes { link state check, selector
-        * change, count increment } atomic with respect to a link state
-        * change; without it the state change races the count increment.
-        */
-       if (!sleep_ok) {
-               while (!mutex_trylock(&port->hls_lock))
-                       udelay(1);
-       } else {
-               mutex_lock(&port->hls_lock);
-       }
-
-       if (!(port->host_link_state & HLS_UP)) {
-               /* this access is valid only when the link is up */
-               dd_dev_info(dd, "%s: link state %s not up\n",
-                           __func__, link_state_name(port->host_link_state));
-               rc = -EBUSY;
-       } else if (dd->lcb_access_count != 0) {
-               /* host already owns the LCB; just count the new user */
-               dd->lcb_access_count++;
-               rc = 0;
-       } else {
-               rc = request_host_lcb_access(dd);
-               if (rc) {
-                       dd_dev_err(dd,
-                                  "%s: unable to acquire LCB access, err %d\n",
-                                  __func__, rc);
-               } else {
-                       set_host_lcb_access(dd);
-                       dd->lcb_access_count++;
-               }
-       }
-
-       mutex_unlock(&port->hls_lock);
-       return rc;
-}
-
-/*
- * Drop a reference on host LCB access.  When the last reference goes away
- * (count moving from 1 to 0) the LCB selector is pointed back at the 8051
- * and the 8051 is told it has control again.
- *
- * @dd: device data
- * @sleep_ok: nonzero if the caller may sleep on the link state mutex;
- *            otherwise the mutex is polled with mutex_trylock()/udelay()
- *
- * Returns:
- *     0 on success (including the logged "count already zero" case)
- *     -errno if unable to release access to the 8051
- */
-int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok)
-{
-       int rc = 0;
-
-       /*
-        * Same lock as the acquire side; here it keeps
-        * { selector change, count decrement } together.
-        */
-       if (!sleep_ok) {
-               while (!mutex_trylock(&dd->pport->hls_lock))
-                       udelay(1);
-       } else {
-               mutex_lock(&dd->pport->hls_lock);
-       }
-
-       if (dd->lcb_access_count == 0) {
-               dd_dev_err(dd, "%s: LCB access count is zero.  Skipping.\n",
-                          __func__);
-       } else {
-               if (dd->lcb_access_count == 1) {
-                       /* last user: hand the LCB back to the 8051 */
-                       set_8051_lcb_access(dd);
-                       rc = request_8051_lcb_access(dd);
-                       if (rc) {
-                               dd_dev_err(dd,
-                                          "%s: unable to release LCB access, err %d\n",
-                                          __func__, rc);
-                               /* restore host access if the grant didn't work */
-                               set_host_lcb_access(dd);
-                       }
-               }
-               /* the count is left untouched when the hand-back failed */
-               if (!rc)
-                       dd->lcb_access_count--;
-       }
-
-       mutex_unlock(&dd->pport->hls_lock);
-       return rc;
-}
-
-/*
- * Initialize LCB access variables and state.  Called during driver load,
- * after most of the initialization is finished.
- *
- * The DC default is LCB access on for the host.  The driver defaults to
- * leaving access to the 8051.  Assign access now - this constrains the call
- * to this routine to be after all LCB set-up is done.  In particular, after
- * hf1_init_dd() -> set_up_interrupts() -> clear_all_interrupts()
- *
- * NOTE(review): the body only zeroes the host use count; it does not write
- * the access selector itself.  "hf1_init_dd" above is presumably
- * hfi1_init_dd - confirm.
- */
-static void init_lcb_access(struct hfi1_devdata *dd)
-{
-       dd->lcb_access_count = 0;
-}
-
-/*
- * Answer a request from the 8051.
- *
- * @dd: device data
- * @return_code: value placed in the RETURN_CODE field
- * @rsp_data: value placed in the RSP_DATA field
- *
- * Both fields are written to DC_DC8051_CFG_EXT_DEV_0 together with the
- * COMPLETED bit.
- */
-static void hreq_response(struct hfi1_devdata *dd, u8 return_code, u16 rsp_data)
-{
-       u64 rsp = DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK;
-
-       rsp |= (u64)return_code << DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT;
-       rsp |= (u64)rsp_data << DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT;
-
-       write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, rsp);
-}
-
-/*
- * Service a pending host request from the 8051, if there is one.
- *
- * Reads DC_DC8051_CFG_EXT_DEV_1; when its REQ_NEW bit is set, the request
- * type and data are decoded and a response is written back through
- * hreq_response().  HREQ_CONFIG_DONE and HREQ_INTERFACE_TEST are answered
- * with HREQ_SUCCESS (the latter echoing the request data); every other
- * type is answered with HREQ_NOT_SUPPORTED.
- */
-static void handle_8051_request(struct hfi1_pportdata *ppd)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       u64 ext_dev_1 = read_csr(dd, DC_DC8051_CFG_EXT_DEV_1);
-       u16 req_data;
-       u8 req_type;
-
-       /* nothing to do unless a new request is flagged */
-       if (!(ext_dev_1 & DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK))
-               return;
-
-       /* zero out COMPLETED so the response is seen */
-       write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, 0);
-
-       req_type = (ext_dev_1 >> DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_SHIFT)
-                       & DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_MASK;
-       req_data = (ext_dev_1 >> DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT)
-                       & DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_MASK;
-
-       switch (req_type) {
-       case HREQ_CONFIG_DONE:
-               hreq_response(dd, HREQ_SUCCESS, 0);
-               break;
-       case HREQ_INTERFACE_TEST:
-               /* echo the request data back */
-               hreq_response(dd, HREQ_SUCCESS, req_data);
-               break;
-       case HREQ_LOAD_CONFIG:
-       case HREQ_SAVE_CONFIG:
-       case HREQ_READ_CONFIG:
-       case HREQ_SET_TX_EQ_ABS:
-       case HREQ_SET_TX_EQ_REL:
-       case HREQ_ENABLE:
-               /* known request types that are not implemented */
-               dd_dev_info(dd, "8051 request: request 0x%x not supported\n",
-                           req_type);
-               hreq_response(dd, HREQ_NOT_SUPPORTED, 0);
-               break;
-       default:
-               dd_dev_err(dd, "8051 request: unknown request 0x%x\n",
-                          req_type);
-               hreq_response(dd, HREQ_NOT_SUPPORTED, 0);
-               break;
-       }
-}
-
-/*
- * Program SEND_CM_GLOBAL_CREDIT.
- *
- * @dd: device data
- * @vau: value for the AU field
- * @total: value for the TOTAL_CREDIT_LIMIT field
- * @shared: value for the SHARED_LIMIT field
- */
-static void write_global_credit(struct hfi1_devdata *dd,
-                               u8 vau, u16 total, u16 shared)
-{
-       u64 credit;
-
-       credit = (u64)total << SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT;
-       credit |= (u64)shared << SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT;
-       credit |= (u64)vau << SEND_CM_GLOBAL_CREDIT_AU_SHIFT;
-
-       write_csr(dd, SEND_CM_GLOBAL_CREDIT, credit);
-}
-
-/*
- * Set up initial VL15 credits of the remote.  Assumes the rest of
- * the CM credit registers are zero from a previous global or credit reset.
- *
- * @dd: device data
- * @vau: AU value passed to the global credit register
- * @vl15buf: number of VL15 credits
- */
-void set_up_vl15(struct hfi1_devdata *dd, u8 vau, u16 vl15buf)
-{
-       u64 half;
-
-       /* leave shared count at zero for both global and VL15 */
-       write_global_credit(dd, vau, vl15buf, 0);
-
-       if (likely(dd->hfi1_snoop.mode_flag != HFI1_PORT_SNOOP_MODE)) {
-               /* normal case: VL15 gets all of the credits */
-               write_csr(dd, SEND_CM_CREDIT_VL15, (u64)vl15buf
-                       << SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT);
-               return;
-       }
-
-       /*
-        * We may need some credits for another VL when sending packets
-        * with the snoop interface. Dividing it down the middle for VL15
-        * and VL0 should suffice.
-        */
-       half = vl15buf >> 1;
-       write_csr(dd, SEND_CM_CREDIT_VL15,
-                 half << SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT);
-       write_csr(dd, SEND_CM_CREDIT_VL,
-                 half << SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT);
-}
-
-/*
- * Wipe every credit setting left over from the previous connection and
- * reset the CM manager's internal counters.
- */
-void reset_link_credits(struct hfi1_devdata *dd)
-{
-       int vl;
-
-       /* per-VL limits first; the registers are 8 bytes apart */
-       for (vl = 0; vl < TXE_NUM_DATA_VL; vl++)
-               write_csr(dd, SEND_CM_CREDIT_VL + (8 * vl), 0);
-
-       /* then VL15 and the global credit register */
-       write_csr(dd, SEND_CM_CREDIT_VL15, 0);
-       write_global_credit(dd, 0, 0, 0);
-
-       /* finally reset the CM block itself */
-       pio_send_control(dd, PSC_CM_RESET);
-}
-
-/*
- * convert a vCU to a CU: CU = 2^vCU
- *
- * The shift is done on an unsigned constant so the arithmetic matches the
- * u32 return type and vcu == 31 does not overflow a signed int.
- */
-static u32 vcu_to_cu(u8 vcu)
-{
-       return 1u << vcu;
-}
-
-/*
- * convert a CU to a vCU: the result of ilog2(cu), narrowed to u8
- *
- * NOTE(review): the result for cu == 0 depends on ilog2(); presumably
- * callers never pass 0 - confirm.
- */
-static u8 cu_to_vcu(u32 cu)
-{
-       return ilog2(cu);
-}
-
-/*
- * convert a vAU to an AU: AU = 8 * 2^vAU
- *
- * Unsigned constants keep the arithmetic in the type of the u32 return
- * value, so a large vau cannot overflow a signed int.
- */
-static u32 vau_to_au(u8 vau)
-{
-       return 8u * (1u << vau);
-}
-
-/*
- * (Re)set the per-port values that take a default on link up:
- * sa_qp becomes 1 and sm_trap_qp becomes 0.
- */
-static void set_linkup_defaults(struct hfi1_pportdata *ppd)
-{
-       ppd->sa_qp = 0x1;
-       ppd->sm_trap_qp = 0x0;
-}
-
-/*
- * Graceful LCB shutdown.  This leaves the LCB FIFOs in reset.
- *
- * @dd: device data
- * @abort: when zero, the DCC reset bits are released again after a short
- *         hold and DC_LCB_ERR_EN is restored; when nonzero, the reset bits
- *         stay set and DC_LCB_ERR_EN is left as is (its previous value
- *         remains saved in dd->lcb_err_en).
- */
-static void lcb_shutdown(struct hfi1_devdata *dd, int abort)
-{
-       u64 reg;
-
-       /* clear lcb run: LCB_CFG_RUN.EN = 0 */
-       write_csr(dd, DC_LCB_CFG_RUN, 0);
-       /* set tx fifo reset: LCB_CFG_TX_FIFOS_RESET.VAL = 1 */
-       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET,
-                 1ull << DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT);
-       /* set dcc reset csr: DCC_CFG_RESET.{reset_lcb,reset_rx_fpe} = 1 */
-       /* save the LCB error enables first so they can be restored later */
-       dd->lcb_err_en = read_csr(dd, DC_LCB_ERR_EN);
-       reg = read_csr(dd, DCC_CFG_RESET);
-       write_csr(dd, DCC_CFG_RESET, reg |
-                 (1ull << DCC_CFG_RESET_RESET_LCB_SHIFT) |
-                 (1ull << DCC_CFG_RESET_RESET_RX_FPE_SHIFT));
-       (void)read_csr(dd, DCC_CFG_RESET); /* make sure the write completed */
-       if (!abort) {
-               udelay(1);    /* must hold for the longer of 16cclks or 20ns */
-               /* put back the pre-reset DCC_CFG_RESET value and error enables */
-               write_csr(dd, DCC_CFG_RESET, reg);
-               write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en);
-       }
-}
-
-/*
- * Shut the DC down: LCB first, then the 8051 itself.
- *
- * This routine should be called after the link has been transitioned to
- * OFFLINE (OFFLINE state has the side effect of putting the SerDes into
- * reset).  The caller is expected to have taken care of transitioning the
- * link into the correct state.
- *
- * Does nothing if dd->dc_shutdown is already set; the flag is tested and
- * set under dd->dc8051_lock.
- */
-static void dc_shutdown(struct hfi1_devdata *dd)
-{
-       unsigned long irq_flags;
-       int was_shutdown;
-
-       spin_lock_irqsave(&dd->dc8051_lock, irq_flags);
-       was_shutdown = dd->dc_shutdown;
-       if (!was_shutdown)
-               dd->dc_shutdown = 1;
-       spin_unlock_irqrestore(&dd->dc8051_lock, irq_flags);
-
-       if (was_shutdown)
-               return;
-
-       /* Shutdown the LCB */
-       lcb_shutdown(dd, 1);
-       /*
-        * Going to OFFLINE would have caused the 8051 to put the
-        * SerDes into reset already. Just need to shut down the 8051,
-        * itself.
-        */
-       write_csr(dd, DC_DC8051_CFG_RST, 0x1);
-}
-
-/*
- * Bring the DC back up after dc_shutdown(): take the 8051 out of reset,
- * wait for its firmware, release the LCB/RX FPE reset and restore the LCB
- * error enables.
- *
- * Calling this after the DC has been brought out of reset should not
- * do any damage: it returns immediately when dd->dc_shutdown is clear.
- *
- * NOTE(review): dd->dc8051_lock is dropped between testing dc_shutdown and
- * clearing it, so two concurrent callers could presumably both run the
- * start sequence - confirm whether callers are serialized elsewhere.
- */
-static void dc_start(struct hfi1_devdata *dd)
-{
-       unsigned long flags;
-       int ret;
-
-       spin_lock_irqsave(&dd->dc8051_lock, flags);
-       if (!dd->dc_shutdown)
-               goto done;
-       spin_unlock_irqrestore(&dd->dc8051_lock, flags);
-       /* Take the 8051 out of reset */
-       write_csr(dd, DC_DC8051_CFG_RST, 0ull);
-       /* Wait until 8051 is ready */
-       ret = wait_fm_ready(dd, TIMEOUT_8051_START);
-       if (ret) {
-               /* a timeout is only logged; the start sequence continues */
-               dd_dev_err(dd, "%s: timeout starting 8051 firmware\n",
-                          __func__);
-       }
-       /* Take away reset for LCB and RX FPE (set in lcb_shutdown). */
-       write_csr(dd, DCC_CFG_RESET, 0x10);
-       /* lcb_shutdown() with abort=1 does not restore these */
-       write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en);
-       spin_lock_irqsave(&dd->dc8051_lock, flags);
-       dd->dc_shutdown = 0;
-done:
-       /* reached with dc8051_lock held on both paths */
-       spin_unlock_irqrestore(&dd->dc8051_lock, flags);
-}
-
-/*
- * These LCB adjustments are for the Aurora SerDes core in the FPGA.
- *
- * Nothing is done unless dd->icode is ICODE_FPGA_EMULATION and the
- * emulator is not an "_s" type.  Otherwise the RX/TX FIFO read-address
- * settings are chosen by emulator release and written, and
- * LCB_CFG_IGNORE_LOST_RCLK.EN is set.
- */
-static void adjust_lcb_for_fpga_serdes(struct hfi1_devdata *dd)
-{
-       u64 rx_radr, tx_radr;
-       u32 version;
-
-       if (dd->icode != ICODE_FPGA_EMULATION)
-               return;
-
-       /*
-        * These LCB defaults on emulator _s are good, nothing to do here:
-        *      LCB_CFG_TX_FIFOS_RADR
-        *      LCB_CFG_RX_FIFOS_RADR
-        *      LCB_CFG_LN_DCLK
-        *      LCB_CFG_IGNORE_LOST_RCLK
-        */
-       if (is_emulator_s(dd))
-               return;
-       /* else this is _p */
-
-       version = emulator_rev(dd);
-       if (!is_ax(dd))
-               version = 0x2d; /* all B0 use 0x2d or higher settings */
-
-       if (version <= 0x12) {
-               /* release 0x12 and below */
-
-               /*
-                * LCB_CFG_RX_FIFOS_RADR.RST_VAL = 0x9
-                * LCB_CFG_RX_FIFOS_RADR.OK_TO_JUMP_VAL = 0x9
-                * LCB_CFG_RX_FIFOS_RADR.DO_NOT_JUMP_VAL = 0xa
-                */
-               rx_radr =
-                     0xaull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
-                   | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
-                   | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
-               /*
-                * LCB_CFG_TX_FIFOS_RADR.ON_REINIT = 0 (default)
-                * LCB_CFG_TX_FIFOS_RADR.RST_VAL = 6
-                */
-               tx_radr = 6ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
-       } else if (version <= 0x18) {
-               /* release 0x13 up to 0x18 */
-               /* LCB_CFG_RX_FIFOS_RADR = 0x988 */
-               rx_radr =
-                     0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
-                   | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
-                   | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
-               tx_radr = 7ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
-       } else if (version == 0x19) {
-               /* release 0x19 */
-               /* LCB_CFG_RX_FIFOS_RADR = 0xa99 */
-               rx_radr =
-                     0xAull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
-                   | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
-                   | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
-               tx_radr = 3ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
-       } else if (version == 0x1a) {
-               /* release 0x1a */
-               /* LCB_CFG_RX_FIFOS_RADR = 0x988 */
-               rx_radr =
-                     0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
-                   | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
-                   | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
-               tx_radr = 7ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
-               /* only this release also writes LCB_CFG_LN_DCLK */
-               write_csr(dd, DC_LCB_CFG_LN_DCLK, 1ull);
-       } else {
-               /* release 0x1b and higher */
-               /* LCB_CFG_RX_FIFOS_RADR = 0x877 */
-               rx_radr =
-                     0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
-                   | 0x7ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
-                   | 0x7ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
-               tx_radr = 3ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
-       }
-
-       /* apply the values selected above */
-       write_csr(dd, DC_LCB_CFG_RX_FIFOS_RADR, rx_radr);
-       /* LCB_CFG_IGNORE_LOST_RCLK.EN = 1 */
-       write_csr(dd, DC_LCB_CFG_IGNORE_LOST_RCLK,
-                 DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK);
-       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RADR, tx_radr);
-}
-
-/*
- * Process an SMA idle message.
- *
- * Work-queue function (runs outside of the interrupt); @work is the
- * sma_message_work member of the port's hfi1_pportdata.
- */
-void handle_sma_message(struct work_struct *work)
-{
-       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
-                                                 sma_message_work);
-       struct hfi1_devdata *dd = ppd->dd;
-       u64 idle_msg;
-
-       /*
-        * idle_msg is bytes 1-4 of the 40-bit idle message - the command
-        * code is stripped off.  Nothing to do if it cannot be read.
-        */
-       if (read_idle_sma(dd, &idle_msg))
-               return;
-       dd_dev_info(dd, "%s: SMA message 0x%llx\n", __func__, idle_msg);
-
-       /* Byte[1] (0 for us) is the command. */
-       switch (idle_msg & 0xff) {
-       case SMA_IDLE_ARM:
-               /*
-                * See OPAv1 table 9-14 - HFI and External Switch Ports Key
-                * State Transitions
-                *
-                * Only expected in INIT or ARMED, discard otherwise.
-                */
-               if (ppd->host_link_state & (HLS_UP_INIT | HLS_UP_ARMED))
-                       ppd->neighbor_normal = 1;
-               break;
-       case SMA_IDLE_ACTIVE:
-               /*
-                * See OPAv1 table 9-14 - HFI and External Switch Ports Key
-                * State Transitions
-                *
-                * Can activate the node.  Discard otherwise.
-                */
-               if (ppd->host_link_state != HLS_UP_ARMED ||
-                   !ppd->is_active_optimize_enabled)
-                       break;
-
-               ppd->neighbor_normal = 1;
-               if (set_link_state(ppd, HLS_UP_ACTIVE))
-                       dd_dev_err(
-                               dd,
-                               "%s: received Active SMA idle message, couldn't set link to Active\n",
-                               __func__);
-               break;
-       default:
-               dd_dev_err(dd,
-                          "%s: received unexpected SMA idle message 0x%llx\n",
-                          __func__, idle_msg);
-               break;
-       }
-}
-
-/*
- * Read-modify-write RCV_CTRL under dd->rcvctrl_lock.
- *
- * @dd: device data
- * @add: bits to set
- * @clear: bits to clear; applied after @add, so a bit present in both
- *         ends up clear
- */
-static void adjust_rcvctrl(struct hfi1_devdata *dd, u64 add, u64 clear)
-{
-       unsigned long irq_flags;
-       u64 val;
-
-       spin_lock_irqsave(&dd->rcvctrl_lock, irq_flags);
-       val = (read_csr(dd, RCV_CTRL) | add) & ~clear;
-       write_csr(dd, RCV_CTRL, val);
-       spin_unlock_irqrestore(&dd->rcvctrl_lock, irq_flags);
-}
-
-/* Set the bits in @add in RCV_CTRL; no bits are cleared. */
-static inline void add_rcvctrl(struct hfi1_devdata *dd, u64 add)
-{
-       adjust_rcvctrl(dd, add, 0);
-}
-
-/* Clear the bits in @clear in RCV_CTRL; no bits are set. */
-static inline void clear_rcvctrl(struct hfi1_devdata *dd, u64 clear)
-{
-       adjust_rcvctrl(dd, 0, clear);
-}
-
-/*
- * First stage of SPC freeze handling; called from all interrupt handlers.
- *
- * @ppd: port data
- * @flags: FREEZE_SELF      - also write the SPC freeze bit to CCE_CTRL
- *         FREEZE_LINK_DOWN - passed on (as 0/1) to sdma_freeze_notify()
- *         FREEZE_ABORT     - do not queue the recovery work; log instead
- */
-void start_freeze_handling(struct hfi1_pportdata *ppd, int flags)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       int idx;
-
-       if (flags & FREEZE_SELF)
-               write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK);
-
-       /* enter frozen mode */
-       dd->flags |= HFI1_FROZEN;
-
-       /* notify all SDMA engines that they are going into a freeze */
-       sdma_freeze_notify(dd, !!(flags & FREEZE_LINK_DOWN));
-
-       /* do halt pre-handling on all enabled send contexts */
-       for (idx = 0; idx < dd->num_send_contexts; idx++) {
-               struct send_context *sc = dd->send_contexts[idx].sc;
-
-               if (!sc || !(sc->flags & SCF_ENABLED))
-                       continue;
-               sc_stop(sc, SCF_FROZEN | SCF_HALTED);
-       }
-
-       /* Send context are frozen. Notify user space */
-       hfi1_set_uevent_bits(ppd, _HFI1_EVENT_FROZEN_BIT);
-
-       if (!(flags & FREEZE_ABORT)) {
-               /* queue non-interrupt handler */
-               queue_work(ppd->hfi1_wq, &ppd->freeze_work);
-               return;
-       }
-
-       dd_dev_err(dd,
-                  "Aborted freeze recovery. Please REBOOT system\n");
-}
-
-/*
- * Wait until all 4 sub-blocks indicate that they have frozen or unfrozen,
- * depending on the "freeze" parameter.
- *
- * @dd: device data
- * @freeze: nonzero to wait for all ALL_FROZE bits in CCE_STATUS to be set,
- *          zero to wait for all of them to be clear
- *
- * Polls CCE_STATUS, sleeping between reads, for at most
- * FREEZE_STATUS_TIMEOUT milliseconds.  May sleep.
- *
- * No need to return an error if it times out, our only option
- * is to proceed anyway.
- */
-static void wait_for_freeze_status(struct hfi1_devdata *dd, int freeze)
-{
-       unsigned long timeout;
-       u64 reg;
-
-       timeout = jiffies + msecs_to_jiffies(FREEZE_STATUS_TIMEOUT);
-       while (1) {
-               reg = read_csr(dd, CCE_STATUS);
-               if (freeze) {
-                       /* waiting until all indicators are set */
-                       if ((reg & ALL_FROZE) == ALL_FROZE)
-                               return; /* all done */
-               } else {
-                       /* waiting until all indicators are clear */
-                       if ((reg & ALL_FROZE) == 0)
-                               return; /* all done */
-               }
-
-               if (time_after(jiffies, timeout)) {
-                       /* the message is newline-terminated like all others */
-                       dd_dev_err(dd,
-                                  "Time out waiting for SPC %sfreeze, bits 0x%llx, expecting 0x%llx, continuing\n",
-                                  freeze ? "" : "un", reg & ALL_FROZE,
-                                  freeze ? ALL_FROZE : 0ull);
-                       return;
-               }
-               usleep_range(80, 120);
-       }
-}
-
-/*
- * RXE block freeze handling: turn off the receive port, then disable each
- * of the dd->num_rcv_contexts receive contexts.
- */
-static void rxe_freeze(struct hfi1_devdata *dd)
-{
-       int ctxt;
-
-       /* port off first */
-       clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
-
-       /* then every receive context */
-       for (ctxt = 0; ctxt < dd->num_rcv_contexts; ctxt++)
-               hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS, ctxt);
-}
-
-/*
- * RXE block unfreeze handling for kernel contexts only.  Re-enables the
- * first dd->n_krcv_queues receive contexts and then the receive port.
- * User contexts will do unfreeze handling on a per-context basis as they
- * call into the driver.
- */
-static void rxe_kernel_unfreeze(struct hfi1_devdata *dd)
-{
-       int ctxt;
-
-       for (ctxt = 0; ctxt < dd->n_krcv_queues; ctxt++) {
-               u32 op = HFI1_RCVCTRL_CTXT_ENB;
-
-               /* HFI1_RCVCTRL_TAILUPD_[ENB|DIS] needs to be set explicitly */
-               if (HFI1_CAP_KGET_MASK(dd->rcd[ctxt]->flags, DMA_RTAIL))
-                       op |= HFI1_RCVCTRL_TAILUPD_ENB;
-               else
-                       op |= HFI1_RCVCTRL_TAILUPD_DIS;
-
-               hfi1_rcvctrl(dd, op, ctxt);
-       }
-
-       /* contexts are back; now enable the port */
-       add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
-}
-
-/*
- * Non-interrupt SPC freeze handling.
- *
- * This is a work-queue function outside of the triggering interrupt;
- * @work is the freeze_work member of the port's hfi1_pportdata.
- *
- * Sequence: wait for the hardware to report frozen, run the per-block
- * freeze steps (PIO, SDMA, RXE), unfreeze the hardware, run the per-block
- * unfreeze steps for kernel contexts, then clear HFI1_FROZEN and wake
- * waiters on dd->event_queue.
- */
-void handle_freeze(struct work_struct *work)
-{
-       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
-                                                               freeze_work);
-       struct hfi1_devdata *dd = ppd->dd;
-
-       /* wait for freeze indicators on all affected blocks */
-       wait_for_freeze_status(dd, 1);
-
-       /* SPC is now frozen */
-
-       /* do send PIO freeze steps */
-       pio_freeze(dd);
-
-       /* do send DMA freeze steps */
-       sdma_freeze(dd);
-
-       /* do send egress freeze steps - nothing to do */
-
-       /* do receive freeze steps */
-       rxe_freeze(dd);
-
-       /*
-        * Unfreeze the hardware - clear the freeze, wait for each
-        * block's frozen bit to clear, then clear the frozen flag.
-        */
-       write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK);
-       wait_for_freeze_status(dd, 0);
-
-       /* when is_ax() is true, run one extra freeze/unfreeze cycle */
-       if (is_ax(dd)) {
-               write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK);
-               wait_for_freeze_status(dd, 1);
-               write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK);
-               wait_for_freeze_status(dd, 0);
-       }
-
-       /* do send PIO unfreeze steps for kernel contexts */
-       pio_kernel_unfreeze(dd);
-
-       /* do send DMA unfreeze steps */
-       sdma_unfreeze(dd);
-
-       /* do send egress unfreeze steps - nothing to do */
-
-       /* do receive unfreeze steps for kernel contexts */
-       rxe_kernel_unfreeze(dd);
-
-       /*
-        * The unfreeze procedure touches global device registers when
-        * it disables and re-enables RXE. Mark the device unfrozen
-        * after all that is done so other parts of the driver waiting
-        * for the device to unfreeze don't do things out of order.
-        *
-        * The above implies that the meaning of HFI1_FROZEN flag is
-        * "Device has gone into freeze mode and freeze mode handling
-        * is still in progress."
-        *
-        * The flag will be removed when freeze mode processing has
-        * completed.
-        */
-       dd->flags &= ~HFI1_FROZEN;
-       wake_up(&dd->event_queue);
-
-       /* no longer frozen */
-}
-
-/*
- * Process a link up interrupt from the 8051.
- *
- * Work-queue function (runs outside of the interrupt); @work is the
- * link_up_work member of the port's hfi1_pportdata.
- */
-void handle_link_up(struct work_struct *work)
-{
-       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
-                                                 link_up_work);
-       struct hfi1_devdata *dd = ppd->dd;
-
-       set_link_state(ppd, HLS_UP_INIT);
-
-       /* cache the read of DC_LCB_STS_ROUND_TRIP_LTP_CNT */
-       read_ltp_rtt(dd);
-
-       /*
-        * OPA specifies that certain counters are cleared on a transition
-        * to link up, so do that.
-        */
-       clear_linkup_counters(dd);
-
-       /* And (re)set link up default values. */
-       set_linkup_defaults(ppd);
-
-       /* enforce link speed enabled: done if the active speed is allowed */
-       if (ppd->link_speed_active & ppd->link_speed_enabled)
-               return;
-
-       /* oops - current speed is not enabled, bounce */
-       dd_dev_err(dd,
-                  "Link speed active 0x%x is outside enabled 0x%x, downing link\n",
-                  ppd->link_speed_active, ppd->link_speed_enabled);
-       set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SPEED_POLICY, 0,
-                            OPA_LINKDOWN_REASON_SPEED_POLICY);
-       set_link_state(ppd, HLS_DN_OFFLINE);
-       tune_serdes(ppd);
-       start_link(ppd);
-}
-
-/*
- * Several pieces of LNI information were cached for SMA in ppd.
- * Reset these on link down
- */
-static void reset_neighbor_info(struct hfi1_pportdata *ppd)
-{
-       /* zero every cached neighbor field; the stores are independent */
-       ppd->neighbor_fm_security = 0;
-       ppd->neighbor_type = 0;
-       ppd->neighbor_port_number = 0;
-       ppd->neighbor_guid = 0;
-}
-
-/*
- * Human-readable names for the OPA link down reason codes, indexed by
- * the OPA_LINKDOWN_REASON_* value.  Codes without an entry are left as
- * NULL by the designated initializers; link_down_reason_str() maps those
- * to "(invalid)".
- */
-static const char * const link_down_reason_strs[] = {
-       [OPA_LINKDOWN_REASON_NONE] = "None",
-       [OPA_LINKDOWN_REASON_RCV_ERROR_0] = "Receive error 0",
-       [OPA_LINKDOWN_REASON_BAD_PKT_LEN] = "Bad packet length",
-       [OPA_LINKDOWN_REASON_PKT_TOO_LONG] = "Packet too long",
-       [OPA_LINKDOWN_REASON_PKT_TOO_SHORT] = "Packet too short",
-       [OPA_LINKDOWN_REASON_BAD_SLID] = "Bad SLID",
-       [OPA_LINKDOWN_REASON_BAD_DLID] = "Bad DLID",
-       [OPA_LINKDOWN_REASON_BAD_L2] = "Bad L2",
-       [OPA_LINKDOWN_REASON_BAD_SC] = "Bad SC",
-       [OPA_LINKDOWN_REASON_RCV_ERROR_8] = "Receive error 8",
-       [OPA_LINKDOWN_REASON_BAD_MID_TAIL] = "Bad mid tail",
-       [OPA_LINKDOWN_REASON_RCV_ERROR_10] = "Receive error 10",
-       [OPA_LINKDOWN_REASON_PREEMPT_ERROR] = "Preempt error",
-       [OPA_LINKDOWN_REASON_PREEMPT_VL15] = "Preempt vl15",
-       [OPA_LINKDOWN_REASON_BAD_VL_MARKER] = "Bad VL marker",
-       [OPA_LINKDOWN_REASON_RCV_ERROR_14] = "Receive error 14",
-       [OPA_LINKDOWN_REASON_RCV_ERROR_15] = "Receive error 15",
-       [OPA_LINKDOWN_REASON_BAD_HEAD_DIST] = "Bad head distance",
-       [OPA_LINKDOWN_REASON_BAD_TAIL_DIST] = "Bad tail distance",
-       [OPA_LINKDOWN_REASON_BAD_CTRL_DIST] = "Bad control distance",
-       [OPA_LINKDOWN_REASON_BAD_CREDIT_ACK] = "Bad credit ack",
-       [OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER] = "Unsupported VL marker",
-       [OPA_LINKDOWN_REASON_BAD_PREEMPT] = "Bad preempt",
-       [OPA_LINKDOWN_REASON_BAD_CONTROL_FLIT] = "Bad control flit",
-       [OPA_LINKDOWN_REASON_EXCEED_MULTICAST_LIMIT] = "Exceed multicast limit",
-       [OPA_LINKDOWN_REASON_RCV_ERROR_24] = "Receive error 24",
-       [OPA_LINKDOWN_REASON_RCV_ERROR_25] = "Receive error 25",
-       [OPA_LINKDOWN_REASON_RCV_ERROR_26] = "Receive error 26",
-       [OPA_LINKDOWN_REASON_RCV_ERROR_27] = "Receive error 27",
-       [OPA_LINKDOWN_REASON_RCV_ERROR_28] = "Receive error 28",
-       [OPA_LINKDOWN_REASON_RCV_ERROR_29] = "Receive error 29",
-       [OPA_LINKDOWN_REASON_RCV_ERROR_30] = "Receive error 30",
-       [OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN] =
-                                       "Excessive buffer overrun",
-       [OPA_LINKDOWN_REASON_UNKNOWN] = "Unknown",
-       [OPA_LINKDOWN_REASON_REBOOT] = "Reboot",
-       [OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN] = "Neighbor unknown",
-       [OPA_LINKDOWN_REASON_FM_BOUNCE] = "FM bounce",
-       [OPA_LINKDOWN_REASON_SPEED_POLICY] = "Speed policy",
-       [OPA_LINKDOWN_REASON_WIDTH_POLICY] = "Width policy",
-       [OPA_LINKDOWN_REASON_DISCONNECTED] = "Disconnected",
-       [OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED] =
-                                       "Local media not installed",
-       [OPA_LINKDOWN_REASON_NOT_INSTALLED] = "Not installed",
-       [OPA_LINKDOWN_REASON_CHASSIS_CONFIG] = "Chassis config",
-       [OPA_LINKDOWN_REASON_END_TO_END_NOT_INSTALLED] =
-                                       "End to end not installed",
-       [OPA_LINKDOWN_REASON_POWER_POLICY] = "Power policy",
-       [OPA_LINKDOWN_REASON_LINKSPEED_POLICY] = "Link speed policy",
-       [OPA_LINKDOWN_REASON_LINKWIDTH_POLICY] = "Link width policy",
-       [OPA_LINKDOWN_REASON_SWITCH_MGMT] = "Switch management",
-       [OPA_LINKDOWN_REASON_SMA_DISABLED] = "SMA disabled",
-       [OPA_LINKDOWN_REASON_TRANSIENT] = "Transient"
-};
-
-/*
- * Look up the printable name of a neighbor link down reason code.
- * Codes past the end of link_down_reason_strs[], and codes whose table
- * slot is NULL, yield "(invalid)".
- */
-static const char *link_down_reason_str(u8 reason)
-{
-       const char *name;
-
-       if (reason >= ARRAY_SIZE(link_down_reason_strs))
-               return "(invalid)";
-
-       name = link_down_reason_strs[reason];
-       return name ? name : "(invalid)";
-}
-
-/*
- * Handle a link down interrupt from the 8051.
- *
- * This is a work-queue function outside of the interrupt.
- */
-void handle_link_down(struct work_struct *work)
-{
-       u8 lcl_reason, neigh_reason = 0;
-       u8 link_down_reason;
-       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
-                                                 link_down_work);
-       int was_up;
-       /* common prefix for every "reason" message logged below */
-       static const char ldr_str[] = "Link down reason: ";
-
-       /*
-        * A fixed port that drops while polling, verifying capabilities,
-        * or going up gets NOT_INSTALLED recorded as its offline disabled
-        * reason.
-        */
-       if ((ppd->host_link_state &
-            (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) &&
-            ppd->port_type == PORT_TYPE_FIXED)
-               ppd->offline_disabled_reason =
-                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NOT_INSTALLED);
-
-       /* Go offline first, then deal with reading/writing through 8051 */
-       /* sample the "was up" state before set_link_state() is called */
-       was_up = !!(ppd->host_link_state & HLS_UP);
-       set_link_state(ppd, HLS_DN_OFFLINE);
-
-       if (was_up) {
-               lcl_reason = 0;
-               /* link down reason is only valid if the link was up */
-               read_link_down_reason(ppd->dd, &link_down_reason);
-               switch (link_down_reason) {
-               case LDR_LINK_TRANSFER_ACTIVE_LOW:
-                       /* the link went down, no idle message reason */
-                       dd_dev_info(ppd->dd, "%sUnexpected link down\n",
-                                   ldr_str);
-                       break;
-               case LDR_RECEIVED_LINKDOWN_IDLE_MSG:
-                       /*
-                        * The neighbor reason is only valid if an idle message
-                        * was received for it.
-                        */
-                       read_planned_down_reason_code(ppd->dd, &neigh_reason);
-                       dd_dev_info(ppd->dd,
-                                   "%sNeighbor link down message %d, %s\n",
-                                   ldr_str, neigh_reason,
-                                   link_down_reason_str(neigh_reason));
-                       break;
-               case LDR_RECEIVED_HOST_OFFLINE_REQ:
-                       dd_dev_info(ppd->dd,
-                                   "%sHost requested link to go offline\n",
-                                   ldr_str);
-                       break;
-               default:
-                       dd_dev_info(ppd->dd, "%sUnknown reason 0x%x\n",
-                                   ldr_str, link_down_reason);
-                       break;
-               }
-
-               /*
-                * If no reason, assume peer-initiated but missed
-                * LinkGoingDown idle flits.
-                */
-               if (neigh_reason == 0)
-                       lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN;
-       } else {
-               /* went down while polling or going up */
-               lcl_reason = OPA_LINKDOWN_REASON_TRANSIENT;
-       }
-
-       /* record the local and neighbor reasons chosen above */
-       set_link_down_reason(ppd, lcl_reason, neigh_reason, 0);
-
-       /* inform the SMA when the link transitions from up to down */
-       /* the sma copies are only filled in while both are still zero */
-       if (was_up && ppd->local_link_down_reason.sma == 0 &&
-           ppd->neigh_link_down_reason.sma == 0) {
-               ppd->local_link_down_reason.sma =
-                                       ppd->local_link_down_reason.latest;
-               ppd->neigh_link_down_reason.sma =
-                                       ppd->neigh_link_down_reason.latest;
-       }
-
-       /* drop the neighbor information cached for the SMA */
-       reset_neighbor_info(ppd);
-
-       /* disable the port */
-       clear_rcvctrl(ppd->dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
-
-       /*
-        * If there is no cable attached, turn the DC off. Otherwise,
-        * start the link bring up.
-        */
-       if (ppd->port_type == PORT_TYPE_QSFP && !qsfp_mod_present(ppd)) {
-               dc_shutdown(ppd->dd);
-       } else {
-               tune_serdes(ppd);
-               start_link(ppd);
-       }
-}
-
-/*
- * Work-queue handler for a link bounce request: take an up link offline
- * and restart it.  A link that is not up is left alone.
- */
-void handle_link_bounce(struct work_struct *work)
-{
-       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
-                                                       link_bounce_work);
-
-       /* a bounce only applies to a link that is currently up */
-       if (!(ppd->host_link_state & HLS_UP)) {
-               dd_dev_info(ppd->dd, "%s: link not up (%s), nothing to do\n",
-                           __func__, link_state_name(ppd->host_link_state));
-               return;
-       }
-
-       /* go offline, then retune the SerDes and start the link again */
-       set_link_state(ppd, HLS_DN_OFFLINE);
-       tune_serdes(ppd);
-       start_link(ppd);
-}
-
-/*
- * Mask conversion: Capability exchange to Port LTP.  The capability
- * exchange has an implicit 16b CRC that is mandatory.
- */
-static int cap_to_port_ltp(int cap)
-{
-       /* the 16b mode is mandatory, so it is always part of the result */
-       int ltp_mask = PORT_LTP_CRC_MODE_16;
-
-       /* translate each optional capability bit to its Port LTP bit */
-       ltp_mask |= (cap & CAP_CRC_14B) ? PORT_LTP_CRC_MODE_14 : 0;
-       ltp_mask |= (cap & CAP_CRC_48B) ? PORT_LTP_CRC_MODE_48 : 0;
-       ltp_mask |= (cap & CAP_CRC_12B_16B_PER_LANE) ?
-                       PORT_LTP_CRC_MODE_PER_LANE : 0;
-
-       return ltp_mask;
-}
-
-/*
- * Convert an OPA Port LTP mask to capability mask
- */
-int port_ltp_to_cap(int port_ltp)
-{
-       int caps = 0;
-
-       /*
-        * Translate each Port LTP bit to its capability bit.  The 16b
-        * mode has no capability bit and is not translated.
-        */
-       caps |= (port_ltp & PORT_LTP_CRC_MODE_14) ? CAP_CRC_14B : 0;
-       caps |= (port_ltp & PORT_LTP_CRC_MODE_48) ? CAP_CRC_48B : 0;
-       caps |= (port_ltp & PORT_LTP_CRC_MODE_PER_LANE) ?
-                       CAP_CRC_12B_16B_PER_LANE : 0;
-
-       return caps;
-}
-
-/*
- * Convert a single DC LCB CRC mode to an OPA Port LTP mask.
- */
-static int lcb_to_port_ltp(int lcb_crc)
-{
-       /* checks are made in the same order as before: per-lane, 48b, 14b */
-       if (lcb_crc == LCB_CRC_12B_16B_PER_LANE)
-               return PORT_LTP_CRC_MODE_PER_LANE;
-       if (lcb_crc == LCB_CRC_48B)
-               return PORT_LTP_CRC_MODE_48;
-       if (lcb_crc == LCB_CRC_14B)
-               return PORT_LTP_CRC_MODE_14;
-
-       /* every other value is reported as the 16b mode */
-       return PORT_LTP_CRC_MODE_16;
-}
-
-/*
- * Our neighbor has indicated that we are allowed to act as a fabric
- * manager, so place the full management partition key in the second
- * (0-based) pkey array position (see OPAv1, section 20.2.2.6.8). Note
- * that we should already have the limited management partition key in
- * array element 1, and also that the port is not yet up when
- * add_full_mgmt_pkey() is invoked.
- */
-static void add_full_mgmt_pkey(struct hfi1_pportdata *ppd)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-
-       /*
-        * Sanity check - ppd->pkeys[2] should be 0, or already initialized
-        * to the full management pkey.  Anything else is reported and
-        * then overwritten.
-        */
-       if (ppd->pkeys[2] != 0 && ppd->pkeys[2] != FULL_MGMT_P_KEY)
-               dd_dev_warn(dd, "%s pkey[2] already set to 0x%x, resetting it to 0x%x\n",
-                           __func__, ppd->pkeys[2], FULL_MGMT_P_KEY);
-
-       ppd->pkeys[2] = FULL_MGMT_P_KEY;
-       /* apply the pkey configuration; the result is deliberately ignored */
-       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0);
-}
-
-/*
- * Convert the given link width to the OPA link width bitmask.
- */
-static u16 link_width_to_bits(struct hfi1_devdata *dd, u16 width)
-{
-       switch (width) {
-       case 1:
-               return OPA_LINK_WIDTH_1X;
-       case 2:
-               return OPA_LINK_WIDTH_2X;
-       case 3:
-               return OPA_LINK_WIDTH_3X;
-       case 4:
-               return OPA_LINK_WIDTH_4X;
-       case 0:
-               /*
-                * Simulator and quick linkup do not set the width.
-                * Just report 4x without complaint in those cases.
-                */
-               if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR || quick_linkup)
-                       return OPA_LINK_WIDTH_4X;
-               return 0; /* no lanes up */
-       default:
-               break;
-       }
-
-       /* any other value is logged and treated as a width of 4 */
-       dd_dev_info(dd, "%s: invalid width %d, using 4\n",
-                   __func__, width);
-       return OPA_LINK_WIDTH_4X;
-}
-
-/*
- * Do a population count on the bottom nibble.
- */
-/* bit_counts[v] is the number of set bits in the 4-bit value v */
-static const u8 bit_counts[16] = {
-       0, 1, 1, 2,     /* 0x0 - 0x3 */
-       1, 2, 2, 3,     /* 0x4 - 0x7 */
-       1, 2, 2, 3,     /* 0x8 - 0xb */
-       2, 3, 3, 4      /* 0xc - 0xf */
-};
-
-/* Count the set bits in the low nibble; the high nibble is ignored. */
-static inline u8 nibble_to_count(u8 nibble)
-{
-       const u8 low = nibble & 0xf;
-
-       return bit_counts[low];
-}
-
-/*
- * Read the active lane information from the 8051 registers and return
- * their widths.
- *
- * Active lane information is found in these 8051 registers:
- *     enable_lane_tx
- *     enable_lane_rx
- */
-static void get_link_widths(struct hfi1_devdata *dd, u16 *tx_width,
-                           u16 *rx_width)
-{
-       u16 tx, rx;
-       u8 enable_lane_rx;
-       u8 enable_lane_tx;
-       /* the polarity values are read below but not otherwise used here */
-       u8 tx_polarity_inversion;
-       u8 rx_polarity_inversion;
-       u8 max_rate;
-
-       /* read the active lanes */
-       read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion,
-                        &rx_polarity_inversion, &max_rate);
-       read_local_lni(dd, &enable_lane_rx);
-
-       /* convert to counts */
-       tx = nibble_to_count(enable_lane_tx);
-       rx = nibble_to_count(enable_lane_rx);
-
-       /*
-        * Set link_speed_active here, overriding what was set in
-        * handle_verify_cap().  The ASIC 8051 firmware does not correctly
-        * set the max_rate field in handle_verify_cap until v0.19.
-        */
-       if ((dd->icode == ICODE_RTL_SILICON) &&
-           (dd->dc8051_ver < dc8051_ver(0, 19))) {
-               /* max_rate: 0 = 12.5G, 1 = 25G */
-               switch (max_rate) {
-               case 0:
-                       dd->pport[0].link_speed_active = OPA_LINK_SPEED_12_5G;
-                       break;
-               default:
-                       /* unexpected values are logged, then handled as 25G */
-                       dd_dev_err(dd,
-                                  "%s: unexpected max rate %d, using 25Gb\n",
-                                  __func__, (int)max_rate);
-                       /* fall through */
-               case 1:
-                       dd->pport[0].link_speed_active = OPA_LINK_SPEED_25G;
-                       break;
-               }
-       }
-
-       /* log the raw lane enables next to the lane counts derived from them */
-       dd_dev_info(dd,
-                   "Fabric active lanes (width): tx 0x%x (%d), rx 0x%x (%d)\n",
-                   enable_lane_tx, tx, enable_lane_rx, rx);
-       /* return the counts as OPA link width bitmasks */
-       *tx_width = link_width_to_bits(dd, tx);
-       *rx_width = link_width_to_bits(dd, rx);
-}
-
-/*
- * Read verify_cap_local_fm_link_width[1] to obtain the link widths.
- * Valid after the end of VerifyCap and during LinkUp.  Does not change
- * after link up.  I.e. look elsewhere for downgrade information.
- *
- * Bits are:
- *     + bits [7:4] contain the number of active transmitters
- *     + bits [3:0] contain the number of active receivers
- * These are numbers 1 through 4 and can be different values if the
- * link is asymmetric.
- *
- * verify_cap_local_fm_link_width[0] retains its original value.
- */
-static void get_linkup_widths(struct hfi1_devdata *dd, u16 *tx_width,
-                             u16 *rx_width)
-{
-       u8 misc_bits, local_flags;
-       u16 widths;
-       /* receive the results of get_link_widths(); not used afterwards */
-       u16 active_tx, active_rx;
-
-       read_vc_local_link_width(dd, &misc_bits, &local_flags, &widths);
-
-       /* bits [15:12] hold the transmit count, bits [11:8] the receive count */
-       *tx_width = link_width_to_bits(dd, widths >> 12);
-       *rx_width = link_width_to_bits(dd, (widths >> 8) & 0xf);
-
-       /* print the active widths */
-       get_link_widths(dd, &active_tx, &active_rx);
-}
-
-/*
- * Set ppd->link_width_active and ppd->link_width_downgrade_active using
- * hardware information when the link first comes up.
- *
- * The link width is not available until after VerifyCap.AllFramesReceived
- * (the trigger for handle_verify_cap), so this is outside that routine
- * and should be called when the 8051 signals linkup.
- */
-void get_linkup_link_widths(struct hfi1_pportdata *ppd)
-{
-       u16 tx_width, rx_width;
-
-       /* fetch the widths reported at the end of LNI */
-       get_linkup_widths(ppd->dd, &tx_width, &rx_width);
-
-       /*
-        * The link is supposed to be symmetric on link up, so tx_width
-        * alone is used as the active link width.
-        */
-       ppd->link_width_active = tx_width;
-
-       /* link width downgrade active (LWD.A) starts out matching LW.A */
-       ppd->link_width_downgrade_rx_active = ppd->link_width_active;
-       ppd->link_width_downgrade_tx_active = ppd->link_width_active;
-
-       /* per OPA spec, on link up LWD.E resets to LWD.S */
-       ppd->link_width_downgrade_enabled = ppd->link_width_downgrade_supported;
-
-       /* cache the active egress rate (units: 10^6 bits/sec) */
-       ppd->current_egress_rate = active_egress_rate(ppd);
-}
-
-/*
- * Handle a verify capabilities interrupt from the 8051.
- *
- * This is a work-queue function outside of the interrupt.
- */
-void handle_verify_cap(struct work_struct *work)
-{
-       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
-                                                               link_vc_work);
-       struct hfi1_devdata *dd = ppd->dd;
-       u64 reg;
-       u8 power_management;
-       u8 continious;
-       u8 vcu;
-       u8 vau;
-       u8 z;
-       u16 vl15buf;
-       u16 link_widths;
-       u16 crc_mask;
-       u16 crc_val;
-       u16 device_id;
-       u16 active_tx, active_rx;
-       u8 partner_supported_crc;
-       u8 remote_tx_rate;
-       u8 device_rev;
-
-       set_link_state(ppd, HLS_VERIFY_CAP);
-
-       lcb_shutdown(dd, 0);
-       adjust_lcb_for_fpga_serdes(dd);
-
-       /*
-        * These are now valid:
-        *      remote VerifyCap fields in the general LNI config
-        *      CSR DC8051_STS_REMOTE_GUID
-        *      CSR DC8051_STS_REMOTE_NODE_TYPE
-        *      CSR DC8051_STS_REMOTE_FM_SECURITY
-        *      CSR DC8051_STS_REMOTE_PORT_NO
-        */
-
-       read_vc_remote_phy(dd, &power_management, &continious);
-       read_vc_remote_fabric(dd, &vau, &z, &vcu, &vl15buf,
-                             &partner_supported_crc);
-       read_vc_remote_link_width(dd, &remote_tx_rate, &link_widths);
-       read_remote_device_id(dd, &device_id, &device_rev);
-       /*
-        * And the 'MgmtAllowed' information, which is exchanged during
-        * LNI, is also available at this point.
-        */
-       read_mgmt_allowed(dd, &ppd->mgmt_allowed);
-       /* print the active widths */
-       get_link_widths(dd, &active_tx, &active_rx);
-       dd_dev_info(dd,
-                   "Peer PHY: power management 0x%x, continuous updates 0x%x\n",
-                   (int)power_management, (int)continious);
-       dd_dev_info(dd,
-                   "Peer Fabric: vAU %d, Z %d, vCU %d, vl15 credits 0x%x, CRC sizes 0x%x\n",
-                   (int)vau, (int)z, (int)vcu, (int)vl15buf,
-                   (int)partner_supported_crc);
-       dd_dev_info(dd, "Peer Link Width: tx rate 0x%x, widths 0x%x\n",
-                   (u32)remote_tx_rate, (u32)link_widths);
-       dd_dev_info(dd, "Peer Device ID: 0x%04x, Revision 0x%02x\n",
-                   (u32)device_id, (u32)device_rev);
-       /*
-        * The peer vAU value just read is the peer receiver value.  HFI does
-        * not support a transmit vAU of 0 (AU == 8).  We advertised that
-        * with Z=1 in the fabric capabilities sent to the peer.  The peer
-        * will see our Z=1, and, if it advertised a vAU of 0, will move its
-        * receive to vAU of 1 (AU == 16).  Do the same here.  We do not care
-        * about the peer Z value - our sent vAU is 3 (hardwired) and is not
-        * subject to the Z value exception.
-        */
-       if (vau == 0)
-               vau = 1;
-       set_up_vl15(dd, vau, vl15buf);
-
-       /* set up the LCB CRC mode */
-       /* only modes enabled locally and supported by the peer are candidates */
-       crc_mask = ppd->port_crc_mode_enabled & partner_supported_crc;
-
-       /* order is important: use the lowest bit in common */
-       if (crc_mask & CAP_CRC_14B)
-               crc_val = LCB_CRC_14B;
-       else if (crc_mask & CAP_CRC_48B)
-               crc_val = LCB_CRC_48B;
-       else if (crc_mask & CAP_CRC_12B_16B_PER_LANE)
-               crc_val = LCB_CRC_12B_16B_PER_LANE;
-       else
-               crc_val = LCB_CRC_16B;
-
-       dd_dev_info(dd, "Final LCB CRC mode: %d\n", (int)crc_val);
-       write_csr(dd, DC_LCB_CFG_CRC_MODE,
-                 (u64)crc_val << DC_LCB_CFG_CRC_MODE_TX_VAL_SHIFT);
-
-       /* set (14b only) or clear sideband credit */
-       reg = read_csr(dd, SEND_CM_CTRL);
-       if (crc_val == LCB_CRC_14B && crc_14b_sideband) {
-               write_csr(dd, SEND_CM_CTRL,
-                         reg | SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK);
-       } else {
-               write_csr(dd, SEND_CM_CTRL,
-                         reg & ~SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK);
-       }
-
-       /*
-        * Work out the active link speed.  Firmware older than 0.20 is
-        * handled with remote_tx_rate as a plain 0/1 code; otherwise it
-        * is a bit mask that is ANDed with the local tx rate.
-        */
-       ppd->link_speed_active = 0;     /* invalid value */
-       if (dd->dc8051_ver < dc8051_ver(0, 20)) {
-               /* remote_tx_rate: 0 = 12.5G, 1 = 25G */
-               switch (remote_tx_rate) {
-               case 0:
-                       ppd->link_speed_active = OPA_LINK_SPEED_12_5G;
-                       break;
-               case 1:
-                       ppd->link_speed_active = OPA_LINK_SPEED_25G;
-                       break;
-               }
-       } else {
-               /* actual rate is highest bit of the ANDed rates */
-               u8 rate = remote_tx_rate & ppd->local_tx_rate;
-
-               if (rate & 2)
-                       ppd->link_speed_active = OPA_LINK_SPEED_25G;
-               else if (rate & 1)
-                       ppd->link_speed_active = OPA_LINK_SPEED_12_5G;
-       }
-       /* still the invalid value: neither path above recognized the rate */
-       if (ppd->link_speed_active == 0) {
-               dd_dev_err(dd, "%s: unexpected remote tx rate %d, using 25Gb\n",
-                          __func__, (int)remote_tx_rate);
-               ppd->link_speed_active = OPA_LINK_SPEED_25G;
-       }
-
-       /*
-        * Cache the values of the supported, enabled, and active
-        * LTP CRC modes to return in 'portinfo' queries. But the bit
-        * flags that are returned in the portinfo query differ from
-        * what's in the link_crc_mask, crc_sizes, and crc_val
-        * variables. Convert these here.
-        */
-       ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8;
-               /* supported crc modes */
-       ppd->port_ltp_crc_mode |=
-               cap_to_port_ltp(ppd->port_crc_mode_enabled) << 4;
-               /* enabled crc modes */
-       ppd->port_ltp_crc_mode |= lcb_to_port_ltp(crc_val);
-               /* active crc mode */
-
-       /* set up the remote credit return table */
-       assign_remote_cm_au_table(dd, vcu);
-
-       /*
-        * The LCB is reset on entry to handle_verify_cap(), so this must
-        * be applied on every link up.
-        *
-        * Adjust LCB error kill enable to kill the link if
-        * these RBUF errors are seen:
-        *      REPLAY_BUF_MBE_SMASK
-        *      FLIT_INPUT_BUF_MBE_SMASK
-        */
-       if (is_ax(dd)) {                        /* fixed in B0 */
-               reg = read_csr(dd, DC_LCB_CFG_LINK_KILL_EN);
-               reg |= DC_LCB_CFG_LINK_KILL_EN_REPLAY_BUF_MBE_SMASK
-                       | DC_LCB_CFG_LINK_KILL_EN_FLIT_INPUT_BUF_MBE_SMASK;
-               write_csr(dd, DC_LCB_CFG_LINK_KILL_EN, reg);
-       }
-
-       /* pull LCB fifos out of reset - all fifo clocks must be stable */
-       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0);
-
-       /* give 8051 access to the LCB CSRs */
-       write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */
-       set_8051_lcb_access(dd);
-
-       /* cache the neighbor information read from the 8051 status CSRs */
-       ppd->neighbor_guid =
-               read_csr(dd, DC_DC8051_STS_REMOTE_GUID);
-       ppd->neighbor_port_number = read_csr(dd, DC_DC8051_STS_REMOTE_PORT_NO) &
-                                       DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK;
-       ppd->neighbor_type =
-               read_csr(dd, DC_DC8051_STS_REMOTE_NODE_TYPE) &
-               DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK;
-       /*
-        * NOTE(review): the REMOTE_FM_SECURITY register is masked with a
-        * mask named for the LOCAL_FM_SECURITY register - confirm that
-        * the two registers share the same field layout.
-        */
-       ppd->neighbor_fm_security =
-               read_csr(dd, DC_DC8051_STS_REMOTE_FM_SECURITY) &
-               DC_DC8051_STS_LOCAL_FM_SECURITY_DISABLED_MASK;
-       dd_dev_info(dd,
-                   "Neighbor Guid: %llx Neighbor type %d MgmtAllowed %d FM security bypass %d\n",
-                   ppd->neighbor_guid, ppd->neighbor_type,
-                   ppd->mgmt_allowed, ppd->neighbor_fm_security);
-       /* MgmtAllowed set: install the full management pkey */
-       if (ppd->mgmt_allowed)
-               add_full_mgmt_pkey(ppd);
-
-       /* tell the 8051 to go to LinkUp */
-       set_link_state(ppd, HLS_GOING_UP);
-}
-
-/*
- * Apply the link width downgrade enabled policy against the current active
- * link widths.
- *
- * Called when the enabled policy changes or the active link widths change.
- */
-void apply_link_downgrade_policy(struct hfi1_pportdata *ppd, int refresh_widths)
-{
-       /* set to 1 when the link must be taken down and restarted */
-       int do_bounce = 0;
-       int tries;
-       u16 lwde;
-       u16 tx, rx;
-
-       /* use the hls lock to avoid a race with actual link up */
-       tries = 0;
-retry:
-       mutex_lock(&ppd->hls_lock);
-       /* only apply if the link is up */
-       if (!(ppd->host_link_state & HLS_UP)) {
-               /* still going up..wait and retry */
-               if (ppd->host_link_state & HLS_GOING_UP) {
-                       /* the lock is dropped while sleeping between attempts */
-                       if (++tries < 1000) {
-                               mutex_unlock(&ppd->hls_lock);
-                               usleep_range(100, 120); /* arbitrary */
-                               goto retry;
-                       }
-                       dd_dev_err(ppd->dd,
-                                  "%s: giving up waiting for link state change\n",
-                                  __func__);
-               }
-               /* link is not up: nothing to apply, leave with the lock held */
-               goto done;
-       }
-
-       lwde = ppd->link_width_downgrade_enabled;
-
-       /* when requested, re-read the active widths and cache them in ppd */
-       if (refresh_widths) {
-               get_link_widths(ppd->dd, &tx, &rx);
-               ppd->link_width_downgrade_tx_active = tx;
-               ppd->link_width_downgrade_rx_active = rx;
-       }
-
-       if (ppd->link_width_downgrade_tx_active == 0 ||
-           ppd->link_width_downgrade_rx_active == 0) {
-               /* the 8051 reported a dead link as a downgrade */
-               dd_dev_err(ppd->dd, "Link downgrade is really a link down, ignoring\n");
-       } else if (lwde == 0) {
-               /* downgrade is disabled */
-
-               /* bounce if not at starting active width */
-               if ((ppd->link_width_active !=
-                    ppd->link_width_downgrade_tx_active) ||
-                   (ppd->link_width_active !=
-                    ppd->link_width_downgrade_rx_active)) {
-                       dd_dev_err(ppd->dd,
-                                  "Link downgrade is disabled and link has downgraded, downing link\n");
-                       dd_dev_err(ppd->dd,
-                                  "  original 0x%x, tx active 0x%x, rx active 0x%x\n",
-                                  ppd->link_width_active,
-                                  ppd->link_width_downgrade_tx_active,
-                                  ppd->link_width_downgrade_rx_active);
-                       do_bounce = 1;
-               }
-       } else if ((lwde & ppd->link_width_downgrade_tx_active) == 0 ||
-                  (lwde & ppd->link_width_downgrade_rx_active) == 0) {
-               /* Tx or Rx is outside the enabled policy */
-               dd_dev_err(ppd->dd,
-                          "Link is outside of downgrade allowed, downing link\n");
-               dd_dev_err(ppd->dd,
-                          "  enabled 0x%x, tx active 0x%x, rx active 0x%x\n",
-                          lwde, ppd->link_width_downgrade_tx_active,
-                          ppd->link_width_downgrade_rx_active);
-               do_bounce = 1;
-       }
-
-done:
-       mutex_unlock(&ppd->hls_lock);
-
-       /* the bounce itself is carried out after hls_lock has been released */
-       if (do_bounce) {
-               set_link_down_reason(ppd, OPA_LINKDOWN_REASON_WIDTH_POLICY, 0,
-                                    OPA_LINKDOWN_REASON_WIDTH_POLICY);
-               set_link_state(ppd, HLS_DN_OFFLINE);
-               tune_serdes(ppd);
-               start_link(ppd);
-       }
-}
-
-/*
- * Handle a link downgrade interrupt from the 8051.
- *
- * This is a work-queue function outside of the interrupt.
- */
-void handle_link_downgrade(struct work_struct *work)
-{
-       struct hfi1_pportdata *port;
-
-       port = container_of(work, struct hfi1_pportdata, link_downgrade_work);
-
-       dd_dev_info(port->dd, "8051: Link width downgrade\n");
-       /* refresh_widths = 1: re-read the active widths before applying */
-       apply_link_downgrade_policy(port, 1);
-}
-
-/* Pass the flags to flag_string() together with the dcc_err_flags table. */
-static char *dcc_err_string(char *buf, int buf_len, u64 flags)
-{
-       char *text;
-
-       text = flag_string(buf, buf_len, flags, dcc_err_flags,
-                          ARRAY_SIZE(dcc_err_flags));
-       return text;
-}
-
-/* Pass the flags to flag_string() together with the lcb_err_flags table. */
-static char *lcb_err_string(char *buf, int buf_len, u64 flags)
-{
-       char *text;
-
-       text = flag_string(buf, buf_len, flags, lcb_err_flags,
-                          ARRAY_SIZE(lcb_err_flags));
-       return text;
-}
-
-/* Pass the flags to flag_string() together with the dc8051_err_flags table. */
-static char *dc8051_err_string(char *buf, int buf_len, u64 flags)
-{
-       char *text;
-
-       text = flag_string(buf, buf_len, flags, dc8051_err_flags,
-                          ARRAY_SIZE(dc8051_err_flags));
-       return text;
-}
-
-/* Pass the flags to flag_string() with the dc8051_info_err_flags table. */
-static char *dc8051_info_err_string(char *buf, int buf_len, u64 flags)
-{
-       char *text;
-
-       text = flag_string(buf, buf_len, flags, dc8051_info_err_flags,
-                          ARRAY_SIZE(dc8051_info_err_flags));
-       return text;
-}
-
-/* Pass the flags to flag_string() with the dc8051_info_host_msg_flags table. */
-static char *dc8051_info_host_msg_string(char *buf, int buf_len, u64 flags)
-{
-       char *text;
-
-       text = flag_string(buf, buf_len, flags, dc8051_info_host_msg_flags,
-                          ARRAY_SIZE(dc8051_info_host_msg_flags));
-       return text;
-}
-
-static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg)
-{
-       struct hfi1_pportdata *ppd = dd->pport;
-       u64 info, err, host_msg;
-       int queue_link_down = 0;
-       char buf[96];
-
-       /* look at the flags */
-       if (reg & DC_DC8051_ERR_FLG_SET_BY_8051_SMASK) {
-               /* 8051 information set by firmware */
-               /* read DC8051_DBG_ERR_INFO_SET_BY_8051 for details */
-               info = read_csr(dd, DC_DC8051_DBG_ERR_INFO_SET_BY_8051);
-               err = (info >> DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_SHIFT)
-                       & DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_MASK;
-               host_msg = (info >>
-                       DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_SHIFT)
-                       & DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_MASK;
-
-               /*
-                * Handle error flags.
-                */
-               if (err & FAILED_LNI) {
-                       /*
-                        * LNI error indications are cleared by the 8051
-                        * only when starting polling.  Only pay attention
-                        * to them when in the states that occur during
-                        * LNI.
-                        */
-                       if (ppd->host_link_state
-                           & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) {
-                               queue_link_down = 1;
-                               dd_dev_info(dd, "Link error: %s\n",
-                                           dc8051_info_err_string(buf,
-                                                                  sizeof(buf),
-                                                                  err &
-                                                                  FAILED_LNI));
-                       }
-                       err &= ~(u64)FAILED_LNI;
-               }
-               /* unknown frames can happen during LNI, just count */
-               if (err & UNKNOWN_FRAME) {
-                       ppd->unknown_frame_count++;
-                       err &= ~(u64)UNKNOWN_FRAME;
-               }
-               if (err) {
-                       /* report remaining errors, but do not do anything */
-                       dd_dev_err(dd, "8051 info error: %s\n",
-                                  dc8051_info_err_string(buf, sizeof(buf),
-                                                         err));
-               }
-
-               /*
-                * Handle host message flags.
-                */
-               if (host_msg & HOST_REQ_DONE) {
-                       /*
-                        * Presently, the driver does a busy wait for
-                        * host requests to complete.  This is only an
-                        * informational message.
-                        * NOTE: The 8051 clears the host message
-                        * information *on the next 8051 command*.
-                        * Therefore, when linkup is achieved,
-                        * this flag will still be set.
-                        */
-                       host_msg &= ~(u64)HOST_REQ_DONE;
-               }
-               if (host_msg & BC_SMA_MSG) {
-                       queue_work(ppd->hfi1_wq, &ppd->sma_message_work);
-                       host_msg &= ~(u64)BC_SMA_MSG;
-               }
-               if (host_msg & LINKUP_ACHIEVED) {
-                       dd_dev_info(dd, "8051: Link up\n");
-                       queue_work(ppd->hfi1_wq, &ppd->link_up_work);
-                       host_msg &= ~(u64)LINKUP_ACHIEVED;
-               }
-               if (host_msg & EXT_DEVICE_CFG_REQ) {
-                       handle_8051_request(ppd);
-                       host_msg &= ~(u64)EXT_DEVICE_CFG_REQ;
-               }
-               if (host_msg & VERIFY_CAP_FRAME) {
-                       queue_work(ppd->hfi1_wq, &ppd->link_vc_work);
-                       host_msg &= ~(u64)VERIFY_CAP_FRAME;
-               }
-               if (host_msg & LINK_GOING_DOWN) {
-                       const char *extra = "";
-                       /* no downgrade action needed if going down */
-                       if (host_msg & LINK_WIDTH_DOWNGRADED) {
-                               host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED;
-                               extra = " (ignoring downgrade)";
-                       }
-                       dd_dev_info(dd, "8051: Link down%s\n", extra);
-                       queue_link_down = 1;
-                       host_msg &= ~(u64)LINK_GOING_DOWN;
-               }
-               if (host_msg & LINK_WIDTH_DOWNGRADED) {
-                       queue_work(ppd->hfi1_wq, &ppd->link_downgrade_work);
-                       host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED;
-               }
-               if (host_msg) {
-                       /* report remaining messages, but do not do anything */
-                       dd_dev_info(dd, "8051 info host message: %s\n",
-                                   dc8051_info_host_msg_string(buf,
-                                                               sizeof(buf),
-                                                               host_msg));
-               }
-
-               reg &= ~DC_DC8051_ERR_FLG_SET_BY_8051_SMASK;
-       }
-       if (reg & DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK) {
-               /*
-                * Lost the 8051 heartbeat.  If this happens, we
-                * receive constant interrupts about it.  Disable
-                * the interrupt after the first.
-                */
-               dd_dev_err(dd, "Lost 8051 heartbeat\n");
-               write_csr(dd, DC_DC8051_ERR_EN,
-                         read_csr(dd, DC_DC8051_ERR_EN) &
-                         ~DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK);
-
-               reg &= ~DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK;
-       }
-       if (reg) {
-               /* report the error, but do not do anything */
-               dd_dev_err(dd, "8051 error: %s\n",
-                          dc8051_err_string(buf, sizeof(buf), reg));
-       }
-
-       if (queue_link_down) {
-               /*
-                * if the link is already going down or disabled, do not
-                * queue another
-                */
-               if ((ppd->host_link_state &
-                   (HLS_GOING_OFFLINE | HLS_LINK_COOLDOWN)) ||
-                   ppd->link_enabled == 0) {
-                       dd_dev_info(dd, "%s: not queuing link down\n",
-                                   __func__);
-               } else {
-                       queue_work(ppd->hfi1_wq, &ppd->link_down_work);
-               }
-       }
-}
-
-/*
- * Text for DCC_ERR_INFO_FMCONFIG error codes, indexed by code.  Code 7 has
- * no entry; handle_dcc_err() reports it as a reserved value.
- */
-static const char * const fm_config_txt[] = {
-       [0] = "BadHeadDist: Distance violation between two head flits",
-       [1] = "BadTailDist: Distance violation between two tail flits",
-       [2] = "BadCtrlDist: Distance violation between two credit control flits",
-       [3] = "BadCrdAck: Credits return for unsupported VL",
-       [4] = "UnsupportedVLMarker: Received VL Marker",
-       [5] = "BadPreempt: Exceeded the preemption nesting level",
-       [6] = "BadControlFlit: Received unsupported control flit",
-       /* no 7 */
-       [8] = "UnsupportedVLMarker: Received VL Marker for unconfigured or disabled VL",
-};
-
-/*
- * Text for DCC_ERR_INFO_PORTRCV error codes, indexed by code.  Codes 0, 8
- * and 10 have no entry; handle_dcc_err() reports those as reserved values.
- */
-static const char * const port_rcv_txt[] = {
-       /* no 0 */
-       [1] = "BadPktLen: Illegal PktLen",
-       [2] = "PktLenTooLong: Packet longer than PktLen",
-       [3] = "PktLenTooShort: Packet shorter than PktLen",
-       [4] = "BadSLID: Illegal SLID (0, using multicast as SLID, does not include security validation of SLID)",
-       [5] = "BadDLID: Illegal DLID (0, doesn't match HFI)",
-       [6] = "BadL2: Illegal L2 opcode",
-       [7] = "BadSC: Unsupported SC",
-       /* no 8 */
-       [9] = "BadRC: Illegal RC",
-       /* no 10 */
-       [11] = "PreemptError: Preempting with same VL",
-       [12] = "PreemptVL15: Preempting a VL15 packet",
-};
-
-#define OPA_LDR_FMCONFIG_OFFSET 16
-#define OPA_LDR_PORTRCV_OFFSET 0
-static void handle_dcc_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
-{
-       u64 info, hdr0, hdr1;
-       const char *extra;
-       char buf[96];
-       struct hfi1_pportdata *ppd = dd->pport;
-       u8 lcl_reason = 0;
-       int do_bounce = 0;
-
-       if (reg & DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK) {
-               if (!(dd->err_info_uncorrectable & OPA_EI_STATUS_SMASK)) {
-                       info = read_csr(dd, DCC_ERR_INFO_UNCORRECTABLE);
-                       dd->err_info_uncorrectable = info & OPA_EI_CODE_SMASK;
-                       /* set status bit */
-                       dd->err_info_uncorrectable |= OPA_EI_STATUS_SMASK;
-               }
-               reg &= ~DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK;
-       }
-
-       if (reg & DCC_ERR_FLG_LINK_ERR_SMASK) {
-               struct hfi1_pportdata *ppd = dd->pport;
-               /* this counter saturates at (2^32) - 1 */
-               if (ppd->link_downed < (u32)UINT_MAX)
-                       ppd->link_downed++;
-               reg &= ~DCC_ERR_FLG_LINK_ERR_SMASK;
-       }
-
-       if (reg & DCC_ERR_FLG_FMCONFIG_ERR_SMASK) {
-               u8 reason_valid = 1;
-
-               info = read_csr(dd, DCC_ERR_INFO_FMCONFIG);
-               if (!(dd->err_info_fmconfig & OPA_EI_STATUS_SMASK)) {
-                       dd->err_info_fmconfig = info & OPA_EI_CODE_SMASK;
-                       /* set status bit */
-                       dd->err_info_fmconfig |= OPA_EI_STATUS_SMASK;
-               }
-               switch (info) {
-               case 0:
-               case 1:
-               case 2:
-               case 3:
-               case 4:
-               case 5:
-               case 6:
-                       extra = fm_config_txt[info];
-                       break;
-               case 8:
-                       extra = fm_config_txt[info];
-                       if (ppd->port_error_action &
-                           OPA_PI_MASK_FM_CFG_UNSUPPORTED_VL_MARKER) {
-                               do_bounce = 1;
-                               /*
-                                * lcl_reason cannot be derived from info
-                                * for this error
-                                */
-                               lcl_reason =
-                                 OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER;
-                       }
-                       break;
-               default:
-                       reason_valid = 0;
-                       snprintf(buf, sizeof(buf), "reserved%lld", info);
-                       extra = buf;
-                       break;
-               }
-
-               if (reason_valid && !do_bounce) {
-                       do_bounce = ppd->port_error_action &
-                                       (1 << (OPA_LDR_FMCONFIG_OFFSET + info));
-                       lcl_reason = info + OPA_LINKDOWN_REASON_BAD_HEAD_DIST;
-               }
-
-               /* just report this */
-               dd_dev_info(dd, "DCC Error: fmconfig error: %s\n", extra);
-               reg &= ~DCC_ERR_FLG_FMCONFIG_ERR_SMASK;
-       }
-
-       if (reg & DCC_ERR_FLG_RCVPORT_ERR_SMASK) {
-               u8 reason_valid = 1;
-
-               info = read_csr(dd, DCC_ERR_INFO_PORTRCV);
-               hdr0 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR0);
-               hdr1 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR1);
-               if (!(dd->err_info_rcvport.status_and_code &
-                     OPA_EI_STATUS_SMASK)) {
-                       dd->err_info_rcvport.status_and_code =
-                               info & OPA_EI_CODE_SMASK;
-                       /* set status bit */
-                       dd->err_info_rcvport.status_and_code |=
-                               OPA_EI_STATUS_SMASK;
-                       /*
-                        * save first 2 flits in the packet that caused
-                        * the error
-                        */
-                        dd->err_info_rcvport.packet_flit1 = hdr0;
-                        dd->err_info_rcvport.packet_flit2 = hdr1;
-               }
-               switch (info) {
-               case 1:
-               case 2:
-               case 3:
-               case 4:
-               case 5:
-               case 6:
-               case 7:
-               case 9:
-               case 11:
-               case 12:
-                       extra = port_rcv_txt[info];
-                       break;
-               default:
-                       reason_valid = 0;
-                       snprintf(buf, sizeof(buf), "reserved%lld", info);
-                       extra = buf;
-                       break;
-               }
-
-               if (reason_valid && !do_bounce) {
-                       do_bounce = ppd->port_error_action &
-                                       (1 << (OPA_LDR_PORTRCV_OFFSET + info));
-                       lcl_reason = info + OPA_LINKDOWN_REASON_RCV_ERROR_0;
-               }
-
-               /* just report this */
-               dd_dev_info(dd, "DCC Error: PortRcv error: %s\n", extra);
-               dd_dev_info(dd, "           hdr0 0x%llx, hdr1 0x%llx\n",
-                           hdr0, hdr1);
-
-               reg &= ~DCC_ERR_FLG_RCVPORT_ERR_SMASK;
-       }
-
-       if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK) {
-               /* informative only */
-               dd_dev_info(dd, "8051 access to LCB blocked\n");
-               reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK;
-       }
-       if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK) {
-               /* informative only */
-               dd_dev_info(dd, "host access to LCB blocked\n");
-               reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK;
-       }
-
-       /* report any remaining errors */
-       if (reg)
-               dd_dev_info(dd, "DCC Error: %s\n",
-                           dcc_err_string(buf, sizeof(buf), reg));
-
-       if (lcl_reason == 0)
-               lcl_reason = OPA_LINKDOWN_REASON_UNKNOWN;
-
-       if (do_bounce) {
-               dd_dev_info(dd, "%s: PortErrorAction bounce\n", __func__);
-               set_link_down_reason(ppd, lcl_reason, 0, lcl_reason);
-               queue_work(ppd->hfi1_wq, &ppd->link_bounce_work);
-       }
-}
-
-/*
- * LCB error interrupt handler: decode the flag bits in @reg into text and
- * log them.  No other action is taken.
- */
-static void handle_lcb_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
-{
-       char text[96];
-       const char *msg = lcb_err_string(text, sizeof(text), reg);
-
-       dd_dev_info(dd, "LCB Error: %s\n", msg);
-}
-
-/*
- * CCE block DC interrupt.  Source is < 8.
- *
- * Sources with a registered handler in dc_errs[] are cleared down through
- * interrupt_clear_down(); the others are only reported.
- */
-static void is_dc_int(struct hfi1_devdata *dd, unsigned int source)
-{
-       const struct err_reg_info *eri = &dc_errs[source];
-
-       if (eri->handler) {
-               interrupt_clear_down(dd, 0, eri);
-               return;
-       }
-
-       if (source != 3 /* dc_lbm_int */) {
-               dd_dev_err(dd, "Invalid DC interrupt %u\n", source);
-               return;
-       }
-
-       /*
-        * A parity error has occurred on the address/control lines
-        * presented to the LBM.  The error is a single pulse, there is
-        * no associated error flag, and it is non-maskable.  This is
-        * because if a parity error occurs on the request the request is
-        * dropped.  This should never occur, but it is nice to know if
-        * it ever does.
-        */
-       dd_dev_err(dd, "Parity error in DC LBM block\n");
-}
-
-/*
- * TX block send credit interrupt.  Source is < 160.
- *
- * The source number is handed straight to sc_group_release_update(); no
- * other work is done here.
- */
-static void is_send_credit_int(struct hfi1_devdata *dd, unsigned int source)
-{
-       sc_group_release_update(dd, source);
-}
-
-/*
- * TX block SDMA interrupt.  Source is < 48.
- *
- * SDMA interrupts are grouped by type:
- *
- *      0 -  N-1 = SDma
- *      N - 2N-1 = SDmaProgress
- *     2N - 3N-1 = SDmaIdle
- *
- * The source is split into an interrupt type (source / N) and an engine
- * index (source % N), where N is TXE_NUM_SDMA_ENGINES.  The engine is only
- * touched when its index is below dd->num_sdma.
- */
-static void is_sdma_eng_int(struct hfi1_devdata *dd, unsigned int source)
-{
-       /* what interrupt */
-       unsigned int what = source / TXE_NUM_SDMA_ENGINES;
-       /* which engine */
-       unsigned int which = source % TXE_NUM_SDMA_ENGINES;
-
-#ifdef CONFIG_SDMA_VERBOSITY
-       dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", which,
-                  slashstrip(__FILE__), __LINE__, __func__);
-       /* do not dump an engine that is out of range */
-       if (which < dd->num_sdma)
-               sdma_dumpstate(&dd->per_sdma[which]);
-#endif
-
-       if (likely(what < 3 && which < dd->num_sdma)) {
-               sdma_engine_interrupt(&dd->per_sdma[which], 1ull << source);
-       } else {
-               /* should not happen */
-               dd_dev_err(dd, "Invalid SDMA interrupt 0x%x\n", source);
-       }
-}
-
-/*
- * RX block receive available interrupt.  Source is < 160.
- *
- * Contexts below dd->first_user_ctxt are serviced through the context's
- * do_interrupt routine; the rest go to handle_user_interrupt().  An
- * interrupt for a context that is out of range or has no data is reported.
- */
-static void is_rcv_avail_int(struct hfi1_devdata *dd, unsigned int source)
-{
-       struct hfi1_ctxtdata *rcd;
-       /* default: received an interrupt, but are not using that context */
-       const char *detail = "out of range";
-
-       if (likely(source < dd->num_rcv_contexts)) {
-               rcd = dd->rcd[source];
-               if (rcd) {
-                       if (source >= dd->first_user_ctxt)
-                               handle_user_interrupt(rcd);
-                       else
-                               rcd->do_interrupt(rcd, 0);
-                       return; /* OK */
-               }
-               /* received an interrupt, but no rcd */
-               detail = "dataless";
-       }
-
-       dd_dev_err(dd, "unexpected %s receive available context interrupt %u\n",
-                  detail, source);
-}
-
-/*
- * RX block receive urgent interrupt.  Source is < 160.
- *
- * Only contexts at or above dd->first_user_ctxt are acted on; an urgent
- * interrupt for a lower context with data is silently accepted.  An
- * interrupt for a context that is out of range or has no data is reported.
- */
-static void is_rcv_urgent_int(struct hfi1_devdata *dd, unsigned int source)
-{
-       struct hfi1_ctxtdata *rcd;
-       /* default: received an interrupt, but are not using that context */
-       const char *detail = "out of range";
-
-       if (likely(source < dd->num_rcv_contexts)) {
-               rcd = dd->rcd[source];
-               if (rcd) {
-                       /* only pay attention to user urgent interrupts */
-                       if (source >= dd->first_user_ctxt)
-                               handle_user_interrupt(rcd);
-                       return; /* OK */
-               }
-               /* received an interrupt, but no rcd */
-               detail = "dataless";
-       }
-
-       dd_dev_err(dd, "unexpected %s receive urgent context interrupt %u\n",
-                  detail, source);
-}
-
-/*
- * Reserved range interrupt.  Should not be called in normal operation.
- * The source is only named and logged.
- */
-static void is_reserved_int(struct hfi1_devdata *dd, unsigned int source)
-{
-       char text[64];
-       const char *src_name = is_reserved_name(text, sizeof(text), source);
-
-       dd_dev_err(dd, "unexpected %s interrupt\n", src_name);
-}
-
-/*
- * Interrupt source table, walked in order by is_interrupt().  Each entry
- * gives a source range (start, end), the routine that names a source in
- * that range, and the routine that handles an interrupt from it.
- */
-static const struct is_table is_table[] = {
-       {
-               .start   = IS_GENERAL_ERR_START,
-               .end     = IS_GENERAL_ERR_END,
-               .is_name = is_misc_err_name,
-               .is_int  = is_misc_err_int,
-       },
-       {
-               .start   = IS_SDMAENG_ERR_START,
-               .end     = IS_SDMAENG_ERR_END,
-               .is_name = is_sdma_eng_err_name,
-               .is_int  = is_sdma_eng_err_int,
-       },
-       {
-               .start   = IS_SENDCTXT_ERR_START,
-               .end     = IS_SENDCTXT_ERR_END,
-               .is_name = is_sendctxt_err_name,
-               .is_int  = is_sendctxt_err_int,
-       },
-       {
-               .start   = IS_SDMA_START,
-               .end     = IS_SDMA_END,
-               .is_name = is_sdma_eng_name,
-               .is_int  = is_sdma_eng_int,
-       },
-       {
-               .start   = IS_VARIOUS_START,
-               .end     = IS_VARIOUS_END,
-               .is_name = is_various_name,
-               .is_int  = is_various_int,
-       },
-       {
-               .start   = IS_DC_START,
-               .end     = IS_DC_END,
-               .is_name = is_dc_name,
-               .is_int  = is_dc_int,
-       },
-       {
-               .start   = IS_RCVAVAIL_START,
-               .end     = IS_RCVAVAIL_END,
-               .is_name = is_rcv_avail_name,
-               .is_int  = is_rcv_avail_int,
-       },
-       {
-               .start   = IS_RCVURGENT_START,
-               .end     = IS_RCVURGENT_END,
-               .is_name = is_rcv_urgent_name,
-               .is_int  = is_rcv_urgent_int,
-       },
-       {
-               .start   = IS_SENDCREDIT_START,
-               .end     = IS_SENDCREDIT_END,
-               .is_name = is_send_credit_name,
-               .is_int  = is_send_credit_int,
-       },
-       {
-               .start   = IS_RESERVED_START,
-               .end     = IS_RESERVED_END,
-               .is_name = is_reserved_name,
-               .is_int  = is_reserved_int,
-       },
-};
-
-/*
- * Interrupt source interrupt - called when the given source has an interrupt.
- * Source is a bit index into an array of 64-bit integers.
- *
- * The first is_table entry whose end is above @source handles it, and is
- * passed the source relative to that entry's start.  A source that matches
- * no entry is reported.
- */
-static void is_interrupt(struct hfi1_devdata *dd, unsigned int source)
-{
-       const struct is_table *entry;
-       /* one past the last entry; is_table has no terminating entry */
-       const struct is_table *table_end =
-               &is_table[sizeof(is_table) / sizeof(is_table[0])];
-
-       /* avoids a double compare by walking the table in-order */
-       for (entry = &is_table[0];
-            entry < table_end && entry->is_name; entry++) {
-               if (source < entry->end) {
-                       trace_hfi1_interrupt(dd, entry, source);
-                       entry->is_int(dd, source - entry->start);
-                       return;
-               }
-       }
-       /* fell off the end */
-       dd_dev_err(dd, "invalid interrupt source %u\n", source);
-}
-
-/*
- * General interrupt handler.  This is able to correctly handle
- * all interrupts in case INTx is used.
- *
- * Works in two passes: first every interrupt status CSR is read, masked
- * with dd->gi_mask[] and cleared, then is_interrupt() is called once for
- * each bit that was found set.
- */
-static irqreturn_t general_interrupt(int irq, void *data)
-{
-       struct hfi1_devdata *dd = data;
-       u64 regs[CCE_NUM_INT_CSRS];
-       u32 bit;
-       int i;
-
-       this_cpu_inc(*dd->int_counter);
-
-       /* phase 1: scan and clear all handled interrupts */
-       for (i = 0; i < CCE_NUM_INT_CSRS; i++) {
-               /* stays 0 when the mask for this CSR is empty */
-               u64 pending = 0;
-
-               if (dd->gi_mask[i] != 0) {
-                       pending = read_csr(dd, CCE_INT_STATUS + (8 * i)) &
-                                       dd->gi_mask[i];
-                       /* only clear if anything is set */
-                       if (pending)
-                               write_csr(dd, CCE_INT_CLEAR + (8 * i),
-                                         pending);
-               }
-               regs[i] = pending;      /* used in phase 2 */
-       }
-
-       /* phase 2: call the appropriate handler */
-       for_each_set_bit(bit, (unsigned long *)&regs[0],
-                        CCE_NUM_INT_CSRS * 64) {
-               is_interrupt(dd, bit);
-       }
-
-       return IRQ_HANDLED;
-}
-
-/*
- * Per-engine SDMA interrupt handler.  @data is the sdma_engine.
- *
- * Reads the interrupt status CSR that holds the SDMA sources, keeps only
- * this engine's bits (sde->imask), clears them and passes them to
- * sdma_engine_interrupt().  An interrupt with no status bits is reported.
- */
-static irqreturn_t sdma_interrupt(int irq, void *data)
-{
-       struct sdma_engine *sde = data;
-       struct hfi1_devdata *dd = sde->dd;
-       /* byte offset of the 64-bit CSR that holds the SDMA sources */
-       const u32 csr_off = 8 * (IS_SDMA_START / 64);
-       u64 pending;
-
-#ifdef CONFIG_SDMA_VERBOSITY
-       dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
-                  slashstrip(__FILE__), __LINE__, __func__);
-       sdma_dumpstate(sde);
-#endif
-
-       this_cpu_inc(*dd->int_counter);
-
-       /* This read_csr is really bad in the hot path */
-       pending = read_csr(dd, CCE_INT_STATUS + csr_off) & sde->imask;
-       if (likely(pending)) {
-               /* clear the interrupt(s) */
-               write_csr(dd, CCE_INT_CLEAR + csr_off, pending);
-
-               /* handle the interrupt(s) */
-               sdma_engine_interrupt(sde, pending);
-       } else {
-               dd_dev_err(dd, "SDMA engine %u interrupt, but no status bits set\n",
-                          sde->this_idx);
-       }
-
-       return IRQ_HANDLED;
-}
-
-/*
- * Clear the receive interrupt.  Use a read of the interrupt clear CSR
- * to ensure that the write completed.  This does NOT guarantee that
- * queued DMA writes to memory from the chip are pushed.
- */
-static inline void clear_recv_intr(struct hfi1_ctxtdata *rcd)
-{
-       const u32 clear_csr = CCE_INT_CLEAR + (8 * rcd->ireg);
-
-       /* make sure everything before is written */
-       mmiowb();
-       write_csr(rcd->dd, clear_csr, rcd->imask);
-       /* force the above write on the chip and get a value back */
-       (void)read_csr(rcd->dd, clear_csr);
-}
-
-/*
- * Force the receive interrupt by writing the context's interrupt mask to
- * its interrupt force CSR.
- */
-void force_recv_intr(struct hfi1_ctxtdata *rcd)
-{
-       const u32 force_csr = CCE_INT_FORCE + (8 * rcd->ireg);
-
-       write_csr(rcd->dd, force_csr, rcd->imask);
-}
-
-/*
- * Return non-zero if a packet is present.
- *
- * This routine is called when rechecking for packets after the RcvAvail
- * interrupt has been cleared down.  First, do a quick check of memory for
- * a packet present.  If not found, use an expensive CSR read of the context
- * tail to determine the actual tail.  The CSR read is necessary because there
- * is no method to push pending DMAs to memory other than an interrupt and we
- * are trying to determine if we need to force an interrupt.
- */
-static inline int check_packet_present(struct hfi1_ctxtdata *rcd)
-{
-       u32 hw_tail;
-
-       if (HFI1_CAP_IS_KSET(DMA_RTAIL)) {
-               /* is RDMA rtail */
-               if (rcd->head != get_rcvhdrtail(rcd))
-                       return 1;
-       } else if (rcd->seq_cnt ==
-                  rhf_rcv_seq(rhf_to_cpu(get_rhf_addr(rcd)))) {
-               return 1;
-       }
-
-       /* fall back to a CSR read, correct independent of DMA_RTAIL */
-       hw_tail = (u32)read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL);
-       return rcd->head != hw_tail;
-}
-
-/*
- * Receive packet IRQ handler.  This routine expects to be on its own IRQ.
- * This routine will try to handle packets immediately (latency), but if
- * it finds too many, it will invoke the thread handler (bandwidth).  The
- * chip receive interrupt is *not* cleared down until this or the thread (if
- * invoked) is finished.  The intent is to avoid extra interrupts while we
- * are processing packets anyway.
- */
-static irqreturn_t receive_context_interrupt(int irq, void *data)
-{
-       struct hfi1_ctxtdata *rcd = data;
-       struct hfi1_devdata *dd = rcd->dd;
-
-       trace_hfi1_receive_interrupt(dd, rcd->ctxt);
-       this_cpu_inc(*dd->int_counter);
-       aspm_ctx_disable(rcd);
-
-       /*
-        * The receive interrupt remains blocked while processing packets.
-        * If too many packets were seen, hand off to the handler thread
-        * with the interrupt still blocked.
-        */
-       if (rcd->do_interrupt(rcd, 0) == RCV_PKT_LIMIT)
-               return IRQ_WAKE_THREAD;
-
-       /*
-        * The packet processor detected no more packets.  Clear the receive
-        * interrupt and recheck for a packet that may have arrived after
-        * the previous check and interrupt clear.  If a packet arrived,
-        * force another interrupt.
-        */
-       clear_recv_intr(rcd);
-       if (check_packet_present(rcd))
-               force_recv_intr(rcd);
-
-       return IRQ_HANDLED;
-}
-
-/*
- * Receive packet thread handler.  This expects to be invoked with the
- * receive interrupt still blocked.
- */
-static irqreturn_t receive_context_thread(int irq, void *data)
-{
-       struct hfi1_ctxtdata *rcd = data;
-
-       /* receive interrupt is still blocked from the IRQ handler */
-       (void)rcd->do_interrupt(rcd, 1);
-
-       /*
-        * The packet processor will only return if it detected no more
-        * packets.  Hold IRQs here so we can safely clear the interrupt and
-        * recheck for a packet that may have arrived after the previous
-        * check and the interrupt clear.  If a packet arrived, force another
-        * interrupt.
-        */
-       local_irq_disable();
-       clear_recv_intr(rcd);
-       if (check_packet_present(rcd))
-               force_recv_intr(rcd);
-       local_irq_enable();
-
-       return IRQ_HANDLED;
-}
-
-/* ========================================================================= */
-
-/* Return the PORT field of the DC_DC8051_STS_CUR_STATE CSR. */
-u32 read_physical_state(struct hfi1_devdata *dd)
-{
-       u64 cur_state = read_csr(dd, DC_DC8051_STS_CUR_STATE);
-
-       cur_state >>= DC_DC8051_STS_CUR_STATE_PORT_SHIFT;
-       return cur_state & DC_DC8051_STS_CUR_STATE_PORT_MASK;
-}
-
-/* Return the LINK_STATE field of the DCC_CFG_PORT_CONFIG CSR. */
-u32 read_logical_state(struct hfi1_devdata *dd)
-{
-       u64 port_cfg = read_csr(dd, DCC_CFG_PORT_CONFIG);
-
-       port_cfg >>= DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT;
-       return port_cfg & DCC_CFG_PORT_CONFIG_LINK_STATE_MASK;
-}
-
-/*
- * Replace the LINK_STATE field of the DCC_CFG_PORT_CONFIG CSR with
- * @chip_lstate, leaving the other bits of the register as read.
- */
-static void set_logical_state(struct hfi1_devdata *dd, u32 chip_lstate)
-{
-       u64 port_cfg = read_csr(dd, DCC_CFG_PORT_CONFIG);
-       u64 new_state = (u64)chip_lstate << DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT;
-
-       /* clear current state, set new state */
-       port_cfg = (port_cfg & ~DCC_CFG_PORT_CONFIG_LINK_STATE_SMASK) |
-                  new_state;
-       write_csr(dd, DCC_CFG_PORT_CONFIG, port_cfg);
-}
-
-/*
- * Use the 8051 to read a LCB CSR.
- *
- * On the functional simulator the CSR is read directly, bracketed by
- * acquire_lcb_access()/release_lcb_access().  Returns 0 on success and
- * -EBUSY on any failure.
- */
-static int read_lcb_via_8051(struct hfi1_devdata *dd, u32 addr, u64 *data)
-{
-       u32 lcb_index;
-
-       if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
-               if (acquire_lcb_access(dd, 0) != 0)
-                       return -EBUSY;
-               *data = read_csr(dd, addr);
-               release_lcb_access(dd, 0);
-               return 0;
-       }
-
-       /* register is an index of LCB registers: (offset - base) / 8 */
-       lcb_index = (addr - DC_LCB_CFG_RUN) >> 3;
-       if (do_8051_command(dd, HCMD_READ_LCB_CSR, lcb_index, data) !=
-           HCMD_SUCCESS)
-               return -EBUSY;
-
-       return 0;
-}
-
-/*
- * Read an LCB CSR.  Access may not be in host control, so check.
- * Return 0 on success, -EBUSY on failure.
- */
-int read_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 *data)
-{
-       struct hfi1_pportdata *port = dd->pport;
-
-       /* if up, go through the 8051 for the value */
-       if (port->host_link_state & HLS_UP)
-               return read_lcb_via_8051(dd, addr, data);
-
-       if (!(port->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE))) {
-               /* not going up or down: host has access */
-               *data = read_csr(dd, addr);
-               return 0;
-       }
-
-       /* going up or down, no access */
-       return -EBUSY;
-}
-
-/*
- * Use the 8051 to write a LCB CSR.
- *
- * On the functional simulator, or when dd->dc8051_ver is below
- * dc8051_ver(0, 20), the CSR is written directly, bracketed by
- * acquire_lcb_access()/release_lcb_access().  Returns 0 on success and
- * -EBUSY on any failure.
- */
-static int write_lcb_via_8051(struct hfi1_devdata *dd, u32 addr, u64 data)
-{
-       u32 lcb_index;
-
-       if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR ||
-           (dd->dc8051_ver < dc8051_ver(0, 20))) {
-               if (acquire_lcb_access(dd, 0) != 0)
-                       return -EBUSY;
-               write_csr(dd, addr, data);
-               release_lcb_access(dd, 0);
-               return 0;
-       }
-
-       /* register is an index of LCB registers: (offset - base) / 8 */
-       lcb_index = (addr - DC_LCB_CFG_RUN) >> 3;
-       if (do_8051_command(dd, HCMD_WRITE_LCB_CSR, lcb_index, &data) !=
-           HCMD_SUCCESS)
-               return -EBUSY;
-
-       return 0;
-}
-
-/*
- * Write an LCB CSR.  Access may not be in host control, so check.
- * Return 0 on success, -EBUSY on failure.
- */
-int write_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 data)
-{
-       struct hfi1_pportdata *port = dd->pport;
-
-       /* if up, go through the 8051 to do the write */
-       if (port->host_link_state & HLS_UP)
-               return write_lcb_via_8051(dd, addr, data);
-
-       if (!(port->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE))) {
-               /* not going up or down: host has access */
-               write_csr(dd, addr, data);
-               return 0;
-       }
-
-       /* going up or down, no access */
-       return -EBUSY;
-}
-
-/*
- * Returns:
- *     < 0 = Linux error, not able to get access
- *     > 0 = 8051 command RETURN_CODE
- */
-static int do_8051_command(
-       struct hfi1_devdata *dd,
-       u32 type,
-       u64 in_data,
-       u64 *out_data)
-{
-       u64 reg, completed;
-       int return_code;
-       unsigned long flags;
-       unsigned long timeout;
-
-       hfi1_cdbg(DC8051, "type %d, data 0x%012llx", type, in_data);
-
-       /*
-        * Alternative to holding the lock for a long time:
-        * - keep busy wait - have other users bounce off
-        */
-       spin_lock_irqsave(&dd->dc8051_lock, flags);
-
-       /* We can't send any commands to the 8051 if it's in reset */
-       if (dd->dc_shutdown) {
-               return_code = -ENODEV;
-               goto fail;
-       }
-
-       /*
-        * If an 8051 host command timed out previously, then the 8051 is
-        * stuck.
-        *
-        * On first timeout, attempt to reset and restart the entire DC
-        * block (including 8051). (Is this too big of a hammer?)
-        *
-        * If the 8051 times out a second time, the reset did not bring it
-        * back to healthy life. In that case, fail any subsequent commands.
-        */
-       if (dd->dc8051_timed_out) {
-               if (dd->dc8051_timed_out > 1) {
-                       dd_dev_err(dd,
-                                  "Previous 8051 host command timed out, skipping command %u\n",
-                                  type);
-                       return_code = -ENXIO;
-                       goto fail;
-               }
-               spin_unlock_irqrestore(&dd->dc8051_lock, flags);
-               dc_shutdown(dd);
-               dc_start(dd);
-               spin_lock_irqsave(&dd->dc8051_lock, flags);
-       }
-
-       /*
-        * If there is no timeout, then the 8051 command interface is
-        * waiting for a command.
-        */
-
-       /*
-        * When writing a LCB CSR, out_data contains the full value to
-        * to be written, while in_data contains the relative LCB
-        * address in 7:0.  Do the work here, rather than the caller,
-        * of distrubting the write data to where it needs to go:
-        *
-        * Write data
-        *   39:00 -> in_data[47:8]
-        *   47:40 -> DC8051_CFG_EXT_DEV_0.RETURN_CODE
-        *   63:48 -> DC8051_CFG_EXT_DEV_0.RSP_DATA
-        */
-       if (type == HCMD_WRITE_LCB_CSR) {
-               in_data |= ((*out_data) & 0xffffffffffull) << 8;
-               reg = ((((*out_data) >> 40) & 0xff) <<
-                               DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT)
-                     | ((((*out_data) >> 48) & 0xffff) <<
-                               DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT);
-               write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, reg);
-       }
-
-       /*
-        * Do two writes: the first to stabilize the type and req_data, the
-        * second to activate.
-        */
-       reg = ((u64)type & DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_MASK)
-                       << DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_SHIFT
-               | (in_data & DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_MASK)
-                       << DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_SHIFT;
-       write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg);
-       reg |= DC_DC8051_CFG_HOST_CMD_0_REQ_NEW_SMASK;
-       write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg);
-
-       /* wait for completion, alternate: interrupt */
-       timeout = jiffies + msecs_to_jiffies(DC8051_COMMAND_TIMEOUT);
-       while (1) {
-               reg = read_csr(dd, DC_DC8051_CFG_HOST_CMD_1);
-               completed = reg & DC_DC8051_CFG_HOST_CMD_1_COMPLETED_SMASK;
-               if (completed)
-                       break;
-               if (time_after(jiffies, timeout)) {
-                       dd->dc8051_timed_out++;
-                       dd_dev_err(dd, "8051 host command %u timeout\n", type);
-                       if (out_data)
-                               *out_data = 0;
-                       return_code = -ETIMEDOUT;
-                       goto fail;
-               }
-               udelay(2);
-       }
-
-       if (out_data) {
-               *out_data = (reg >> DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_SHIFT)
-                               & DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_MASK;
-               if (type == HCMD_READ_LCB_CSR) {
-                       /* top 16 bits are in a different register */
-                       *out_data |= (read_csr(dd, DC_DC8051_CFG_EXT_DEV_1)
-                               & DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SMASK)
-                               << (48
-                                   - DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT);
-               }
-       }
-       return_code = (reg >> DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_SHIFT)
-                               & DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_MASK;
-       dd->dc8051_timed_out = 0;
-       /*
-        * Clear command for next user.
-        */
-       write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, 0);
-
-fail:
-       spin_unlock_irqrestore(&dd->dc8051_lock, flags);
-
-       return return_code;
-}
-
-/*
- * Request a physical link state change from the 8051 by issuing the
- * HCMD_CHANGE_PHY_STATE host command with @state as the request data.
- * No response data is collected.
- *
- * Returns the do_8051_command() result.
- */
-static int set_physical_link_state(struct hfi1_devdata *dd, u64 state)
-{
-       int rc;
-
-       rc = do_8051_command(dd, HCMD_CHANGE_PHY_STATE, state, NULL);
-       return rc;
-}
-
-/*
- * Load one 8051 configuration field.
- *
- * The field id, lane id and 32-bit payload are packed into a single
- * request word and passed to the 8051 with HCMD_LOAD_CONFIG_DATA.
- * Any result other than HCMD_SUCCESS is logged.
- *
- * Returns the do_8051_command() result.
- */
-int load_8051_config(struct hfi1_devdata *dd, u8 field_id,
-                    u8 lane_id, u32 config_data)
-{
-       u64 req;
-       int rc;
-
-       req = (u64)field_id << LOAD_DATA_FIELD_ID_SHIFT;
-       req |= (u64)lane_id << LOAD_DATA_LANE_ID_SHIFT;
-       req |= (u64)config_data << LOAD_DATA_DATA_SHIFT;
-
-       rc = do_8051_command(dd, HCMD_LOAD_CONFIG_DATA, req, NULL);
-       if (rc == HCMD_SUCCESS)
-               return rc;
-
-       dd_dev_err(dd,
-                  "load 8051 config: field id %d, lane %d, err %d\n",
-                  (int)field_id, (int)lane_id, rc);
-       return rc;
-}
-
-/*
- * Fetch one 32-bit 8051 firmware "register" directly from 8051 RAM.
- *
- * @result is always written: the field value on success, 0 on failure.
- * Return 0 on success, -errno on failure
- */
-int read_8051_config(struct hfi1_devdata *dd, u8 field_id, u8 lane_id,
-                    u32 *result)
-{
-       u64 chunk;
-       u32 offset = 0;
-       int rc;
-
-       /* lane ids below 4 start after the general fields */
-       if (lane_id < 4)
-               offset = (4 * NUM_GENERAL_FIELDS)
-                       + (lane_id * 4 * NUM_LANE_FIELDS);
-       offset += field_id * 4;
-
-       /* read is in 8-byte chunks, hardware will truncate the address down */
-       rc = read_8051_data(dd, offset, 8, &chunk);
-       if (rc != 0) {
-               *result = 0;
-               dd_dev_err(dd, "%s: direct read failed, lane %d, field %d!\n",
-                          __func__, lane_id, field_id);
-               return rc;
-       }
-
-       /* pick the half of the 8-byte chunk that holds our 4 bytes */
-       *result = (offset & 0x4) ? (u32)(chunk >> 32) : (u32)chunk;
-       return rc;
-}
-
-/*
- * Build the local verify capability PHY frame from the power management
- * and continuous remote update settings and load it into the 8051
- * general configuration area.
- *
- * Returns the load_8051_config() result.
- */
-static int write_vc_local_phy(struct hfi1_devdata *dd, u8 power_management,
-                             u8 continuous)
-{
-       u32 frame;
-
-       /*
-        * Widen to u32 before shifting, as the other frame writers do.
-        * A bare u8 is promoted to signed int, and shifting a set bit
-        * into the sign position is undefined.
-        */
-       frame = (u32)continuous << CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT
-               | (u32)power_management << POWER_MANAGEMENT_SHIFT;
-       return load_8051_config(dd, VERIFY_CAP_LOCAL_PHY,
-                               GENERAL_CONFIG, frame);
-}
-
-/*
- * Build the local verify capability fabric frame from its individual
- * fields and load it into the 8051 general configuration area.
- *
- * Returns the load_8051_config() result.
- */
-static int write_vc_local_fabric(struct hfi1_devdata *dd, u8 vau, u8 z, u8 vcu,
-                                u16 vl15buf, u8 crc_sizes)
-{
-       u32 word = 0;
-
-       word |= (u32)vau << VAU_SHIFT;
-       word |= (u32)z << Z_SHIFT;
-       word |= (u32)vcu << VCU_SHIFT;
-       word |= (u32)vl15buf << VL15BUF_SHIFT;
-       word |= (u32)crc_sizes << CRC_SIZES_SHIFT;
-
-       return load_8051_config(dd, VERIFY_CAP_LOCAL_FABRIC,
-                               GENERAL_CONFIG, word);
-}
-
-/*
- * Read the local verify capability link width frame and split it into
- * its misc bits, flag bits and link widths.  The read status is not
- * checked here.
- */
-static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits,
-                                    u8 *flag_bits, u16 *link_widths)
-{
-       u32 word;
-
-       read_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG,
-                        &word);
-
-       *misc_bits = MISC_CONFIG_BITS_MASK & (word >> MISC_CONFIG_BITS_SHIFT);
-       *flag_bits = LOCAL_FLAG_BITS_MASK & (word >> LOCAL_FLAG_BITS_SHIFT);
-       *link_widths = LINK_WIDTH_MASK & (word >> LINK_WIDTH_SHIFT);
-}
-
-/*
- * Build the local verify capability link width frame from the misc
- * bits, flag bits and link widths, then load it into the 8051 general
- * configuration area.
- *
- * Returns the load_8051_config() result.
- */
-static int write_vc_local_link_width(struct hfi1_devdata *dd,
-                                    u8 misc_bits,
-                                    u8 flag_bits,
-                                    u16 link_widths)
-{
-       u32 word = 0;
-
-       word |= (u32)misc_bits << MISC_CONFIG_BITS_SHIFT;
-       word |= (u32)flag_bits << LOCAL_FLAG_BITS_SHIFT;
-       word |= (u32)link_widths << LINK_WIDTH_SHIFT;
-
-       return load_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG,
-                               word);
-}
-
-/*
- * Load the local device id and revision into the 8051 general
- * configuration area.
- *
- * Returns the load_8051_config() result.
- */
-static int write_local_device_id(struct hfi1_devdata *dd, u16 device_id,
-                                u8 device_rev)
-{
-       u32 id_bits = (u32)device_id << LOCAL_DEVICE_ID_SHIFT;
-       u32 rev_bits = (u32)device_rev << LOCAL_DEVICE_REV_SHIFT;
-
-       return load_8051_config(dd, LOCAL_DEVICE_ID, GENERAL_CONFIG,
-                               id_bits | rev_bits);
-}
-
-/*
- * Read the REMOTE_DEVICE_ID field and split it into the device id and
- * device revision.  The read status is not checked here.
- */
-static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id,
-                                 u8 *device_rev)
-{
-       u32 word;
-
-       read_8051_config(dd, REMOTE_DEVICE_ID, GENERAL_CONFIG, &word);
-
-       *device_id = REMOTE_DEVICE_ID_MASK & (word >> REMOTE_DEVICE_ID_SHIFT);
-       *device_rev = REMOTE_DEVICE_REV_MASK &
-                       (word >> REMOTE_DEVICE_REV_SHIFT);
-}
-
-/*
- * Read the MISC_STATUS field and extract the two STS_FM_VERSION
- * sub-fields into @ver_a and @ver_b.  The read status is not checked
- * here.
- */
-void read_misc_status(struct hfi1_devdata *dd, u8 *ver_a, u8 *ver_b)
-{
-       u32 word;
-
-       read_8051_config(dd, MISC_STATUS, GENERAL_CONFIG, &word);
-
-       *ver_a = STS_FM_VERSION_A_MASK & (word >> STS_FM_VERSION_A_SHIFT);
-       *ver_b = STS_FM_VERSION_B_MASK & (word >> STS_FM_VERSION_B_SHIFT);
-}
-
-/*
- * Read the remote verify capability PHY frame and extract the power
- * management and continuous remote update fields.  The read status is
- * not checked here.
- */
-static void read_vc_remote_phy(struct hfi1_devdata *dd, u8 *power_management,
-                              u8 *continuous)
-{
-       u32 word;
-
-       read_8051_config(dd, VERIFY_CAP_REMOTE_PHY, GENERAL_CONFIG, &word);
-
-       *power_management = POWER_MANAGEMENT_MASK &
-                               (word >> POWER_MANAGEMENT_SHIFT);
-       *continuous = CONTINIOUS_REMOTE_UPDATE_SUPPORT_MASK &
-                       (word >> CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT);
-}
-
-/*
- * Read the remote verify capability fabric frame and split it into its
- * vau, z, vcu, vl15buf and crc_sizes fields.  The read status is not
- * checked here.
- */
-static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z,
-                                 u8 *vcu, u16 *vl15buf, u8 *crc_sizes)
-{
-       u32 word;
-
-       read_8051_config(dd, VERIFY_CAP_REMOTE_FABRIC, GENERAL_CONFIG, &word);
-
-       *vau = VAU_MASK & (word >> VAU_SHIFT);
-       *z = Z_MASK & (word >> Z_SHIFT);
-       *vcu = VCU_MASK & (word >> VCU_SHIFT);
-       *vl15buf = VL15BUF_MASK & (word >> VL15BUF_SHIFT);
-       *crc_sizes = CRC_SIZES_MASK & (word >> CRC_SIZES_SHIFT);
-}
-
-/*
- * Read the remote verify capability link width frame and extract the
- * remote tx rate and link widths.  The read status is not checked here.
- */
-static void read_vc_remote_link_width(struct hfi1_devdata *dd,
-                                     u8 *remote_tx_rate,
-                                     u16 *link_widths)
-{
-       u32 word;
-
-       read_8051_config(dd, VERIFY_CAP_REMOTE_LINK_WIDTH, GENERAL_CONFIG,
-                        &word);
-
-       *remote_tx_rate = REMOTE_TX_RATE_MASK &
-                               (word >> REMOTE_TX_RATE_SHIFT);
-       *link_widths = LINK_WIDTH_MASK & (word >> LINK_WIDTH_SHIFT);
-}
-
-/*
- * Read the LOCAL_LNI_INFO field and extract the enable lane rx bits.
- * The read status is not checked here.
- */
-static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx)
-{
-       u32 word;
-
-       read_8051_config(dd, LOCAL_LNI_INFO, GENERAL_CONFIG, &word);
-       word >>= ENABLE_LANE_RX_SHIFT;
-       *enable_lane_rx = word & ENABLE_LANE_RX_MASK;
-}
-
-/*
- * Read the REMOTE_LNI_INFO field and extract the mgmt allowed bits.
- * The read status is not checked here.
- */
-static void read_mgmt_allowed(struct hfi1_devdata *dd, u8 *mgmt_allowed)
-{
-       u32 word;
-
-       read_8051_config(dd, REMOTE_LNI_INFO, GENERAL_CONFIG, &word);
-       word >>= MGMT_ALLOWED_SHIFT;
-       *mgmt_allowed = word & MGMT_ALLOWED_MASK;
-}
-
-/*
- * Read the whole LAST_LOCAL_STATE_COMPLETE field into @lls.  The read
- * status is discarded.
- */
-static void read_last_local_state(struct hfi1_devdata *dd, u32 *lls)
-{
-       (void)read_8051_config(dd, LAST_LOCAL_STATE_COMPLETE,
-                              GENERAL_CONFIG, lls);
-}
-
-/*
- * Read the whole LAST_REMOTE_STATE_COMPLETE field into @lrs.  The read
- * status is discarded.
- */
-static void read_last_remote_state(struct hfi1_devdata *dd, u32 *lrs)
-{
-       (void)read_8051_config(dd, LAST_REMOTE_STATE_COMPLETE,
-                              GENERAL_CONFIG, lrs);
-}
-
-/*
- * Report the link quality in @link_quality.
- *
- * The value is 0 unless the host link state has an HLS_UP bit set and
- * the LINK_QUALITY_INFO field could be read.
- */
-void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality)
-{
-       u32 word;
-
-       *link_quality = 0;
-
-       if (!(dd->pport->host_link_state & HLS_UP))
-               return;
-
-       if (read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG,
-                            &word) != 0)
-               return;
-
-       *link_quality = (word >> LINK_QUALITY_SHIFT) & LINK_QUALITY_MASK;
-}
-
-/*
- * Read the LINK_QUALITY_INFO field and extract the down remote reason
- * bits into @pdrrc.  The read status is not checked here.
- */
-static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc)
-{
-       u32 word;
-
-       read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG, &word);
-       word >>= DOWN_REMOTE_REASON_SHIFT;
-       *pdrrc = word & DOWN_REMOTE_REASON_MASK;
-}
-
-/*
- * Read the LINK_DOWN_REASON field and return its low byte in @ldr.
- * The read status is not checked here.
- */
-static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr)
-{
-       u32 word;
-
-       read_8051_config(dd, LINK_DOWN_REASON, GENERAL_CONFIG, &word);
-       word &= 0xff;
-       *ldr = word;
-}
-
-/*
- * Read the TX_SETTINGS field and split it into the lane tx enables,
- * the tx and rx polarity inversion bits and the max rate.
- *
- * The outputs are written whether or not the read succeeded.
- * Returns the read_8051_config() result.
- */
-static int read_tx_settings(struct hfi1_devdata *dd,
-                           u8 *enable_lane_tx,
-                           u8 *tx_polarity_inversion,
-                           u8 *rx_polarity_inversion,
-                           u8 *max_rate)
-{
-       u32 word;
-       int rc = read_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, &word);
-
-       *enable_lane_tx = ENABLE_LANE_TX_MASK &
-                               (word >> ENABLE_LANE_TX_SHIFT);
-       *tx_polarity_inversion = TX_POLARITY_INVERSION_MASK &
-                               (word >> TX_POLARITY_INVERSION_SHIFT);
-       *rx_polarity_inversion = RX_POLARITY_INVERSION_MASK &
-                               (word >> RX_POLARITY_INVERSION_SHIFT);
-       *max_rate = MAX_RATE_MASK & (word >> MAX_RATE_SHIFT);
-
-       return rc;
-}
-
-/*
- * Build the TX_SETTINGS frame from the lane tx enables, the tx and rx
- * polarity inversion bits and the max rate, then load it into the 8051
- * general configuration area.
- *
- * Returns the load_8051_config() result.
- */
-static int write_tx_settings(struct hfi1_devdata *dd,
-                            u8 enable_lane_tx,
-                            u8 tx_polarity_inversion,
-                            u8 rx_polarity_inversion,
-                            u8 max_rate)
-{
-       u32 frame;
-
-       /*
-        * No need to mask, all variable sizes match field widths.  Do
-        * widen to u32 before shifting, as the other frame writers do:
-        * a bare u8 is promoted to signed int, and shifting a set bit
-        * into the sign position is undefined.
-        */
-       frame = (u32)enable_lane_tx << ENABLE_LANE_TX_SHIFT
-               | (u32)tx_polarity_inversion << TX_POLARITY_INVERSION_SHIFT
-               | (u32)rx_polarity_inversion << RX_POLARITY_INVERSION_SHIFT
-               | (u32)max_rate << MAX_RATE_SHIFT;
-       return load_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, frame);
-}
-
-/*
- * Log the SPICO firmware version and product id of each of the four
- * lanes.  A lane whose SPICO_FW_VERSION field cannot be read is
- * reported and skipped.
- */
-static void check_fabric_firmware_versions(struct hfi1_devdata *dd)
-{
-       u32 word, rom_version, rom_prod_id;
-       int lane = 0;
-
-       /* 4 lanes */
-       while (lane < 4) {
-               if (read_8051_config(dd, SPICO_FW_VERSION, lane, &word)) {
-                       dd_dev_err(dd,
-                                  "Unable to read lane %d firmware details\n",
-                                  lane);
-               } else {
-                       rom_version = SPICO_ROM_VERSION_MASK &
-                                       (word >> SPICO_ROM_VERSION_SHIFT);
-                       rom_prod_id = SPICO_ROM_PROD_ID_MASK &
-                                       (word >> SPICO_ROM_PROD_ID_SHIFT);
-                       dd_dev_info(dd,
-                                   "Lane %d firmware: version 0x%04x, prod_id 0x%04x\n",
-                                   lane, rom_version, rom_prod_id);
-               }
-               lane++;
-       }
-}
-
-/*
- * Read an idle LCB message.
- *
- * @type selects the message to read and is passed to the 8051 as the
- * HCMD_READ_LCB_IDLE_MSG request data.  @data_out must not be NULL: it
- * receives the response and, on success, is shifted down so that only
- * the payload remains.
- *
- * Returns 0 on success, -EINVAL on error
- */
-static int read_idle_message(struct hfi1_devdata *dd, u64 type, u64 *data_out)
-{
-       int ret;
-
-       ret = do_8051_command(dd, HCMD_READ_LCB_IDLE_MSG, type, data_out);
-       if (ret != HCMD_SUCCESS) {
-               /* the argument is cast to u32, so print it unsigned */
-               dd_dev_err(dd, "read idle message: type %u, err %d\n",
-                          (u32)type, ret);
-               return -EINVAL;
-       }
-       dd_dev_info(dd, "%s: read idle message 0x%llx\n", __func__, *data_out);
-       /* return only the payload as we already know the type */
-       *data_out >>= IDLE_PAYLOAD_SHIFT;
-       return 0;
-}
-
-/*
- * Read an idle SMA message.  To be done in response to a notification from
- * the 8051.  On success @data holds the message payload.
- *
- * Returns 0 on success, -EINVAL on error
- */
-static int read_idle_sma(struct hfi1_devdata *dd, u64 *data)
-{
-       u64 msg_type = (u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT;
-
-       return read_idle_message(dd, msg_type, data);
-}
-
-/*
- * Send an idle LCB message.  @data is passed to the 8051 unchanged as
- * the HCMD_SEND_LCB_IDLE_MSG request data.
- *
- * Returns 0 on success, -EINVAL on error
- */
-static int send_idle_message(struct hfi1_devdata *dd, u64 data)
-{
-       int rc;
-
-       dd_dev_info(dd, "%s: sending idle message 0x%llx\n", __func__, data);
-
-       rc = do_8051_command(dd, HCMD_SEND_LCB_IDLE_MSG, data, NULL);
-       if (rc == HCMD_SUCCESS)
-               return 0;
-
-       dd_dev_err(dd, "send idle message: data 0x%llx, err %d\n",
-                  data, rc);
-       return -EINVAL;
-}
-
-/*
- * Send an idle SMA message.  @message is masked to the payload width
- * and combined with the IDLE_SMA message type before being sent.
- *
- * Returns 0 on success, -EINVAL on error
- */
-int send_idle_sma(struct hfi1_devdata *dd, u64 message)
-{
-       u64 payload = (message & IDLE_PAYLOAD_MASK) << IDLE_PAYLOAD_SHIFT;
-       u64 msg_type = (u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT;
-
-       return send_idle_message(dd, payload | msg_type);
-}
-
-/*
- * Initialize the LCB then do a quick link up.  This may or may not be
- * in loopback.
- *
- * Steps, in order: lcb_shutdown(), loopback CSR setup when loopback is
- * set, release of the TX FIFO reset, the simulator-only loopback steps,
- * a pause for the peer when not in loopback, masking of LCB errors,
- * set_8051_lcb_access() and finally a PLS_QUICK_LINKUP request.
- *
- * If the request fails, set_host_lcb_access() is called and LCB errors
- * are unmasked again before returning.  A timeout in the simulator-only
- * wait returns -ETIMEDOUT immediately.
- *
- * return 0 on success, -errno on error
- */
-static int do_quick_linkup(struct hfi1_devdata *dd)
-{
-       u64 reg;
-       unsigned long timeout;
-       int ret;
-
-       lcb_shutdown(dd, 0);
-
-       if (loopback) {
-               /* LCB_CFG_LOOPBACK.VAL = 2 */
-               /* LCB_CFG_LANE_WIDTH.VAL = 0 */
-               write_csr(dd, DC_LCB_CFG_LOOPBACK,
-                         IB_PACKET_TYPE << DC_LCB_CFG_LOOPBACK_VAL_SHIFT);
-               write_csr(dd, DC_LCB_CFG_LANE_WIDTH, 0);
-       }
-
-       /* start the LCBs */
-       /* LCB_CFG_TX_FIFOS_RESET.VAL = 0 */
-       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0);
-
-       /* simulator only loopback steps */
-       if (loopback && dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
-               /* LCB_CFG_RUN.EN = 1 */
-               write_csr(dd, DC_LCB_CFG_RUN,
-                         1ull << DC_LCB_CFG_RUN_EN_SHIFT);
-
-               /* watch LCB_STS_LINK_TRANSFER_ACTIVE */
-               timeout = jiffies + msecs_to_jiffies(10); /* 10 ms budget */
-               while (1) {
-                       reg = read_csr(dd, DC_LCB_STS_LINK_TRANSFER_ACTIVE);
-                       if (reg)
-                               break;
-                       if (time_after(jiffies, timeout)) {
-                               dd_dev_err(dd,
-                                          "timeout waiting for LINK_TRANSFER_ACTIVE\n");
-                               return -ETIMEDOUT;
-                       }
-                       udelay(2);
-               }
-
-               write_csr(dd, DC_LCB_CFG_ALLOW_LINK_UP,
-                         1ull << DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT);
-       }
-
-       if (!loopback) {
-               /*
-                * When doing quick linkup and not in loopback, both
-                * sides must be done with LCB set-up before either
-                * starts the quick linkup.  Put a delay here so that
-                * both sides can be started and have a chance to be
-                * done with LCB set up before resuming.
-                */
-               dd_dev_err(dd,
-                          "Pausing for peer to be finished with LCB set up\n");
-               msleep(5000); /* 5 seconds */
-               dd_dev_err(dd, "Continuing with quick linkup\n");
-       }
-
-       write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */
-       set_8051_lcb_access(dd);
-
-       /*
-        * State "quick" LinkUp request sets the physical link state to
-        * LinkUp without a verify capability sequence.
-        * This state is in simulator v37 and later.
-        */
-       ret = set_physical_link_state(dd, PLS_QUICK_LINKUP);
-       if (ret != HCMD_SUCCESS) {
-               dd_dev_err(dd,
-                          "%s: set physical link state to quick LinkUp failed with return %d\n",
-                          __func__, ret);
-
-               set_host_lcb_access(dd);
-               write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */
-
-               if (ret >= 0) /* not an errno: report as -EINVAL */
-                       ret = -EINVAL;
-               return ret;
-       }
-
-       return 0; /* success */
-}
-
-/*
- * Set the SerDes to internal loopback mode by requesting the
- * PLS_INTERNAL_SERDES_LOOPBACK physical link state.
- *
- * Returns 0 on success, -errno on error.  A non-negative failure code
- * from the request is reported as -EINVAL.
- */
-static int set_serdes_loopback_mode(struct hfi1_devdata *dd)
-{
-       int rc = set_physical_link_state(dd, PLS_INTERNAL_SERDES_LOOPBACK);
-
-       if (rc != HCMD_SUCCESS) {
-               dd_dev_err(dd,
-                          "Set physical link state to SerDes Loopback failed with return %d\n",
-                          rc);
-               return (rc >= 0) ? -EINVAL : rc;
-       }
-       return 0;
-}
-
-/*
- * Do all special steps to set up loopback.
- *
- * Depending on the chip type (dd->icode) and the requested mode, this
- * may rewrite the loopback and quick_linkup settings.
- *
- * Returns 0 on success, the set_serdes_loopback_mode() result for
- * SerDes loopback, or -EINVAL for LCB loopback in emulation and for an
- * unrecognized loopback mode.
- */
-static int init_loopback(struct hfi1_devdata *dd)
-{
-       dd_dev_info(dd, "Entering loopback mode\n");
-
-       /* all loopbacks should disable self GUID check */
-       write_csr(dd, DC_DC8051_CFG_MODE,
-                 (read_csr(dd, DC_DC8051_CFG_MODE) | DISABLE_SELF_GUID_CHECK));
-
-       /*
-        * The simulator has only one loopback option - LCB.  Switch
-        * to that option, which includes quick link up.
-        *
-        * Accept all valid loopback values.
-        */
-       if ((dd->icode == ICODE_FUNCTIONAL_SIMULATOR) &&
-           (loopback == LOOPBACK_SERDES || loopback == LOOPBACK_LCB ||
-            loopback == LOOPBACK_CABLE)) {
-               loopback = LOOPBACK_LCB;
-               quick_linkup = 1;
-               return 0;
-       }
-
-       /* handle serdes loopback */
-       if (loopback == LOOPBACK_SERDES) {
-               /* internal serdes loopback needs quick linkup on RTL */
-               if (dd->icode == ICODE_RTL_SILICON)
-                       quick_linkup = 1;
-               return set_serdes_loopback_mode(dd);
-       }
-
-       /* LCB loopback - handled at poll time */
-       if (loopback == LOOPBACK_LCB) {
-               quick_linkup = 1; /* LCB is always quick linkup */
-
-               /* not supported in emulation due to emulation RTL changes */
-               if (dd->icode == ICODE_FPGA_EMULATION) {
-                       dd_dev_err(dd,
-                                  "LCB loopback not supported in emulation\n");
-                       return -EINVAL;
-               }
-               return 0;
-       }
-
-       /* external cable loopback requires no extra steps */
-       if (loopback == LOOPBACK_CABLE)
-               return 0;
-
-       dd_dev_err(dd, "Invalid loopback mode %d\n", loopback);
-       return -EINVAL;
-}
-
-/*
- * Translate from the OPA_LINK_WIDTH handed to us by the FM to bits
- * used in the Verify Capability link width attribute.
- *
- * Width NX maps to bit (N - 1) of the result, for N = 1 to 4.  Other
- * bits of @opa_widths are ignored.
- */
-static u16 opa_to_vc_link_widths(u16 opa_widths)
-{
-       u16 vc_widths = 0;
-
-       if (opa_widths & OPA_LINK_WIDTH_1X)
-               vc_widths |= 1 << (1 - 1);
-       if (opa_widths & OPA_LINK_WIDTH_2X)
-               vc_widths |= 1 << (2 - 1);
-       if (opa_widths & OPA_LINK_WIDTH_3X)
-               vc_widths |= 1 << (3 - 1);
-       if (opa_widths & OPA_LINK_WIDTH_4X)
-               vc_widths |= 1 << (4 - 1);
-
-       return vc_widths;
-}
-
-/*
- * Set link attributes before moving to polling: the tx settings, the
- * verify capability PHY, fabric and link width frames, and the local
- * device id.
- *
- * Returns 0 on success.  On failure the failing step's return value is
- * logged and returned.
- */
-static int set_local_link_attributes(struct hfi1_pportdata *ppd)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       u8 lanes_tx;
-       u8 tx_inv;
-       u8 rx_inv;
-       u8 rate;
-       int ret;
-
-       /* reset our fabric serdes to clear any lingering problems */
-       fabric_serdes_reset(dd);
-
-       /* read-modify-write: fetch the current tx settings first */
-       ret = read_tx_settings(dd, &lanes_tx, &tx_inv, &rx_inv,
-                              &ppd->local_tx_rate);
-       if (ret)
-               goto fail;
-
-       if (dd->dc8051_ver < dc8051_ver(0, 20)) {
-               /* set the tx rate to the fastest enabled */
-               rate = (ppd->link_speed_enabled & OPA_LINK_SPEED_25G) ? 1 : 0;
-       } else {
-               /* set the tx rate to all enabled */
-               rate = 0;
-               if (ppd->link_speed_enabled & OPA_LINK_SPEED_25G)
-                       rate |= 2;
-               if (ppd->link_speed_enabled & OPA_LINK_SPEED_12_5G)
-                       rate |= 1;
-       }
-       ppd->local_tx_rate = rate;
-
-       lanes_tx = 0xF; /* enable all four lanes */
-       ret = write_tx_settings(dd, lanes_tx, tx_inv, rx_inv,
-                               ppd->local_tx_rate);
-       if (ret != HCMD_SUCCESS)
-               goto fail;
-
-       /* DC supports continuous updates; no power management */
-       ret = write_vc_local_phy(dd, 0, 1);
-       if (ret != HCMD_SUCCESS)
-               goto fail;
-
-       /* z=1 in the next call: AU of 0 is not supported by the hardware */
-       ret = write_vc_local_fabric(dd, dd->vau, 1, dd->vcu, dd->vl15_init,
-                                   ppd->port_crc_mode_enabled);
-       if (ret != HCMD_SUCCESS)
-               goto fail;
-
-       ret = write_vc_local_link_width(dd, 0, 0,
-                       opa_to_vc_link_widths(ppd->link_width_enabled));
-       if (ret != HCMD_SUCCESS)
-               goto fail;
-
-       /* let peer know who we are */
-       ret = write_local_device_id(dd, dd->pcidev->device, dd->minrev);
-       if (ret != HCMD_SUCCESS)
-               goto fail;
-
-       return 0;
-
-fail:
-       dd_dev_err(dd,
-                  "Failed to set local link attributes, return 0x%x\n",
-                  ret);
-       return ret;
-}
-
-/*
- * Call this to start the link.
- *
- * Nothing is done, and 0 is returned, when the link is disabled or the
- * driver is not ready.  Otherwise the link is asked to move to
- * HLS_DN_POLL and the set_link_state() result is returned.
- */
-int start_link(struct hfi1_pportdata *ppd)
-{
-       int rc = 0;
-
-       if (!ppd->link_enabled)
-               dd_dev_info(ppd->dd,
-                           "%s: stopping link start because link is disabled\n",
-                           __func__);
-       else if (!ppd->driver_link_ready)
-               dd_dev_info(ppd->dd,
-                           "%s: stopping link start because driver is not ready\n",
-                           __func__);
-       else
-               rc = set_link_state(ppd, HLS_DN_POLL);
-
-       return rc;
-}
-
-/*
- * Poll this HFI's QSFP input CSR for up to 2000 ms.
- *
- * Polling stops as soon as the QSFP_HFI0_INT_N bit reads as zero, in
- * which case that bit is written to the matching QSFP clear CSR.  If
- * the time limit passes first, a message is logged and the function
- * returns.
- */
-static void wait_for_qsfp_init(struct hfi1_pportdata *ppd)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       unsigned long deadline;
-       u64 pins;
-
-       /*
-        * Check for QSFP interrupt for t_init (SFF 8679)
-        */
-       deadline = jiffies + msecs_to_jiffies(2000);
-       for (;;) {
-               pins = read_csr(dd, dd->hfi1_id ?
-                               ASIC_QSFP2_IN : ASIC_QSFP1_IN);
-               if ((pins & QSFP_HFI0_INT_N) == 0) {
-                       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_CLEAR :
-                                 ASIC_QSFP1_CLEAR, QSFP_HFI0_INT_N);
-                       return;
-               }
-               if (time_after(jiffies, deadline)) {
-                       dd_dev_info(dd, "%s: No IntN detected, reset complete\n",
-                                   __func__);
-                       return;
-               }
-               udelay(2);
-       }
-}
-
-/*
- * Set (@enable non-zero) or clear (@enable zero) the QSFP_HFI0_INT_N
- * bit in this HFI's QSFP mask CSR with a read-modify-write.
- */
-static void set_qsfp_int_n(struct hfi1_pportdata *ppd, u8 enable)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       u64 val;
-
-       val = read_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK);
-       if (!enable)
-               val &= ~(u64)QSFP_HFI0_INT_N;
-       else
-               val |= (u64)QSFP_HFI0_INT_N;
-       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK, val);
-}
-
-/*
- * Reset the QSFP.
- *
- * INT_N is masked for the duration.  The QSFP_HFI0_RESET_N bit is set
- * in the output enable CSR, cleared in the output CSR, held for 10 us
- * and set again.  After wait_for_qsfp_init() returns, INT_N is
- * unmasked.
- */
-void reset_qsfp(struct hfi1_pportdata *ppd)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       const u64 reset_bit = (u64)QSFP_HFI0_RESET_N;
-       u64 val;
-
-       /* Disable INT_N from triggering QSFP interrupts */
-       set_qsfp_int_n(ppd, 0);
-
-       /* Reset the QSFP: first set the reset bit in the OE CSR */
-       val = read_csr(dd, dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE);
-       val |= reset_bit;
-       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE, val);
-
-       /* clear the reset bit in the OUT CSR ... */
-       val = read_csr(dd,
-                      dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT);
-       val &= ~reset_bit;
-       write_csr(dd,
-                 dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT, val);
-
-       udelay(10);
-
-       /* ... and set it again */
-       val |= reset_bit;
-       write_csr(dd,
-                 dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT, val);
-
-       wait_for_qsfp_init(ppd);
-
-       /*
-        * Allow INT_N to trigger the QSFP interrupt to watch
-        * for alarms and warnings
-        */
-       set_qsfp_int_n(ppd, 1);
-}
-
-/*
- * Log every alarm or warning condition flagged in the QSFP interrupt
- * status bytes.
- *
- * @qsfp_interrupt_status must hold at least 9 bytes; bytes 0, 1 and
- * 3 through 8 are examined.  One message is logged per condition whose
- * alarm or warning bit is set.
- *
- * Always returns 0.
- */
-static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd,
-                                       u8 *qsfp_interrupt_status)
-{
-       static const struct {
-               u8 byte;                /* index into the status bytes */
-               unsigned int mask;      /* alarm and warning bits */
-               const char *what;       /* text logged if any bit is set */
-       } checks[] = {
-               { 0, QSFP_HIGH_TEMP_ALARM | QSFP_HIGH_TEMP_WARNING,
-                 "QSFP cable temperature too high" },
-               { 0, QSFP_LOW_TEMP_ALARM | QSFP_LOW_TEMP_WARNING,
-                 "QSFP cable temperature too low" },
-               { 1, QSFP_HIGH_VCC_ALARM | QSFP_HIGH_VCC_WARNING,
-                 "QSFP supply voltage too high" },
-               { 1, QSFP_LOW_VCC_ALARM | QSFP_LOW_VCC_WARNING,
-                 "QSFP supply voltage too low" },
-               /* Byte 2 is vendor specific */
-               { 3, QSFP_HIGH_POWER_ALARM | QSFP_HIGH_POWER_WARNING,
-                 "Cable RX channel 1/2 power too high" },
-               { 3, QSFP_LOW_POWER_ALARM | QSFP_LOW_POWER_WARNING,
-                 "Cable RX channel 1/2 power too low" },
-               { 4, QSFP_HIGH_POWER_ALARM | QSFP_HIGH_POWER_WARNING,
-                 "Cable RX channel 3/4 power too high" },
-               { 4, QSFP_LOW_POWER_ALARM | QSFP_LOW_POWER_WARNING,
-                 "Cable RX channel 3/4 power too low" },
-               { 5, QSFP_HIGH_BIAS_ALARM | QSFP_HIGH_BIAS_WARNING,
-                 "Cable TX channel 1/2 bias too high" },
-               { 5, QSFP_LOW_BIAS_ALARM | QSFP_LOW_BIAS_WARNING,
-                 "Cable TX channel 1/2 bias too low" },
-               { 6, QSFP_HIGH_BIAS_ALARM | QSFP_HIGH_BIAS_WARNING,
-                 "Cable TX channel 3/4 bias too high" },
-               { 6, QSFP_LOW_BIAS_ALARM | QSFP_LOW_BIAS_WARNING,
-                 "Cable TX channel 3/4 bias too low" },
-               { 7, QSFP_HIGH_POWER_ALARM | QSFP_HIGH_POWER_WARNING,
-                 "Cable TX channel 1/2 power too high" },
-               { 7, QSFP_LOW_POWER_ALARM | QSFP_LOW_POWER_WARNING,
-                 "Cable TX channel 1/2 power too low" },
-               { 8, QSFP_HIGH_POWER_ALARM | QSFP_HIGH_POWER_WARNING,
-                 "Cable TX channel 3/4 power too high" },
-               { 8, QSFP_LOW_POWER_ALARM | QSFP_LOW_POWER_WARNING,
-                 "Cable TX channel 3/4 power too low" },
-               /* Bytes 9-10 and 11-12 are reserved */
-               /* Bytes 13-15 are vendor specific */
-       };
-       struct hfi1_devdata *dd = ppd->dd;
-       unsigned int i;
-
-       for (i = 0; i < ARRAY_SIZE(checks); i++) {
-               if (qsfp_interrupt_status[checks[i].byte] & checks[i].mask)
-                       dd_dev_info(dd, "%s: %s\n", __func__,
-                                   checks[i].what);
-       }
-
-       return 0;
-}
-
-/*
- * QSFP work handler.  This routine will only be scheduled if the QSFP
- * module present is asserted.
- *
- * Restarts the DC, then acts on the two request flags in the work's
- * qsfp_data: cache_refresh_required re-tunes the SerDes and starts the
- * link; check_interrupt_flags reads the module's 16 interrupt status
- * bytes and logs any alarm or warning conditions.
- */
-void qsfp_event(struct work_struct *work)
-{
-       struct qsfp_data *qd = container_of(work, struct qsfp_data, qsfp_work);
-       struct hfi1_pportdata *ppd = qd->ppd;
-       struct hfi1_devdata *dd = ppd->dd;
-
-       /* Sanity check */
-       if (!qsfp_mod_present(ppd))
-               return;
-
-       /*
-        * Turn DC back on after cables has been
-        * re-inserted. Up until now, the DC has been in
-        * reset to save power.
-        */
-       dc_start(dd);
-
-       if (qd->cache_refresh_required) {
-               /* keep INT_N masked while waiting for the module */
-               set_qsfp_int_n(ppd, 0);
-               wait_for_qsfp_init(ppd);
-
-               /*
-                * Allow INT_N to trigger the QSFP interrupt to watch
-                * for alarms and warnings
-                */
-               set_qsfp_int_n(ppd, 1);
-
-               tune_serdes(ppd);
-               start_link(ppd);
-       }
-
-       if (qd->check_interrupt_flags) {
-               u8 status[16] = {0,};
-               unsigned long flags;
-
-               if (one_qsfp_read(ppd, dd->hfi1_id, 6,
-                                 &status[0], 16) == 16) {
-                       handle_qsfp_error_conditions(ppd, status);
-
-                       /* the flag is only cleared after a good read */
-                       spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
-                       ppd->qsfp_info.check_interrupt_flags = 0;
-                       spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
-                                              flags);
-               } else {
-                       dd_dev_info(dd,
-                                   "%s: Failed to read status of QSFP module\n",
-                                   __func__);
-               }
-       }
-}
-
-/*
- * Set up the QSFP interrupt CSRs for this HFI: mask the QSFP interrupt
- * source that belongs to the other HFI in CCEIntMask, clear and program
- * this HFI's QSFP mask, and program the invert CSR for the active low
- * INT_N and MODPRST_N pins.
- */
-static void init_qsfp_int(struct hfi1_devdata *dd)
-{
-       struct hfi1_pportdata *ppd = dd->pport;
-       /* disable QSFP1 interrupts for HFI1, QSFP2 interrupts for HFI0 */
-       const int src = dd->hfi1_id ? QSFP1_INT : QSFP2_INT;
-       u64 qsfp_mask = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N);
-       u64 cce_int_mask;
-
-       /*
-        * Qsfp1Int and Qsfp2Int are adjacent bits in the same CSR,
-        * therefore just one of QSFP1_INT/QSFP2_INT can be used to find
-        * the index of the appropriate CSR in the CCEIntMask CSR array
-        */
-       cce_int_mask = read_csr(dd, CCE_INT_MASK + (8 * (QSFP1_INT / 64)));
-       cce_int_mask &= ~((u64)1 << (src % 64));
-       write_csr(dd, CCE_INT_MASK + (8 * (src / 64)), cce_int_mask);
-
-       /* Clear current status to avoid spurious interrupts */
-       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_CLEAR : ASIC_QSFP1_CLEAR,
-                 qsfp_mask);
-       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK,
-                 qsfp_mask);
-
-       set_qsfp_int_n(ppd, 0);
-
-       /*
-        * Handle active low nature of INT_N and MODPRST_N pins: drop the
-        * MODPRST_N invert bit when a module is already present.
-        */
-       if (qsfp_mod_present(ppd))
-               qsfp_mask &= ~(u64)QSFP_HFI0_MODPRST_N;
-       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_INVERT : ASIC_QSFP1_INVERT,
-                 qsfp_mask);
-}
-
-/*
- * Do a one-time initialize of the LCB block.
- *
- * Nothing is written on the functional simulator.  Otherwise the LCB is
- * set for cclk loopback by a fixed sequence of CSR writes, bracketed by
- * asserting and then releasing the TX FIFO reset.
- */
-static void init_lcb(struct hfi1_devdata *dd)
-{
-       /* CSR writes, in the order they are issued */
-       const struct {
-               u32 csr;
-               u64 val;
-       } seq[] = {
-               { DC_LCB_CFG_TX_FIFOS_RESET,            0x01 },
-               { DC_LCB_CFG_LANE_WIDTH,                0x00 },
-               { DC_LCB_CFG_REINIT_AS_SLAVE,           0x00 },
-               { DC_LCB_CFG_CNT_FOR_SKIP_STALL,        0x110 },
-               { DC_LCB_CFG_CLK_CNTR,                  0x08 },
-               { DC_LCB_CFG_LOOPBACK,                  0x02 },
-               { DC_LCB_CFG_TX_FIFOS_RESET,            0x00 },
-       };
-       unsigned int i;
-
-       /* simulator does not correctly handle LCB cclk loopback, skip */
-       if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
-               return;
-
-       /* the DC has been reset earlier in the driver load */
-
-       for (i = 0; i < ARRAY_SIZE(seq); i++)
-               write_csr(dd, seq[i].csr, seq[i].val);
-}
-
-/*
- * Prepare the port and start the link: optionally enable extended PSN
- * receive, fill in a missing port GUID from the device base GUID, do the
- * one-time LCB init, set up loopback if requested, tune the SerDes and
- * start the link.
- *
- * Returns a negative value from init_loopback() on failure, otherwise
- * the result of start_link().
- */
-int bringup_serdes(struct hfi1_pportdata *ppd)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       u64 guid;
-       int ret;
-
-       if (HFI1_CAP_IS_KSET(EXTENDED_PSN))
-               add_rcvctrl(dd, RCV_CTRL_RCV_EXTENDED_PSN_ENABLE_SMASK);
-
-       /* a port without a GUID derives one from the base GUID, if set */
-       guid = ppd->guid;
-       if (guid == 0) {
-               if (dd->base_guid != 0)
-                       guid = dd->base_guid + ppd->port - 1;
-               ppd->guid = guid;
-       }
-
-       /* Set linkinit_reason on power up per OPA spec */
-       ppd->linkinit_reason = OPA_LINKINIT_REASON_LINKUP;
-
-       /* one-time init of the LCB */
-       init_lcb(dd);
-
-       if (loopback) {
-               ret = init_loopback(dd);
-               if (ret < 0)
-                       return ret;
-       }
-
-       /*
-        * Tune the SERDES to a ballpark setting for optimal signal and
-        * bit error rate.  This needs to be done before starting the link.
-        */
-       tune_serdes(ppd);
-
-       return start_link(ppd);
-}
-
-/*
- * Shut down the link and keep it down, then disable the port in the
- * receive control.
- */
-void hfi1_quiet_serdes(struct hfi1_pportdata *ppd)
-{
-       /*
-        * First turn off that the driver wants to allow the link to be
-        * up (driver_link_ready).  Then make sure the link is not
-        * automatically restarted (link_enabled).
-        */
-       ppd->driver_link_ready = 0;
-       ppd->link_enabled = 0;
-
-       /* record SMA_DISABLED as the reason, then go offline */
-       ppd->offline_disabled_reason =
-               HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED);
-       set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SMA_DISABLED, 0,
-                            OPA_LINKDOWN_REASON_SMA_DISABLED);
-       set_link_state(ppd, HLS_DN_OFFLINE);
-
-       /* disable the port */
-       clear_rcvctrl(ppd->dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
-}
-
-/*
- * Allocate the per-CPU RC counters (rc_acks, rc_qacks and
- * rc_delayed_comp) for every port of the device.
- *
- * The port data array is taken to start directly after the devdata
- * structure (dd + 1).
- *
- * Returns 0 on success, -ENOMEM as soon as any allocation for a port
- * fails.  Counters allocated before the failure are not released here;
- * presumably the caller's cleanup path frees them - TODO confirm.
- */
-static inline int init_cpu_counters(struct hfi1_devdata *dd)
-{
-       struct hfi1_pportdata *ppd;
-       int i;
-
-       ppd = (struct hfi1_pportdata *)(dd + 1);
-       for (i = 0; i < dd->num_pports; i++, ppd++) {
-               /*
-                * alloc_percpu() yields NULL on failure, so the pointers
-                * need no separate NULL initialization beforehand.
-                */
-               ppd->ibport_data.rvp.rc_acks = alloc_percpu(u64);
-               ppd->ibport_data.rvp.rc_qacks = alloc_percpu(u64);
-               ppd->ibport_data.rvp.rc_delayed_comp = alloc_percpu(u64);
-               if (!ppd->ibport_data.rvp.rc_acks ||
-                   !ppd->ibport_data.rvp.rc_delayed_comp ||
-                   !ppd->ibport_data.rvp.rc_qacks)
-                       return -ENOMEM;
-       }
-
-       return 0;
-}
-
-/* printable names of the receive array entry types, indexed by type */
-static const char * const pt_names[] = {
-       "expected",
-       "eager",
-       "invalid"
-};
-
-/* return the name of a receive array entry type, "unknown" if out of range */
-static const char *pt_name(u32 type)
-{
-       if (type < ARRAY_SIZE(pt_names))
-               return pt_names[type];
-       return "unknown";
-}
-
-/*
- * index is the index into the receive array
- */
-void hfi1_put_tid(struct hfi1_devdata *dd, u32 index,
-                 u32 type, unsigned long pa, u16 order)
-{
-       u64 reg;
-       void __iomem *base = (dd->rcvarray_wc ? dd->rcvarray_wc :
-                             (dd->kregbase + RCV_ARRAY));
-
-       if (!(dd->flags & HFI1_PRESENT))
-               goto done;
-
-       if (type == PT_INVALID) {
-               pa = 0;
-       } else if (type > PT_INVALID) {
-               dd_dev_err(dd,
-                          "unexpected receive array type %u for index %u, not handled\n",
-                          type, index);
-               goto done;
-       }
-
-       hfi1_cdbg(TID, "type %s, index 0x%x, pa 0x%lx, bsize 0x%lx",
-                 pt_name(type), index, pa, (unsigned long)order);
-
-#define RT_ADDR_SHIFT 12       /* 4KB kernel address boundary */
-       reg = RCV_ARRAY_RT_WRITE_ENABLE_SMASK
-               | (u64)order << RCV_ARRAY_RT_BUF_SIZE_SHIFT
-               | ((pa >> RT_ADDR_SHIFT) & RCV_ARRAY_RT_ADDR_MASK)
-                                       << RCV_ARRAY_RT_ADDR_SHIFT;
-       writeq(reg, base + (index * 8));
-
-       if (type == PT_EAGER)
-               /*
-                * Eager entries are written one-by-one so we have to push them
-                * after we write the entry.
-                */
-               flush_wc();
-done:
-       return;
-}
-
-/*
- * Invalidate every receive array entry of a context: first the eager
- * range (eager_base, egrbufs.alloced entries), then the expected range
- * (expected_base, expected_count entries).
- */
-void hfi1_clear_tids(struct hfi1_ctxtdata *rcd)
-{
-       struct hfi1_devdata *dd = rcd->dd;
-       u32 idx;
-
-       /* this could be optimized */
-       idx = rcd->eager_base;
-       while (idx < rcd->eager_base + rcd->egrbufs.alloced) {
-               hfi1_put_tid(dd, idx, PT_INVALID, 0, 0);
-               idx++;
-       }
-
-       idx = rcd->expected_base;
-       while (idx < rcd->expected_base + rcd->expected_count) {
-               hfi1_put_tid(dd, idx, PT_INVALID, 0, 0);
-               idx++;
-       }
-}
-
-/*
- * Fill in kinfo->runtime_flags from the misc bits (shifted up by
- * HFI1_CAP_USER_SHIFT), the user capability mask and the K2U kernel
- * capability bits.  rcd is not used.  Always returns 0.
- */
-int hfi1_get_base_kinfo(struct hfi1_ctxtdata *rcd,
-                       struct hfi1_ctxt_info *kinfo)
-{
-       kinfo->runtime_flags =
-               (HFI1_MISC_GET() << HFI1_CAP_USER_SHIFT) |
-               HFI1_CAP_UGET(MASK) |
-               HFI1_CAP_KGET(K2U);
-
-       return 0;
-}
-
-/*
- * Locate the message header that belongs to a receive header flag (RHF)
- * pointer: step back by dd->rhf_offset and forward by the header queue
- * offset encoded in the RHF, both counted in __le32 units.
- */
-struct hfi1_message_header *hfi1_get_msgheader(
-                               struct hfi1_devdata *dd, __le32 *rhf_addr)
-{
-       u32 hdr_off = rhf_hdrq_offset(rhf_to_cpu(rhf_addr));
-       __le32 *start = rhf_addr - dd->rhf_offset;
-
-       return (struct hfi1_message_header *)(start + hdr_off);
-}
-
-/* printable names of the HFI1_IB_CFG_* items, indexed by item number */
-static const char * const ib_cfg_name_strings[] = {
-       "HFI1_IB_CFG_LIDLMC",
-       "HFI1_IB_CFG_LWID_DG_ENB",
-       "HFI1_IB_CFG_LWID_ENB",
-       "HFI1_IB_CFG_LWID",
-       "HFI1_IB_CFG_SPD_ENB",
-       "HFI1_IB_CFG_SPD",
-       "HFI1_IB_CFG_RXPOL_ENB",
-       "HFI1_IB_CFG_LREV_ENB",
-       "HFI1_IB_CFG_LINKLATENCY",
-       "HFI1_IB_CFG_HRTBT",
-       "HFI1_IB_CFG_OP_VLS",
-       "HFI1_IB_CFG_VL_HIGH_CAP",
-       "HFI1_IB_CFG_VL_LOW_CAP",
-       "HFI1_IB_CFG_OVERRUN_THRESH",
-       "HFI1_IB_CFG_PHYERR_THRESH",
-       "HFI1_IB_CFG_LINKDEFAULT",
-       "HFI1_IB_CFG_PKEYS",
-       "HFI1_IB_CFG_MTU",
-       "HFI1_IB_CFG_LSTATE",
-       "HFI1_IB_CFG_VL_HIGH_LIMIT",
-       "HFI1_IB_CFG_PMA_TICKS",
-       "HFI1_IB_CFG_PORT"
-};
-
-/* return the name of a config item, "invalid" when which is out of range */
-static const char *ib_cfg_name(int which)
-{
-       if (which >= 0 && which < ARRAY_SIZE(ib_cfg_name_strings))
-               return ib_cfg_name_strings[which];
-       return "invalid";
-}
-
-/*
- * Return the current value of the port configuration item selected by
- * which (an HFI1_IB_CFG_* value).
- *
- * Items that are not implemented, and unknown values of which, return 0;
- * an informational message is printed for them when the PRINT_UNIMPL
- * capability is set.
- */
-int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which)
-{
-       switch (which) {
-       case HFI1_IB_CFG_LWID_ENB: /* allowed Link-width */
-               return ppd->link_width_enabled;
-       case HFI1_IB_CFG_LWID: /* currently active Link-width */
-               return ppd->link_width_active;
-       case HFI1_IB_CFG_SPD_ENB: /* allowed Link speeds */
-               return ppd->link_speed_enabled;
-       case HFI1_IB_CFG_SPD: /* current Link speed */
-               return ppd->link_speed_active;
-       case HFI1_IB_CFG_OP_VLS:
-               return ppd->vls_operational;
-       case HFI1_IB_CFG_VL_HIGH_CAP: /* VL arb high priority table size */
-               return VL_ARB_HIGH_PRIO_TABLE_SIZE;
-       case HFI1_IB_CFG_VL_LOW_CAP: /* VL arb low priority table size */
-               return VL_ARB_LOW_PRIO_TABLE_SIZE;
-       case HFI1_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
-               return ppd->overrun_threshold;
-       case HFI1_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
-               return ppd->phy_error_threshold;
-       case HFI1_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
-               return ppd->dd->link_default;
-
-       /* everything below is not implemented */
-       case HFI1_IB_CFG_RXPOL_ENB: /* Auto-RX-polarity enable */
-       case HFI1_IB_CFG_LREV_ENB: /* Auto-Lane-reversal enable */
-       case HFI1_IB_CFG_LINKLATENCY:
-       case HFI1_IB_CFG_HRTBT: /* Heartbeat off/enable/auto */
-       case HFI1_IB_CFG_PMA_TICKS:
-       default:
-               break;
-       }
-
-       if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
-               dd_dev_info(ppd->dd,
-                           "%s: which %s: not implemented\n",
-                           __func__,
-                           ib_cfg_name(which));
-
-       return 0;
-}
-
-/*
- * The largest MAD packet size.
- */
-#define MAX_MAD_PACKET 2048
-
-/*
- * Return the maximum header bytes that can go on the _wire_
- * for this device. This count includes the ICRC which is
- * not part of the packet held in memory but it is appended
- * by the HW.
- * This is dependent on the device's receive header entry size.
- * HFI allows this to be set per-receive context, but the
- * driver presently enforces a global value.
- */
-u32 lrh_max_header_bytes(struct hfi1_devdata *dd)
-{
-       /*
-        * dd->rcd[0].rcvhdrqentsize is in DW.
-        * We use rcd[0] as all context will have the same value. Also,
-        * the first kernel context would have been allocated by now so
-        * we are guaranteed a valid value.
-        *
-        * The maximum non-payload (MTU) bytes in LRH.PktLen are
-        * the Receive Header Entry Size minus the PBC (or RHF) size
-        * plus one DW for the ICRC appended by HW.
-        */
-       u32 hdr_dws = dd->rcd[0]->rcvhdrqentsize - 2/*PBC/RHF*/ + 1/*ICRC*/;
-
-       /* convert DWs to bytes */
-       return hdr_dws << 2;
-}
-
-/*
- * Set Send Length
- * @ppd - per port data
- *
- * Set the MTU by limiting how many DWs may be sent.  The SendLenCheck*
- * registers compare against LRH.PktLen, so use the max bytes included
- * in the LRH.
- *
- * This routine changes all VL values except VL15, which it maintains at
- * the same value.
- *
- * Each length field is (VL MTU + max header bytes) converted to DWs.
- * len1 collects the fields for VLs 0-3 and is written to SEND_LEN_CHECK0;
- * len2 starts out with the VL15 field, collects the fields for VLs 4 and
- * up, and is written to SEND_LEN_CHECK1.
- *
- * Besides the length checks, the routine also:
- *  - sets the credit return threshold of each supported VL's send
- *    context, and of VL15's, to the smaller of the 50% threshold and
- *    the threshold derived from that VL's MTU
- *  - programs the MTU capability field of DCC_CFG_PORT_CONFIG from the
- *    largest MTU found among VL15 and the supported VLs
- */
-static void set_send_length(struct hfi1_pportdata *ppd)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       u32 max_hb = lrh_max_header_bytes(dd), dcmtu;
-       u32 maxvlmtu = dd->vld[15].mtu;
-       u64 len1 = 0, len2 = (((dd->vld[15].mtu + max_hb) >> 2)
-                             & SEND_LEN_CHECK1_LEN_VL15_MASK) <<
-               SEND_LEN_CHECK1_LEN_VL15_SHIFT;
-       int i;
-       u32 thres;
-
-       for (i = 0; i < ppd->vls_supported; i++) {
-               if (dd->vld[i].mtu > maxvlmtu)
-                       maxvlmtu = dd->vld[i].mtu;
-               if (i <= 3)
-                       len1 |= (((dd->vld[i].mtu + max_hb) >> 2)
-                                & SEND_LEN_CHECK0_LEN_VL0_MASK) <<
-                               ((i % 4) * SEND_LEN_CHECK0_LEN_VL1_SHIFT);
-               else
-                       len2 |= (((dd->vld[i].mtu + max_hb) >> 2)
-                                & SEND_LEN_CHECK1_LEN_VL4_MASK) <<
-                               ((i % 4) * SEND_LEN_CHECK1_LEN_VL5_SHIFT);
-       }
-       write_csr(dd, SEND_LEN_CHECK0, len1);
-       write_csr(dd, SEND_LEN_CHECK1, len2);
-       /* adjust kernel credit return thresholds based on new MTUs */
-       /* all kernel receive contexts have the same hdrqentsize */
-       for (i = 0; i < ppd->vls_supported; i++) {
-               thres = min(sc_percent_to_threshold(dd->vld[i].sc, 50),
-                           sc_mtu_to_threshold(dd->vld[i].sc,
-                                               dd->vld[i].mtu,
-                                               dd->rcd[0]->rcvhdrqentsize));
-               sc_set_cr_threshold(dd->vld[i].sc, thres);
-       }
-       thres = min(sc_percent_to_threshold(dd->vld[15].sc, 50),
-                   sc_mtu_to_threshold(dd->vld[15].sc,
-                                       dd->vld[15].mtu,
-                                       dd->rcd[0]->rcvhdrqentsize));
-       sc_set_cr_threshold(dd->vld[15].sc, thres);
-
-       /*
-        * Adjust maximum MTU for the port in DC.  10240 has its own
-        * encoding; any other MTU is encoded as ilog2(mtu / 256) + 1.
-        * len1 is reused below as scratch for the CSR value.
-        */
-       dcmtu = maxvlmtu == 10240 ? DCC_CFG_PORT_MTU_CAP_10240 :
-               (ilog2(maxvlmtu >> 8) + 1);
-       len1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG);
-       len1 &= ~DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK;
-       len1 |= ((u64)dcmtu & DCC_CFG_PORT_CONFIG_MTU_CAP_MASK) <<
-               DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT;
-       write_csr(ppd->dd, DCC_CFG_PORT_CONFIG, len1);
-}
-
-/*
- * Program the port's LID and LMC into the hardware.
- *
- * The LMC is turned into a mask with the low ppd->lmc bits cleared.  The
- * LID and that mask are written to the target DLID fields of
- * DCC_CFG_PORT_CONFIG1, to the SLID check CSR of every send context on
- * the chip, and are handed to the SDMA engines through sdma_update_lmc().
- */
-static void set_lidlmc(struct hfi1_pportdata *ppd)
-{
-       int i;
-       u64 sreg = 0;
-       struct hfi1_devdata *dd = ppd->dd;
-       u32 mask = ~((1U << ppd->lmc) - 1);
-       u64 c1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG1);
-
-       /* log messages must be newline terminated */
-       if (dd->hfi1_snoop.mode_flag)
-               dd_dev_info(dd, "Set lid/lmc while snooping\n");
-
-       /* replace the target DLID and DLID mask fields, keep the rest */
-       c1 &= ~(DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK
-               | DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK);
-       c1 |= ((ppd->lid & DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK)
-                       << DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT) |
-             ((mask & DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK)
-                       << DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT);
-       write_csr(ppd->dd, DCC_CFG_PORT_CONFIG1, c1);
-
-       /*
-        * Iterate over all the send contexts and set their SLID check
-        */
-       sreg = ((mask & SEND_CTXT_CHECK_SLID_MASK_MASK) <<
-                       SEND_CTXT_CHECK_SLID_MASK_SHIFT) |
-              (((ppd->lid & mask) & SEND_CTXT_CHECK_SLID_VALUE_MASK) <<
-                       SEND_CTXT_CHECK_SLID_VALUE_SHIFT);
-
-       for (i = 0; i < dd->chip_send_contexts; i++) {
-               hfi1_cdbg(LINKVERB, "SendContext[%d].SLID_CHECK = 0x%x",
-                         i, (u32)sreg);
-               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, sreg);
-       }
-
-       /* Now we have to do the same thing for the sdma engines */
-       sdma_update_lmc(dd, mask, ppd->lid);
-}
-
-/*
- * Poll the physical link state until it equals state or msecs
- * milliseconds have passed, sleeping about 2ms between reads.
- *
- * Returns 0 once the state is seen, -ETIMEDOUT otherwise.
- */
-static int wait_phy_linkstate(struct hfi1_devdata *dd, u32 state, u32 msecs)
-{
-       const unsigned long deadline = jiffies + msecs_to_jiffies(msecs);
-       u32 cur;
-
-       while ((cur = read_physical_state(dd)) != state) {
-               if (time_after(jiffies, deadline)) {
-                       dd_dev_err(dd,
-                                  "timeout waiting for phy link state 0x%x, current state is 0x%x\n",
-                                  state, cur);
-                       return -ETIMEDOUT;
-               }
-               usleep_range(1950, 2050); /* sleep 2ms-ish */
-       }
-
-       return 0;
-}
-
-/*
- * Helper for set_link_state().  Do not call except from that routine.
- * Expects ppd->hls_mutex to be held.
- *
- * @rem_reason value to be sent to the neighbor
- *
- * LinkDownReasons only set if transition succeeds.
- *
- * Returns 0 on success, -EINVAL when the request to move the physical
- * link state to Offline fails, or the error from wait_phy_linkstate() or
- * wait_fm_ready() when one of those waits fails.
- */
-static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       u32 pstate, previous_state;
-       u32 last_local_state;
-       u32 last_remote_state;
-       int ret;
-       int do_transition;
-       int do_wait;
-
-       previous_state = ppd->host_link_state;
-       ppd->host_link_state = HLS_GOING_OFFLINE;
-       pstate = read_physical_state(dd);
-       if (pstate == PLS_OFFLINE) {
-               do_transition = 0;      /* in right state */
-               do_wait = 0;            /* ...no need to wait */
-       } else if ((pstate & 0xff) == PLS_OFFLINE) {
-               do_transition = 0;      /* in an offline transient state */
-               do_wait = 1;            /* ...wait for it to settle */
-       } else {
-               do_transition = 1;      /* need to move to offline */
-               do_wait = 1;            /* ...will need to wait */
-       }
-
-       if (do_transition) {
-               /* the reason for the neighbor rides in byte 1 of the request */
-               ret = set_physical_link_state(dd,
-                                             (rem_reason << 8) | PLS_OFFLINE);
-
-               if (ret != HCMD_SUCCESS) {
-                       dd_dev_err(dd,
-                                  "Failed to transition to Offline link state, return %d\n",
-                                  ret);
-                       return -EINVAL;
-               }
-               if (ppd->offline_disabled_reason ==
-                               HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE))
-                       ppd->offline_disabled_reason =
-                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_TRANSIENT);
-       }
-
-       if (do_wait) {
-               /* it can take a while for the link to go down */
-               ret = wait_phy_linkstate(dd, PLS_OFFLINE, 10000);
-               if (ret < 0)
-                       return ret;
-       }
-
-       /* make sure the logical state is also down */
-       wait_logical_linkstate(ppd, IB_PORT_DOWN, 1000);
-
-       /*
-        * Now in charge of LCB - must be after the physical state is
-        * offline.quiet and before host_link_state is changed.
-        */
-       set_host_lcb_access(dd);
-       write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */
-       ppd->host_link_state = HLS_LINK_COOLDOWN; /* LCB access allowed */
-
-       if (ppd->port_type == PORT_TYPE_QSFP &&
-           ppd->qsfp_info.limiting_active &&
-           qsfp_mod_present(ppd)) {
-               /*
-                * The function-level ret is reused here rather than
-                * shadowed; it is assigned again by wait_fm_ready()
-                * below before it is next examined.
-                */
-               ret = acquire_chip_resource(dd, qsfp_resource(dd), QSFP_WAIT);
-               if (ret == 0) {
-                       set_qsfp_tx(ppd, 0);
-                       release_chip_resource(dd, qsfp_resource(dd));
-               } else {
-                       /* not fatal, but should warn */
-                       dd_dev_err(dd,
-                                  "Unable to acquire lock to turn off QSFP TX\n");
-               }
-       }
-
-       /*
-        * The LNI has a mandatory wait time after the physical state
-        * moves to Offline.Quiet.  The wait time may be different
-        * depending on how the link went down.  The 8051 firmware
-        * will observe the needed wait time and only move to ready
-        * when that is completed.  The largest of the quiet timeouts
-        * is 6s, so wait that long and then at least 0.5s more for
-        * other transitions, and another 0.5s for a buffer.
-        */
-       ret = wait_fm_ready(dd, 7000);
-       if (ret) {
-               dd_dev_err(dd,
-                          "After going offline, timed out waiting for the 8051 to become ready to accept host requests\n");
-               /* state is really offline, so make it so */
-               ppd->host_link_state = HLS_DN_OFFLINE;
-               return ret;
-       }
-
-       /*
-        * The state is now offline and the 8051 is ready to accept host
-        * requests.
-        *      - change our state
-        *      - notify others if we were previously in a linkup state
-        */
-       ppd->host_link_state = HLS_DN_OFFLINE;
-       if (previous_state & HLS_UP) {
-               /* went down while link was up */
-               handle_linkup_change(dd, 0);
-       } else if (previous_state
-                       & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) {
-               /* went down while attempting link up */
-               /* byte 1 of last_*_state is the failure reason */
-               read_last_local_state(dd, &last_local_state);
-               read_last_remote_state(dd, &last_remote_state);
-               dd_dev_err(dd,
-                          "LNI failure last states: local 0x%08x, remote 0x%08x\n",
-                          last_local_state, last_remote_state);
-       }
-
-       /* the active link width (downgrade) is 0 on link down */
-       ppd->link_width_active = 0;
-       ppd->link_width_downgrade_tx_active = 0;
-       ppd->link_width_downgrade_rx_active = 0;
-       ppd->current_egress_rate = 0;
-       return 0;
-}
-
-/*
- * Return the link state name for an HLS_* value, looked up by the bit
- * position of the state.  Positions without a name, or beyond the table,
- * yield "unknown".
- */
-static const char *link_state_name(u32 state)
-{
-       static const char * const names[] = {
-               [__HLS_UP_INIT_BP]       = "INIT",
-               [__HLS_UP_ARMED_BP]      = "ARMED",
-               [__HLS_UP_ACTIVE_BP]     = "ACTIVE",
-               [__HLS_DN_DOWNDEF_BP]    = "DOWNDEF",
-               [__HLS_DN_POLL_BP]       = "POLL",
-               [__HLS_DN_DISABLE_BP]    = "DISABLE",
-               [__HLS_DN_OFFLINE_BP]    = "OFFLINE",
-               [__HLS_VERIFY_CAP_BP]    = "VERIFY_CAP",
-               [__HLS_GOING_UP_BP]      = "GOING_UP",
-               [__HLS_GOING_OFFLINE_BP] = "GOING_OFFLINE",
-               [__HLS_LINK_COOLDOWN_BP] = "LINK_COOLDOWN"
-       };
-       int n = ilog2(state);
-
-       if (n < ARRAY_SIZE(names) && names[n])
-               return names[n];
-       return "unknown";
-}
-
-/*
- * Return the link state reason name.  Only HLS_UP_INIT carries a reason
- * (taken from ppd->linkinit_reason); every other state, and any reason
- * not listed, yields the empty string.
- */
-static const char *link_state_reason_name(struct hfi1_pportdata *ppd, u32 state)
-{
-       if (state != HLS_UP_INIT)
-               return "";
-
-       switch (ppd->linkinit_reason) {
-       case OPA_LINKINIT_REASON_LINKUP:
-               return "(LINKUP)";
-       case OPA_LINKINIT_REASON_FLAPPING:
-               return "(FLAPPING)";
-       case OPA_LINKINIT_OUTSIDE_POLICY:
-               return "(OUTSIDE_POLICY)";
-       case OPA_LINKINIT_QUARANTINED:
-               return "(QUARANTINED)";
-       case OPA_LINKINIT_INSUFIC_CAPABILITY:
-               return "(INSUFIC_CAPABILITY)";
-       default:
-               return "";
-       }
-}
-
-/*
- * driver_physical_state - convert the driver's notion of a port's
- * state (an HLS_*) into a physical state (a {IB,OPA}_PORTPHYSSTATE_*).
- * Return -1 (converted to a u32) to indicate error.
- */
-u32 driver_physical_state(struct hfi1_pportdata *ppd)
-{
-       switch (ppd->host_link_state) {
-       /* any of the up states */
-       case HLS_UP_INIT:
-       case HLS_UP_ARMED:
-       case HLS_UP_ACTIVE:
-               return IB_PORTPHYSSTATE_LINKUP;
-
-       /* polling, and the states on the way up */
-       case HLS_DN_POLL:
-       case HLS_VERIFY_CAP:
-       case HLS_GOING_UP:
-               return IB_PORTPHYSSTATE_POLLING;
-
-       case HLS_DN_DISABLE:
-               return IB_PORTPHYSSTATE_DISABLED;
-
-       /* offline, and the states on the way there */
-       case HLS_DN_OFFLINE:
-       case HLS_GOING_OFFLINE:
-       case HLS_LINK_COOLDOWN:
-               return OPA_PORTPHYSSTATE_OFFLINE;
-
-       case HLS_DN_DOWNDEF:
-       default:
-               dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n",
-                          ppd->host_link_state);
-               return -1;
-       }
-}
-
-/*
- * driver_logical_state - convert the driver's notion of a port's
- * state (an HLS_*) into a logical state (a IB_PORT_*). Return -1
- * (converted to a u32) to indicate error.
- */
-u32 driver_logical_state(struct hfi1_pportdata *ppd)
-{
-       /* a state is set, but none of the up bits: the port is down */
-       if (ppd->host_link_state != 0 &&
-           (ppd->host_link_state & HLS_UP) == 0)
-               return IB_PORT_DOWN;
-
-       switch (ppd->host_link_state & HLS_UP) {
-       case HLS_UP_INIT:
-               return IB_PORT_INIT;
-       case HLS_UP_ARMED:
-               return IB_PORT_ARMED;
-       case HLS_UP_ACTIVE:
-               return IB_PORT_ACTIVE;
-       default:
-               dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n",
-                          ppd->host_link_state);
-               return -1;
-       }
-}
-
-/*
- * Record the local, neighbor and remote link down reasons, but only when
- * neither a local nor a neighbor latest reason has been recorded yet;
- * otherwise the existing values are kept.
- */
-void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason,
-                         u8 neigh_reason, u8 rem_reason)
-{
-       /* an earlier reason is already recorded, leave it alone */
-       if (ppd->local_link_down_reason.latest != 0 ||
-           ppd->neigh_link_down_reason.latest != 0)
-               return;
-
-       ppd->local_link_down_reason.latest = lcl_reason;
-       ppd->neigh_link_down_reason.latest = neigh_reason;
-       ppd->remote_link_down_reason = rem_reason;
-}
-
-/*
- * Change the physical and/or logical link state.
- *
- * Do not call this routine while inside an interrupt.  It contains
- * calls to routines that can take multiple seconds to finish.
- *
- * Returns 0 on success, -errno on failure.
- */
-int set_link_state(struct hfi1_pportdata *ppd, u32 state)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       struct ib_event event = {.device = NULL};
-       int ret1, ret = 0;
-       int orig_new_state, poll_bounce;
-
-       mutex_lock(&ppd->hls_lock);
-
-       orig_new_state = state;
-       if (state == HLS_DN_DOWNDEF)
-               state = dd->link_default;
-
-       /* interpret poll -> poll as a link bounce */
-       poll_bounce = ppd->host_link_state == HLS_DN_POLL &&
-                     state == HLS_DN_POLL;
-
-       dd_dev_info(dd, "%s: current %s, new %s %s%s\n", __func__,
-                   link_state_name(ppd->host_link_state),
-                   link_state_name(orig_new_state),
-                   poll_bounce ? "(bounce) " : "",
-                   link_state_reason_name(ppd, state));
-
-       /*
-        * If we're going to a (HLS_*) link state that implies the logical
-        * link state is neither of (IB_PORT_ARMED, IB_PORT_ACTIVE), then
-        * reset is_sm_config_started to 0.
-        */
-       if (!(state & (HLS_UP_ARMED | HLS_UP_ACTIVE)))
-               ppd->is_sm_config_started = 0;
-
-       /*
-        * Do nothing if the states match.  Let a poll to poll link bounce
-        * go through.
-        */
-       if (ppd->host_link_state == state && !poll_bounce)
-               goto done;
-
-       switch (state) {
-       case HLS_UP_INIT:
-               if (ppd->host_link_state == HLS_DN_POLL &&
-                   (quick_linkup || dd->icode == ICODE_FUNCTIONAL_SIMULATOR)) {
-                       /*
-                        * Quick link up jumps from polling to here.
-                        *
-                        * Whether in normal or loopback mode, the
-                        * simulator jumps from polling to link up.
-                        * Accept that here.
-                        */
-                       /* OK */
-               } else if (ppd->host_link_state != HLS_GOING_UP) {
-                       goto unexpected;
-               }
-
-               ppd->host_link_state = HLS_UP_INIT;
-               ret = wait_logical_linkstate(ppd, IB_PORT_INIT, 1000);
-               if (ret) {
-                       /* logical state didn't change, stay at going_up */
-                       ppd->host_link_state = HLS_GOING_UP;
-                       dd_dev_err(dd,
-                                  "%s: logical state did not change to INIT\n",
-                                  __func__);
-               } else {
-                       /* clear old transient LINKINIT_REASON code */
-                       if (ppd->linkinit_reason >= OPA_LINKINIT_REASON_CLEAR)
-                               ppd->linkinit_reason =
-                                       OPA_LINKINIT_REASON_LINKUP;
-
-                       /* enable the port */
-                       add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
-
-                       handle_linkup_change(dd, 1);
-               }
-               break;
-       case HLS_UP_ARMED:
-               if (ppd->host_link_state != HLS_UP_INIT)
-                       goto unexpected;
-
-               ppd->host_link_state = HLS_UP_ARMED;
-               set_logical_state(dd, LSTATE_ARMED);
-               ret = wait_logical_linkstate(ppd, IB_PORT_ARMED, 1000);
-               if (ret) {
-                       /* logical state didn't change, stay at init */
-                       ppd->host_link_state = HLS_UP_INIT;
-                       dd_dev_err(dd,
-                                  "%s: logical state did not change to ARMED\n",
-                                  __func__);
-               }
-               /*
-                * The simulator does not currently implement SMA messages,
-                * so neighbor_normal is not set.  Set it here when we first
-                * move to Armed.
-                */
-               if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
-                       ppd->neighbor_normal = 1;
-               break;
-       case HLS_UP_ACTIVE:
-               if (ppd->host_link_state != HLS_UP_ARMED)
-                       goto unexpected;
-
-               ppd->host_link_state = HLS_UP_ACTIVE;
-               set_logical_state(dd, LSTATE_ACTIVE);
-               ret = wait_logical_linkstate(ppd, IB_PORT_ACTIVE, 1000);
-               if (ret) {
-                       /* logical state didn't change, stay at armed */
-                       ppd->host_link_state = HLS_UP_ARMED;
-                       dd_dev_err(dd,
-                                  "%s: logical state did not change to ACTIVE\n",
-                                  __func__);
-               } else {
-                       /* tell all engines to go running */
-                       sdma_all_running(dd);
-
-                       /* Signal the IB layer that the port has went active */
-                       event.device = &dd->verbs_dev.rdi.ibdev;
-                       event.element.port_num = ppd->port;
-                       event.event = IB_EVENT_PORT_ACTIVE;
-               }
-               break;
-       case HLS_DN_POLL:
-               if ((ppd->host_link_state == HLS_DN_DISABLE ||
-                    ppd->host_link_state == HLS_DN_OFFLINE) &&
-                   dd->dc_shutdown)
-                       dc_start(dd);
-               /* Hand LED control to the DC */
-               write_csr(dd, DCC_CFG_LED_CNTRL, 0);
-
-               if (ppd->host_link_state != HLS_DN_OFFLINE) {
-                       u8 tmp = ppd->link_enabled;
-
-                       ret = goto_offline(ppd, ppd->remote_link_down_reason);
-                       if (ret) {
-                               ppd->link_enabled = tmp;
-                               break;
-                       }
-                       ppd->remote_link_down_reason = 0;
-
-                       if (ppd->driver_link_ready)
-                               ppd->link_enabled = 1;
-               }
-
-               set_all_slowpath(ppd->dd);
-               ret = set_local_link_attributes(ppd);
-               if (ret)
-                       break;
-
-               ppd->port_error_action = 0;
-               ppd->host_link_state = HLS_DN_POLL;
-
-               if (quick_linkup) {
-                       /* quick linkup does not go into polling */
-                       ret = do_quick_linkup(dd);
-               } else {
-                       ret1 = set_physical_link_state(dd, PLS_POLLING);
-                       if (ret1 != HCMD_SUCCESS) {
-                               dd_dev_err(dd,
-                                          "Failed to transition to Polling link state, return 0x%x\n",
-                                          ret1);
-                               ret = -EINVAL;
-                       }
-               }
-               ppd->offline_disabled_reason =
-                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE);
-               /*
-                * If an error occurred above, go back to offline.  The
-                * caller may reschedule another attempt.
-                */
-               if (ret)
-                       goto_offline(ppd, 0);
-               break;
-       case HLS_DN_DISABLE:
-               /* link is disabled */
-               ppd->link_enabled = 0;
-
-               /* allow any state to transition to disabled */
-
-               /* must transition to offline first */
-               if (ppd->host_link_state != HLS_DN_OFFLINE) {
-                       ret = goto_offline(ppd, ppd->remote_link_down_reason);
-                       if (ret)
-                               break;
-                       ppd->remote_link_down_reason = 0;
-               }
-
-               ret1 = set_physical_link_state(dd, PLS_DISABLED);
-               if (ret1 != HCMD_SUCCESS) {
-                       dd_dev_err(dd,
-                                  "Failed to transition to Disabled link state, return 0x%x\n",
-                                  ret1);
-                       ret = -EINVAL;
-                       break;
-               }
-               ppd->host_link_state = HLS_DN_DISABLE;
-               dc_shutdown(dd);
-               break;
-       case HLS_DN_OFFLINE:
-               if (ppd->host_link_state == HLS_DN_DISABLE)
-                       dc_start(dd);
-
-               /* allow any state to transition to offline */
-               ret = goto_offline(ppd, ppd->remote_link_down_reason);
-               if (!ret)
-                       ppd->remote_link_down_reason = 0;
-               break;
-       case HLS_VERIFY_CAP:
-               if (ppd->host_link_state != HLS_DN_POLL)
-                       goto unexpected;
-               ppd->host_link_state = HLS_VERIFY_CAP;
-               break;
-       case HLS_GOING_UP:
-               if (ppd->host_link_state != HLS_VERIFY_CAP)
-                       goto unexpected;
-
-               ret1 = set_physical_link_state(dd, PLS_LINKUP);
-               if (ret1 != HCMD_SUCCESS) {
-                       dd_dev_err(dd,
-                                  "Failed to transition to link up state, return 0x%x\n",
-                                  ret1);
-                       ret = -EINVAL;
-                       break;
-               }
-               ppd->host_link_state = HLS_GOING_UP;
-               break;
-
-       case HLS_GOING_OFFLINE:         /* transient within goto_offline() */
-       case HLS_LINK_COOLDOWN:         /* transient within goto_offline() */
-       default:
-               dd_dev_info(dd, "%s: state 0x%x: not supported\n",
-                           __func__, state);
-               ret = -EINVAL;
-               break;
-       }
-
-       goto done;
-
-unexpected:
-       dd_dev_err(dd, "%s: unexpected state transition from %s to %s\n",
-                  __func__, link_state_name(ppd->host_link_state),
-                  link_state_name(state));
-       ret = -EINVAL;
-
-done:
-       mutex_unlock(&ppd->hls_lock);
-
-       if (event.device)
-               ib_dispatch_event(&event);
-
-       return ret;
-}
-
-/*
- * Apply a single port configuration item.
- *
- * @ppd:   port whose configuration is being changed
- * @which: HFI1_IB_CFG_* selector naming the item
- * @val:   requested value; selectors that only call a helper taking @ppd
- *         do not look at it
- *
- * Return: 0 on success (including selectors that have no handler here),
- * -EINVAL when the request is rejected.
- */
-int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val)
-{
-       u64 limit;
-
-       switch (which) {
-       case HFI1_IB_CFG_LIDLMC:
-               set_lidlmc(ppd);
-               break;
-       case HFI1_IB_CFG_VL_HIGH_LIMIT:
-               /*
-                * The VL Arbitrator high limit arrives in units of 4k
-                * bytes, while HFI stores it in units of 64 bytes, so
-                * rescale before masking it into the CSR field.
-                */
-               val *= 4096 / 64;
-               limit = (u64)val & SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK;
-               write_csr(ppd->dd, SEND_HIGH_PRIORITY_LIMIT,
-                         limit << SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT);
-               break;
-       case HFI1_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
-               /* POLL is the only default link down state HFI accepts */
-               if (val != HLS_DN_POLL)
-                       return -EINVAL;
-               break;
-       case HFI1_IB_CFG_OP_VLS:
-               /* nothing to do when the value is already in place */
-               if (ppd->vls_operational == val)
-                       break;
-               ppd->vls_operational = val;
-               if (!ppd->port)
-                       return -EINVAL;
-               break;
-       /*
-        * Link width, link width downgrade, and speed enable are always
-        * ANDed with what is actually supported.  That keeps unsupported
-        * values out of the enabled fields no matter what the SM or FM
-        * asks for, and it also handles the ALL_SUPPORTED wildcards:
-        * they have every bit of the field set, so the AND yields
-        * exactly the supported value.
-        */
-       case HFI1_IB_CFG_LWID_ENB: /* set allowed Link-width */
-               ppd->link_width_enabled = ppd->link_width_supported & val;
-               break;
-       case HFI1_IB_CFG_LWID_DG_ENB: /* set allowed link width downgrade */
-               ppd->link_width_downgrade_enabled =
-                               ppd->link_width_downgrade_supported & val;
-               break;
-       case HFI1_IB_CFG_SPD_ENB: /* allowed Link speeds */
-               ppd->link_speed_enabled = ppd->link_speed_supported & val;
-               break;
-       case HFI1_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
-               /*
-                * HFI does not follow IB specs here; the value is only
-                * remembered so it can be reported back if asked.
-                */
-               ppd->overrun_threshold = val;
-               break;
-       case HFI1_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
-               /*
-                * HFI does not follow IB specs here; the value is only
-                * remembered so it can be reported back if asked.
-                */
-               ppd->phy_error_threshold = val;
-               break;
-       case HFI1_IB_CFG_MTU:
-               set_send_length(ppd);
-               break;
-       case HFI1_IB_CFG_PKEYS:
-               /* keys are only programmed when PKEY_CHECK is enabled */
-               if (HFI1_CAP_IS_KSET(PKEY_CHECK))
-                       set_partition_keys(ppd);
-               break;
-       default:
-               /* unhandled selectors are not an error, just optionally logged */
-               if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
-                       dd_dev_info(ppd->dd,
-                                   "%s: which %s, val 0x%x: not implemented\n",
-                                   __func__, ib_cfg_name(which), val);
-               break;
-       }
-
-       return 0;
-}
-
-/* begin functions related to vl arbitration table caching */
-/*
- * Initialize the lock of every per-port VL arbitration table cache.
- */
-static void init_vl_arb_caches(struct hfi1_pportdata *ppd)
-{
-       int i = 0;
-
-       /* the cached table must be the same size as both priority tables */
-       BUILD_BUG_ON(VL_ARB_TABLE_SIZE != VL_ARB_HIGH_PRIO_TABLE_SIZE);
-       BUILD_BUG_ON(VL_ARB_TABLE_SIZE != VL_ARB_LOW_PRIO_TABLE_SIZE);
-
-       /*
-        * A 'Get(VLArbTable)' is always answered straight from
-        * 'vl_arb_cache', with no CSR reads.  After a 'Set(VLArbTable)'
-        * that is obviously correct because the cache is up to date.
-        * Before any 'Set(VLArbTable)' it is also correct, because both
-        * the cache and the relevant h/w registers are zeroed.
-        */
-       while (i < MAX_PRIO_TABLE) {
-               spin_lock_init(&ppd->vl_arb_cache[i].lock);
-               i++;
-       }
-}
-
-/*
- * vl_arb_lock_cache
- *
- * Take the lock of the arbitration cache selected by idx and hand the
- * cache back.  Every other vl_arb_* function expects the cache to be
- * locked first.
- *
- * Returns NULL, without locking anything, when idx is neither
- * LO_PRIO_TABLE nor HI_PRIO_TABLE.
- */
-static inline struct vl_arb_cache *
-vl_arb_lock_cache(struct hfi1_pportdata *ppd, int idx)
-{
-       struct vl_arb_cache *cache = NULL;
-
-       if (idx == LO_PRIO_TABLE || idx == HI_PRIO_TABLE) {
-               cache = &ppd->vl_arb_cache[idx];
-               spin_lock(&cache->lock);
-       }
-       return cache;
-}
-
-/* Release the lock of the arbitration cache selected by idx. */
-static inline void vl_arb_unlock_cache(struct hfi1_pportdata *ppd, int idx)
-{
-       struct vl_arb_cache *cache = &ppd->vl_arb_cache[idx];
-
-       spin_unlock(&cache->lock);
-}
-
-/* Copy the VL_ARB_TABLE_SIZE cached elements out to the caller's buffer. */
-static void vl_arb_get_cache(struct vl_arb_cache *cache,
-                            struct ib_vl_weight_elem *vl)
-{
-       const size_t nbytes = VL_ARB_TABLE_SIZE * sizeof(*vl);
-
-       memcpy(vl, cache->table, nbytes);
-}
-
-/* Overwrite the cached table with VL_ARB_TABLE_SIZE elements from vl. */
-static void vl_arb_set_cache(struct vl_arb_cache *cache,
-                            struct ib_vl_weight_elem *vl)
-{
-       const size_t nbytes = VL_ARB_TABLE_SIZE * sizeof(*vl);
-
-       memcpy(cache->table, vl, nbytes);
-}
-
-/*
- * Return 1 when the first VL_ARB_TABLE_SIZE elements of vl are byte-wise
- * equal to the cached table, 0 otherwise.
- */
-static int vl_arb_match_cache(struct vl_arb_cache *cache,
-                             struct ib_vl_weight_elem *vl)
-{
-       const size_t nbytes = VL_ARB_TABLE_SIZE * sizeof(*vl);
-
-       return memcmp(cache->table, vl, nbytes) == 0;
-}
-
-/* end functions related to vl arbitration table caching */
-
-/*
- * Write a VL arbitration list to the hardware.
- *
- * @ppd:    port being configured; its hls_lock is held for the duration
- * @target: CSR offset of the first list entry; entry i is written at
- *          target + 8 * i
- * @size:   number of entries to write
- * @vl:     the entries (VL and weight) to program
- *
- * Return: 0 on success, or the non-zero result of stop_drain_data_vls()
- * when the data VLs could not be stopped; in that case no entry is written.
- */
-static int set_vl_weights(struct hfi1_pportdata *ppd, u32 target,
-                         u32 size, struct ib_vl_weight_elem *vl)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       unsigned int idx;
-       int must_drain;
-       int ret = 0;
-
-       mutex_lock(&ppd->hls_lock);
-
-       /* draining is only done when is_ax() is false and the link is up */
-       must_drain = !is_ax(dd) && (ppd->host_link_state & HLS_UP);
-
-       if (must_drain) {
-               /*
-                * Empty the per-VL FIFOs before touching the weights.
-                * Otherwise a packet on a VL whose weight is being set
-                * to 0 could sit in a FIFO with no chance to egress.
-                */
-               ret = stop_drain_data_vls(dd);
-               if (ret) {
-                       dd_dev_err(
-                               dd,
-                               "%s: cannot stop/drain VLs - refusing to change VL arbitration weights\n",
-                               __func__);
-                       goto unlock;
-               }
-       }
-
-       for (idx = 0; idx < size; idx++) {
-               /*
-                * NOTE: the low priority masks and shifts are used for
-                * both lists; they are the same for the low and high
-                * registers.
-                */
-               u64 vl_bits = (u64)vl[idx].vl & SEND_LOW_PRIORITY_LIST_VL_MASK;
-               u64 weight_bits = (u64)vl[idx].weight
-                                       & SEND_LOW_PRIORITY_LIST_WEIGHT_MASK;
-
-               write_csr(dd, target + (idx * 8),
-                         (vl_bits << SEND_LOW_PRIORITY_LIST_VL_SHIFT)
-                         | (weight_bits
-                               << SEND_LOW_PRIORITY_LIST_WEIGHT_SHIFT));
-       }
-       pio_send_control(dd, PSC_GLOBAL_VLARB_ENABLE);
-
-       /* reopen all VLs if they were stopped above */
-       if (must_drain)
-               open_fill_data_vls(dd);
-
-unlock:
-       mutex_unlock(&ppd->hls_lock);
-
-       return ret;
-}
-
-/*
- * Read one credit merge VL register and store its dedicated and shared
- * limit fields, converted with cpu_to_be16(), into vll.
- */
-static void read_one_cm_vl(struct hfi1_devdata *dd, u32 csr,
-                          struct vl_limit *vll)
-{
-       const u64 raw = read_csr(dd, csr);
-       const u16 dedicated =
-               (raw >> SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT)
-               & SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_MASK;
-       const u16 shared =
-               (raw >> SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT)
-               & SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_MASK;
-
-       vll->dedicated = cpu_to_be16(dedicated);
-       vll->shared = cpu_to_be16(shared);
-}
-
-/*
- * Read the current credit merge limits.
- *
- * @dd:            device to read
- * @bc:            filled in with the per-VL and overall shared limits,
- *                 stored via cpu_to_be16(); entries that are not read
- *                 are left zeroed
- * @overall_limit: when not NULL, receives the total credit limit field
- *                 in CPU byte order
- *
- * Return: the size of struct buffer_control in bytes.
- */
-static int get_buffer_control(struct hfi1_devdata *dd,
-                             struct buffer_control *bc, u16 *overall_limit)
-{
-       u64 global;
-       int vl;
-
-       /* start from all zeroes; only some entries are filled in below */
-       memset(bc, 0, sizeof(*bc));
-
-       /* data VLs: OPA and HFI have a 1-1 mapping */
-       for (vl = 0; vl < TXE_NUM_DATA_VL; vl++)
-               read_one_cm_vl(dd, SEND_CM_CREDIT_VL + (8 * vl), &bc->vl[vl]);
-
-       /* NOTE: assumes that VL* and VL15 CSRs are bit-wise identical */
-       read_one_cm_vl(dd, SEND_CM_CREDIT_VL15, &bc->vl[15]);
-
-       global = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
-       bc->overall_shared_limit = cpu_to_be16(
-               (global >> SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT)
-               & SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_MASK);
-
-       if (!overall_limit)
-               return sizeof(*bc);
-
-       *overall_limit =
-               (global >> SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT)
-               & SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_MASK;
-       return sizeof(*bc);
-}
-
-/*
- * Read the SC->VLnt mapping table out of the hardware.
- *
- * @dd: device to read
- * @dp: receives the 32 mappings, one per vlnt[] entry
- *
- * Each of the two CSRs holds 16 mappings of 4 bits each.  The fields are
- * extracted by shifting the register value rather than by looking at the
- * bytes of the u64 in memory, so the result does not depend on the byte
- * order of the host.
- *
- * Return: the size of struct sc2vlnt in bytes.
- */
-static int get_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp)
-{
-       u64 reg;
-       int i;
-
-       /* entries 0-15: entry i lives in bits [4i+3:4i] */
-       reg = read_csr(dd, DCC_CFG_SC_VL_TABLE_15_0);
-       for (i = 0; i < 16; i++)
-               dp->vlnt[i] = (reg >> (4 * i)) & 0xf;
-
-       /* entries 16-31: same layout in the second register */
-       reg = read_csr(dd, DCC_CFG_SC_VL_TABLE_31_16);
-       for (i = 0; i < 16; i++)
-               dp->vlnt[16 + i] = (reg >> (4 * i)) & 0xf;
-
-       return sizeof(struct sc2vlnt);
-}
-
-/*
- * Fill nelems entries of vl with a fixed value: VL 0xf and weight 0.
- * Nothing is read from the device; dd is not used.
- */
-static void get_vlarb_preempt(struct hfi1_devdata *dd, u32 nelems,
-                             struct ib_vl_weight_elem *vl)
-{
-       unsigned int idx = 0;
-
-       while (idx < nelems) {
-               vl[idx].vl = 0xf;
-               vl[idx].weight = 0;
-               idx++;
-       }
-}
-
-/*
- * Write the SC->VLnt mapping table to the hardware.
- *
- * @dd: device to program
- * @dp: the 32 mappings to write; only the low 4 bits of each vlnt[]
- *      entry are passed on
- *
- * Entries 0-15 are combined by DC_SC_VL_VAL() into one value written to
- * DCC_CFG_SC_VL_TABLE_15_0, entries 16-31 into one value written to
- * DCC_CFG_SC_VL_TABLE_31_16.
- */
-static void set_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp)
-{
-       write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0,
-                 DC_SC_VL_VAL(15_0,
-                              0, dp->vlnt[0] & 0xf,
-                              1, dp->vlnt[1] & 0xf,
-                              2, dp->vlnt[2] & 0xf,
-                              3, dp->vlnt[3] & 0xf,
-                              4, dp->vlnt[4] & 0xf,
-                              5, dp->vlnt[5] & 0xf,
-                              6, dp->vlnt[6] & 0xf,
-                              7, dp->vlnt[7] & 0xf,
-                              8, dp->vlnt[8] & 0xf,
-                              9, dp->vlnt[9] & 0xf,
-                              10, dp->vlnt[10] & 0xf,
-                              11, dp->vlnt[11] & 0xf,
-                              12, dp->vlnt[12] & 0xf,
-                              13, dp->vlnt[13] & 0xf,
-                              14, dp->vlnt[14] & 0xf,
-                              15, dp->vlnt[15] & 0xf));
-       write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16,
-                 DC_SC_VL_VAL(31_16,
-                              16, dp->vlnt[16] & 0xf,
-                              17, dp->vlnt[17] & 0xf,
-                              18, dp->vlnt[18] & 0xf,
-                              19, dp->vlnt[19] & 0xf,
-                              20, dp->vlnt[20] & 0xf,
-                              21, dp->vlnt[21] & 0xf,
-                              22, dp->vlnt[22] & 0xf,
-                              23, dp->vlnt[23] & 0xf,
-                              24, dp->vlnt[24] & 0xf,
-                              25, dp->vlnt[25] & 0xf,
-                              26, dp->vlnt[26] & 0xf,
-                              27, dp->vlnt[27] & 0xf,
-                              28, dp->vlnt[28] & 0xf,
-                              29, dp->vlnt[29] & 0xf,
-                              30, dp->vlnt[30] & 0xf,
-                              31, dp->vlnt[31] & 0xf));
-}
-
-/*
- * Log an informational message when a limit given for VL idx is not zero.
- * what names the kind of limit in the message.
- */
-static void nonzero_msg(struct hfi1_devdata *dd, int idx, const char *what,
-                       u16 limit)
-{
-       if (!limit)
-               return;
-
-       dd_dev_info(dd, "Invalid %s limit %d on VL %d, ignoring\n",
-                   what, (int)limit, idx);
-}
-
-/*
- * Change only the shared limit portion of SendCmGlobalCredit: the CSR is
- * read, the shared limit field replaced, and the result written back.
- */
-static void set_global_shared(struct hfi1_devdata *dd, u16 limit)
-{
-       u64 val = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
-
-       val &= ~SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK;
-       val |= (u64)limit << SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT;
-       write_csr(dd, SEND_CM_GLOBAL_CREDIT, val);
-}
-
-/*
- * Change only the total credit limit portion of SendCmGlobalCredit: the
- * CSR is read, the total credit limit field replaced, and the result
- * written back.
- */
-static void set_global_limit(struct hfi1_devdata *dd, u16 limit)
-{
-       u64 val = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
-
-       val &= ~SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK;
-       val |= (u64)limit << SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT;
-       write_csr(dd, SEND_CM_GLOBAL_CREDIT, val);
-}
-
-/*
- * Set the given per-VL shared limit, leaving the rest of the VL's credit
- * CSR as read.  Any vl at or above TXE_NUM_DATA_VL selects the VL15 CSR.
- */
-static void set_vl_shared(struct hfi1_devdata *dd, int vl, u16 limit)
-{
-       const u32 csr = (vl < TXE_NUM_DATA_VL) ?
-                       SEND_CM_CREDIT_VL + (8 * vl) : SEND_CM_CREDIT_VL15;
-       u64 val = read_csr(dd, csr);
-
-       val &= ~SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SMASK;
-       val |= (u64)limit << SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT;
-       write_csr(dd, csr, val);
-}
-
-/*
- * Set the given per-VL dedicated limit, leaving the rest of the VL's credit
- * CSR as read.  Any vl at or above TXE_NUM_DATA_VL selects the VL15 CSR.
- */
-static void set_vl_dedicated(struct hfi1_devdata *dd, int vl, u16 limit)
-{
-       const u32 csr = (vl < TXE_NUM_DATA_VL) ?
-                       SEND_CM_CREDIT_VL + (8 * vl) : SEND_CM_CREDIT_VL15;
-       u64 val = read_csr(dd, csr);
-
-       val &= ~SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SMASK;
-       val |= (u64)limit << SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT;
-       write_csr(dd, csr, val);
-}
-
-/*
- * Spin until the given per-VL status mask bits clear.
- *
- * SEND_CM_CREDIT_USED_STATUS is polled, with a 1us delay between reads,
- * until none of the bits in mask are set or VL_STATUS_CLEAR_TIMEOUT
- * milliseconds have gone by.  A timeout is logged and otherwise ignored;
- * which names the kind of credit change in the log message.
- */
-static void wait_for_vl_status_clear(struct hfi1_devdata *dd, u64 mask,
-                                    const char *which)
-{
-       unsigned long deadline =
-               jiffies + msecs_to_jiffies(VL_STATUS_CLEAR_TIMEOUT);
-       u64 pending;
-
-       for (;;) {
-               pending = read_csr(dd, SEND_CM_CREDIT_USED_STATUS) & mask;
-               if (!pending)
-                       return; /* everything we care about has cleared */
-               if (time_after(jiffies, deadline))
-                       break;  /* gave up waiting */
-               udelay(1);
-       }
-
-       dd_dev_err(dd,
-                  "%s credit change status not clearing after %dms, mask 0x%llx, not clear 0x%llx\n",
-                  which, VL_STATUS_CLEAR_TIMEOUT, mask, pending);
-       /*
-        * Getting here most likely means credits were lost on the link,
-        * and the only recovery from that is a link bounce.
-        */
-       dd_dev_err(dd,
-                  "Continuing anyway.  A credit loss may occur.  Suggest a link bounce\n");
-}
-
-/*
- * The number of credits on the VLs may be changed while everything
- * is "live", but the following algorithm must be followed due to
- * how the hardware is actually implemented.  In particular,
- * Return_Credit_Status[] is the only correct status check.
- *
- * if (reducing Global_Shared_Credit_Limit or any shared limit changing)
- *     set Global_Shared_Credit_Limit = 0
- *     use_all_vl = 1
- * mask0 = all VLs that are changing either dedicated or shared limits
- * set Shared_Limit[mask0] = 0
- * spin until Return_Credit_Status[use_all_vl ? all VL : mask0] == 0
- * if (changing any dedicated limit)
- *     mask1 = all VLs that are lowering dedicated limits
- *     lower Dedicated_Limit[mask1]
- *     spin until Return_Credit_Status[mask1] == 0
- *     raise Dedicated_Limits
- * raise Shared_Limits
- * raise Global_Shared_Credit_Limit
- *
- * lower = if the new limit is lower, set the limit to the new value
- * raise = if the new limit is higher than the current value (may be changed
- *     earlier in the algorithm), set the new limit to the new value
- *
- * ppd: port whose credit limits are being changed
- * new_bc: the requested limits; its 16-bit fields are read with
- *     be16_to_cpu().  Entries for VLs other than the data VLs and VL15
- *     are zeroed in place (after a message if they were non-zero).
- *
- * Returns 0 on success, or the non-zero result of sdma_map_init() or
- * pio_map_init() when rebuilding a map fails after a change.
- */
-int set_buffer_control(struct hfi1_pportdata *ppd,
-                      struct buffer_control *new_bc)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       u64 changing_mask, ld_mask, stat_mask;
-       int change_count;
-       int i, use_all_mask;
-       int this_shared_changing;
-       int vl_count = 0, ret;
-       /*
-        * A0: add the variable any_shared_limit_changing below and in the
-        * algorithm above.  If removing A0 support, it can be removed.
-        */
-       int any_shared_limit_changing;
-       struct buffer_control cur_bc;
-       u8 changing[OPA_MAX_VLS];
-       u8 lowering_dedicated[OPA_MAX_VLS];
-       u16 cur_total;
-       u32 new_total = 0;
-       /* status bits of every VL this function can change */
-       const u64 all_mask =
-       SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK
-        | SEND_CM_CREDIT_USED_STATUS_VL1_RETURN_CREDIT_STATUS_SMASK
-        | SEND_CM_CREDIT_USED_STATUS_VL2_RETURN_CREDIT_STATUS_SMASK
-        | SEND_CM_CREDIT_USED_STATUS_VL3_RETURN_CREDIT_STATUS_SMASK
-        | SEND_CM_CREDIT_USED_STATUS_VL4_RETURN_CREDIT_STATUS_SMASK
-        | SEND_CM_CREDIT_USED_STATUS_VL5_RETURN_CREDIT_STATUS_SMASK
-        | SEND_CM_CREDIT_USED_STATUS_VL6_RETURN_CREDIT_STATUS_SMASK
-        | SEND_CM_CREDIT_USED_STATUS_VL7_RETURN_CREDIT_STATUS_SMASK
-        | SEND_CM_CREDIT_USED_STATUS_VL15_RETURN_CREDIT_STATUS_SMASK;
-
-/* a VL is usable if it is one of the data VLs or VL15 */
-#define valid_vl(idx) ((idx) < TXE_NUM_DATA_VL || (idx) == 15)
-#define NUM_USABLE_VLS 16      /* look at VL15 and less */
-
-       /* find the new total credits, do sanity check on unused VLs */
-       for (i = 0; i < OPA_MAX_VLS; i++) {
-               if (valid_vl(i)) {
-                       new_total += be16_to_cpu(new_bc->vl[i].dedicated);
-                       continue;
-               }
-               /* unused VL: report any non-zero request, then clear it */
-               nonzero_msg(dd, i, "dedicated",
-                           be16_to_cpu(new_bc->vl[i].dedicated));
-               nonzero_msg(dd, i, "shared",
-                           be16_to_cpu(new_bc->vl[i].shared));
-               new_bc->vl[i].dedicated = 0;
-               new_bc->vl[i].shared = 0;
-       }
-       new_total += be16_to_cpu(new_bc->overall_shared_limit);
-
-       /* fetch the current values */
-       get_buffer_control(dd, &cur_bc, &cur_total);
-
-       /*
-        * Create the masks we will use.
-        */
-       memset(changing, 0, sizeof(changing));
-       memset(lowering_dedicated, 0, sizeof(lowering_dedicated));
-       /*
-        * NOTE: Assumes that the individual VL bits are adjacent and in
-        * increasing order
-        */
-       stat_mask =
-               SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK;
-       changing_mask = 0;
-       ld_mask = 0;
-       change_count = 0;
-       any_shared_limit_changing = 0;
-       /*
-        * stat_mask is shifted once per index, including the indices
-        * skipped by the valid_vl() check, so it always lines up with i.
-        */
-       for (i = 0; i < NUM_USABLE_VLS; i++, stat_mask <<= 1) {
-               if (!valid_vl(i))
-                       continue;
-               /*
-                * Both sides hold the values in the same (big-endian)
-                * form, so the raw fields can be compared for inequality.
-                */
-               this_shared_changing = new_bc->vl[i].shared
-                                               != cur_bc.vl[i].shared;
-               if (this_shared_changing)
-                       any_shared_limit_changing = 1;
-               if (new_bc->vl[i].dedicated != cur_bc.vl[i].dedicated ||
-                   this_shared_changing) {
-                       changing[i] = 1;
-                       changing_mask |= stat_mask;
-                       change_count++;
-               }
-               /* ordering comparisons need CPU byte order */
-               if (be16_to_cpu(new_bc->vl[i].dedicated) <
-                                       be16_to_cpu(cur_bc.vl[i].dedicated)) {
-                       lowering_dedicated[i] = 1;
-                       ld_mask |= stat_mask;
-               }
-       }
-
-       /* bracket the credit change with a total adjustment */
-       if (new_total > cur_total)
-               set_global_limit(dd, new_total);
-
-       /*
-        * Start the credit change algorithm.
-        */
-       use_all_mask = 0;
-       if ((be16_to_cpu(new_bc->overall_shared_limit) <
-            be16_to_cpu(cur_bc.overall_shared_limit)) ||
-           (is_ax(dd) && any_shared_limit_changing)) {
-               set_global_shared(dd, 0);
-               /* keep cur_bc in step with h/w for the later "raise" test */
-               cur_bc.overall_shared_limit = 0;
-               use_all_mask = 1;
-       }
-
-       for (i = 0; i < NUM_USABLE_VLS; i++) {
-               if (!valid_vl(i))
-                       continue;
-
-               if (changing[i]) {
-                       set_vl_shared(dd, i, 0);
-                       /* keep cur_bc in step with h/w for the "raise" test */
-                       cur_bc.vl[i].shared = 0;
-               }
-       }
-
-       wait_for_vl_status_clear(dd, use_all_mask ? all_mask : changing_mask,
-                                "shared");
-
-       if (change_count > 0) {
-               for (i = 0; i < NUM_USABLE_VLS; i++) {
-                       if (!valid_vl(i))
-                               continue;
-
-                       if (lowering_dedicated[i]) {
-                               set_vl_dedicated(dd, i,
-                                                be16_to_cpu(new_bc->
-                                                            vl[i].dedicated));
-                               cur_bc.vl[i].dedicated =
-                                               new_bc->vl[i].dedicated;
-                       }
-               }
-
-               wait_for_vl_status_clear(dd, ld_mask, "dedicated");
-
-               /* now raise all dedicated that are going up */
-               for (i = 0; i < NUM_USABLE_VLS; i++) {
-                       if (!valid_vl(i))
-                               continue;
-
-                       if (be16_to_cpu(new_bc->vl[i].dedicated) >
-                                       be16_to_cpu(cur_bc.vl[i].dedicated))
-                               set_vl_dedicated(dd, i,
-                                                be16_to_cpu(new_bc->
-                                                            vl[i].dedicated));
-               }
-       }
-
-       /* next raise all shared that are going up */
-       for (i = 0; i < NUM_USABLE_VLS; i++) {
-               if (!valid_vl(i))
-                       continue;
-
-               if (be16_to_cpu(new_bc->vl[i].shared) >
-                               be16_to_cpu(cur_bc.vl[i].shared))
-                       set_vl_shared(dd, i, be16_to_cpu(new_bc->vl[i].shared));
-       }
-
-       /* finally raise the global shared */
-       if (be16_to_cpu(new_bc->overall_shared_limit) >
-           be16_to_cpu(cur_bc.overall_shared_limit))
-               set_global_shared(dd,
-                                 be16_to_cpu(new_bc->overall_shared_limit));
-
-       /* bracket the credit change with a total adjustment */
-       if (new_total < cur_total)
-               set_global_limit(dd, new_total);
-
-       /*
-        * Determine the actual number of operational VLS using the number of
-        * dedicated and shared credits for each VL.
-        */
-       if (change_count > 0) {
-               for (i = 0; i < TXE_NUM_DATA_VL; i++)
-                       if (be16_to_cpu(new_bc->vl[i].dedicated) > 0 ||
-                           be16_to_cpu(new_bc->vl[i].shared) > 0)
-                               vl_count++;
-               ppd->actual_vls_operational = vl_count;
-               /*
-                * Rebuild the SDMA and PIO maps.  If no data VL has any
-                * credit, ppd->vls_operational is used for the maps instead.
-                */
-               ret = sdma_map_init(dd, ppd->port - 1, vl_count ?
-                                   ppd->actual_vls_operational :
-                                   ppd->vls_operational,
-                                   NULL);
-               if (ret == 0)
-                       ret = pio_map_init(dd, ppd->port - 1, vl_count ?
-                                          ppd->actual_vls_operational :
-                                          ppd->vls_operational, NULL);
-               if (ret)
-                       return ret;
-       }
-       return 0;
-}
-
-/*
- * Read the given fabric manager table into t.
- *
- * Returns the size of the table (in bytes) on success, and -EINVAL when
- * which does not name a known table.
- */
-int fm_get_table(struct hfi1_pportdata *ppd, int which, void *t)
-{
-       struct vl_arb_cache *cache;
-       int prio;
-
-       switch (which) {
-       case FM_TBL_VL_HIGH_ARB:
-       case FM_TBL_VL_LOW_ARB:
-               /*
-                * OPA specifies 128 elements (of 2 bytes each), though
-                * HFI supports only 16 elements in h/w.  The answer comes
-                * from the cache of the matching priority table.
-                */
-               prio = (which == FM_TBL_VL_HIGH_ARB) ?
-                               HI_PRIO_TABLE : LO_PRIO_TABLE;
-               cache = vl_arb_lock_cache(ppd, prio);
-               vl_arb_get_cache(cache, t);
-               vl_arb_unlock_cache(ppd, prio);
-               return 256;
-       case FM_TBL_BUFFER_CONTROL:
-               return get_buffer_control(ppd->dd, t, NULL);
-       case FM_TBL_SC2VLNT:
-               return get_sc2vlnt(ppd->dd, t);
-       case FM_TBL_VL_PREEMPT_ELEMS:
-               /* OPA specifies 128 elements, of 2 bytes each */
-               get_vlarb_preempt(ppd->dd, OPA_MAX_VLS, t);
-               return 256;
-       case FM_TBL_VL_PREEMPT_MATRIX:
-               /*
-                * OPA specifies that this is the same size as the VL
-                * arbitration tables (i.e., 256 bytes).  Nothing is
-                * written to t for this table.
-                */
-               return 256;
-       default:
-               return -EINVAL;
-       }
-}
-
-/*
- * Write the given fabric manager table from t.
- *
- * Returns 0 on success, the result of the table's set routine when that
- * fails, and -EINVAL when which does not name a writable table.
- */
-int fm_set_table(struct hfi1_pportdata *ppd, int which, void *t)
-{
-       struct vl_arb_cache *cache;
-       u32 list_csr, list_size;
-       int prio, unchanged;
-
-       switch (which) {
-       case FM_TBL_VL_HIGH_ARB:
-       case FM_TBL_VL_LOW_ARB:
-               if (which == FM_TBL_VL_HIGH_ARB) {
-                       prio = HI_PRIO_TABLE;
-                       list_csr = SEND_HIGH_PRIORITY_LIST;
-                       list_size = VL_ARB_HIGH_PRIO_TABLE_SIZE;
-               } else {
-                       prio = LO_PRIO_TABLE;
-                       list_csr = SEND_LOW_PRIORITY_LIST;
-                       list_size = VL_ARB_LOW_PRIO_TABLE_SIZE;
-               }
-
-               /* update the cache; skip the h/w if nothing changed */
-               cache = vl_arb_lock_cache(ppd, prio);
-               unchanged = vl_arb_match_cache(cache, t);
-               if (!unchanged)
-                       vl_arb_set_cache(cache, t);
-               vl_arb_unlock_cache(ppd, prio);
-               if (unchanged)
-                       return 0;
-
-               return set_vl_weights(ppd, list_csr, list_size, t);
-       case FM_TBL_BUFFER_CONTROL:
-               return set_buffer_control(ppd, t);
-       case FM_TBL_SC2VLNT:
-               set_sc2vlnt(ppd->dd, t);
-               return 0;
-       default:
-               return -EINVAL;
-       }
-}
-
-/*
- * Disable all data VLs by sending PSC_DATA_VL_DISABLE to the PIO
- * send control.
- *
- * Return 0 if disabled, non-zero (1) if the VLs cannot be disabled,
- * which is the case whenever is_ax(dd) is true.
- */
-static int disable_data_vls(struct hfi1_devdata *dd)
-{
-       if (!is_ax(dd)) {
-               pio_send_control(dd, PSC_DATA_VL_DISABLE);
-               return 0;
-       }
-
-       return 1;
-}
-
-/*
- * open_fill_data_vls() - re-enable all data VLs.
- *
- * This is the counterpart to stop_drain_data_vls().  Only the enable
- * is done here; the "fill" part happens automatically and the name
- * was chosen purely for symmetry with stop_drain_data_vls().
- *
- * Return 0 if successful, non-zero (1) if the VLs cannot be enabled,
- * which is the case whenever is_ax(dd) is true.
- */
-int open_fill_data_vls(struct hfi1_devdata *dd)
-{
-       if (!is_ax(dd)) {
-               pio_send_control(dd, PSC_DATA_VL_ENABLE);
-               return 0;
-       }
-
-       return 1;
-}
-
-/*
- * drain_data_vls() - wait for the data VLs to empty.
- *
- * The caller is expected to have called disable_data_vls() first.
- * Waits for occupancy (of per-VL FIFOs) for all contexts, and for the
- * SDMA engines, to drop to 0, then pauses for credit return.
- */
-static void drain_data_vls(struct hfi1_devdata *dd)
-{
-       sc_wait(dd);                    /* send contexts */
-       sdma_wait(dd);                  /* SDMA engines */
-       pause_for_credit_return(dd);
-}
-
-/*
- * stop_drain_data_vls() - disable, then drain all per-VL fifos.
- *
- * Pair this with open_fill_data_vls() to resume using data VLs:
- *
- * stop_drain_data_vls(dd);
- * // do things with per-VL resources
- * open_fill_data_vls(dd);
- *
- * Returns the result of disable_data_vls(); the drain is skipped when
- * that result is non-zero.
- */
-int stop_drain_data_vls(struct hfi1_devdata *dd)
-{
-       int ret = disable_data_vls(dd);
-
-       if (ret)
-               return ret;
-
-       drain_data_vls(dd);
-       return 0;
-}
-
-/*
- * Convert a nanosecond time to a cclock count.  No matter how slow
- * the cclock, a non-zero ns will always have a non-zero result.
- *
- * The result is floor(ns * 1000 / cclock_ps).  It is computed from the
- * quotient and remainder of ns / cclock_ps so that the 32-bit product
- * ns * 1000 is never formed; that product wraps once ns exceeds about
- * 4.29 ms.  Assumes the cclock period in ps is small enough that
- * (cclock_ps - 1) * 1000 fits in 32 bits - TODO confirm against the
- * FPGA_CCLOCK_PS / ASIC_CCLOCK_PS definitions.
- */
-u32 ns_to_cclock(struct hfi1_devdata *dd, u32 ns)
-{
-       u32 cclock_ps;
-       u32 cclocks;
-
-       if (dd->icode == ICODE_FPGA_EMULATION)
-               cclock_ps = FPGA_CCLOCK_PS;
-       else  /* simulation pretends to be ASIC */
-               cclock_ps = ASIC_CCLOCK_PS;
-
-       /* ns = q * cclock_ps + r  =>  result = q * 1000 + r * 1000 / cclock_ps */
-       cclocks = (ns / cclock_ps) * 1000 +
-                 ((ns % cclock_ps) * 1000) / cclock_ps;
-       if (ns && !cclocks)     /* if ns nonzero, must be at least 1 */
-               cclocks = 1;
-       return cclocks;
-}
-
-/*
- * Convert a cclock count to nanoseconds.  No matter how slow
- * the cclock, a non-zero cclocks will always have a non-zero result.
- *
- * The result is floor(cclocks * cclock_ps / 1000).  It is computed from
- * the quotient and remainder of cclocks / 1000 so that the 32-bit
- * product cclocks * cclock_ps is never formed; that product wraps for
- * large counts even when the final nanosecond value would fit in a
- * u32.  If the true result itself does not fit in a u32 it is still
- * truncated.  Assumes 999 * cclock_ps fits in 32 bits - TODO confirm
- * against the FPGA_CCLOCK_PS / ASIC_CCLOCK_PS definitions.
- */
-u32 cclock_to_ns(struct hfi1_devdata *dd, u32 cclocks)
-{
-       u32 cclock_ps;
-       u32 ns;
-
-       if (dd->icode == ICODE_FPGA_EMULATION)
-               cclock_ps = FPGA_CCLOCK_PS;
-       else  /* simulation pretends to be ASIC */
-               cclock_ps = ASIC_CCLOCK_PS;
-
-       /* cclocks = q * 1000 + r  =>  result = q * cclock_ps + r * cclock_ps / 1000 */
-       ns = (cclocks / 1000) * cclock_ps +
-            ((cclocks % 1000) * cclock_ps) / 1000;
-       if (cclocks && !ns)     /* if cclocks nonzero, must be at least 1 */
-               ns = 1;
-       return ns;
-}
-
-/*
- * Dynamically adjust the receive interrupt timeout for a context based on
- * incoming packet rate.
- *
- * rcd:   receive context whose timeout is adjusted
- * npkts: number of packets received in this interrupt
- *
- * NOTE: Dynamic adjustment does not allow rcv_intr_count to be zero.
- */
-static void adjust_rcv_timeout(struct hfi1_ctxtdata *rcd, u32 npkts)
-{
-       struct hfi1_devdata *dd = rcd->dd;
-       u32 new_timeout = rcd->rcvavail_timeout;
-
-       /*
-        * The timeout is doubled when at least rcv_intr_count packets
-        * arrived in this interrupt and halved otherwise.
-        *
-        * This never settles into a steady state; the timeout only stops
-        * changing once it sits at one of the two endpoints.
-        */
-       if (npkts >= rcv_intr_count) {
-               /*
-                * More than enough packets arrived before the timeout, adjust
-                * timeout upward, capped at rcv_intr_timeout_csr.
-                */
-               if (new_timeout >= dd->rcv_intr_timeout_csr)
-                       return; /* already at max */
-               new_timeout = min(new_timeout << 1, dd->rcv_intr_timeout_csr);
-       } else {
-               /*
-                * Not enough packets arrived before the timeout, adjust
-                * timeout downward.
-                */
-               if (new_timeout < 2)
-                       return; /* already at minimum */
-               new_timeout >>= 1;
-       }
-
-       /*
-        * new_timeout cannot be larger than rcv_intr_timeout_csr which has
-        * already been verified to be in range
-        */
-       rcd->rcvavail_timeout = new_timeout;
-       write_kctxt_csr(dd, rcd->ctxt, RCV_AVAIL_TIME_OUT,
-                       (u64)new_timeout <<
-                       RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT);
-}
-
-/*
- * Publish new receive header queue and (optionally) eager index head
- * values for a context.
- *
- * rcd:         receive context being updated
- * hd:          new header queue head, masked into RCV_HDR_HEAD
- * updegr:      non-zero to also write RCV_EGR_INDEX_HEAD
- * egrhd:       new eager index head (only used when updegr is non-zero)
- * intr_adjust: non-zero to run adjust_rcv_timeout() first
- * npkts:       packet count handed to adjust_rcv_timeout()
- */
-void update_usrhead(struct hfi1_ctxtdata *rcd, u32 hd, u32 updegr, u32 egrhd,
-                   u32 intr_adjust, u32 npkts)
-{
-       struct hfi1_devdata *dd = rcd->dd;
-       u64 reg;
-       u32 ctxt = rcd->ctxt;
-
-       /*
-        * Need to write timeout register before updating RcvHdrHead to ensure
-        * that a new value is used when the HW decides to restart counting.
-        */
-       if (intr_adjust)
-               adjust_rcv_timeout(rcd, npkts);
-       if (updegr) {
-               reg = (egrhd & RCV_EGR_INDEX_HEAD_HEAD_MASK)
-                       << RCV_EGR_INDEX_HEAD_HEAD_SHIFT;
-               write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, reg);
-       }
-       /* NOTE(review): presumably orders the writes above before RcvHdrHead */
-       mmiowb();
-       /* RcvHdrHead carries both the interrupt count and the new head */
-       reg = ((u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT) |
-               (((u64)hd & RCV_HDR_HEAD_HEAD_MASK)
-                       << RCV_HDR_HEAD_HEAD_SHIFT);
-       write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg);
-       mmiowb();
-}
-
-/*
- * Report whether the receive header queue of a context is empty.
- *
- * The head comes from the RCV_HDR_HEAD CSR.  The tail comes from
- * get_rcvhdrtail() when rcd->rcvhdrtail_kvaddr is set, otherwise from the
- * RCV_HDR_TAIL CSR.
- *
- * Returns non-zero when head and tail are equal.
- */
-u32 hdrqempty(struct hfi1_ctxtdata *rcd)
-{
-       u32 hdrq_head, hdrq_tail;
-
-       hdrq_head = (read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_HEAD)
-                    & RCV_HDR_HEAD_HEAD_SMASK) >> RCV_HDR_HEAD_HEAD_SHIFT;
-
-       if (!rcd->rcvhdrtail_kvaddr)
-               hdrq_tail = read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL);
-       else
-               hdrq_tail = get_rcvhdrtail(rcd);
-
-       return hdrq_head == hdrq_tail;
-}
-
-/*
- * Context Control and Receive Array encoding for buffer size:
- *     0x0 invalid
- *     0x1   4 KB
- *     0x2   8 KB
- *     0x3  16 KB
- *     0x4  32 KB
- *     0x5  64 KB
- *     0x6 128 KB
- *     0x7 256 KB
- *     0x8 512 KB (Receive Array only)
- *     0x9   1 MB (Receive Array only)
- *     0xa   2 MB (Receive Array only)
- *
- *     0xB-0xF - reserved (Receive Array only)
- *
- * Each encoding n in 0x1..0xa therefore stands for 2 KB << n.
- *
- * This routine assumes that the value has already been sanity checked;
- * any size that is not in the table is encoded as the minimum (0x1).
- */
-static u32 encoded_size(u32 size)
-{
-       u32 code;
-
-       for (code = 0x1; code <= 0xa; code++) {
-               if (size == (2 * 1024) << code)
-                       return code;
-       }
-
-       return 0x1;     /* if invalid, go with the minimum size */
-}
-
-/*
- * Apply the HFI1_RCVCTRL_* operations in op to receive context ctxt.
- *
- * dd:   device the context belongs to
- * op:   bit mask of HFI1_RCVCTRL_* enable/disable requests
- * ctxt: receive context index into dd->rcd[]
- *
- * Returns without doing anything if dd->rcd[ctxt] is NULL.  Otherwise the
- * current RCV_CTXT_CTRL value is read, modified according to op, cached in
- * rcd->rcvctrl and written back.  Enabling a context that is not already
- * enabled additionally reprograms its header queue, eager and TID CSRs
- * before the control write, and its interrupt timeout and count after it.
- */
-void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt)
-{
-       struct hfi1_ctxtdata *rcd;
-       u64 rcvctrl, reg;
-       int did_enable = 0;     /* set when this call enables the context */
-
-       rcd = dd->rcd[ctxt];
-       if (!rcd)
-               return;
-
-       hfi1_cdbg(RCVCTRL, "ctxt %d op 0x%x", ctxt, op);
-
-       rcvctrl = read_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL);
-       /* if the context already enabled, don't do the extra steps */
-       if ((op & HFI1_RCVCTRL_CTXT_ENB) &&
-           !(rcvctrl & RCV_CTXT_CTRL_ENABLE_SMASK)) {
-               /* reset the tail and hdr addresses, and sequence count */
-               write_kctxt_csr(dd, ctxt, RCV_HDR_ADDR,
-                               rcd->rcvhdrq_phys);
-               if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL))
-                       write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR,
-                                       rcd->rcvhdrqtailaddr_phys);
-               rcd->seq_cnt = 1;
-
-               /* reset the cached receive header queue head value */
-               rcd->head = 0;
-
-               /*
-                * Zero the receive header queue so we don't get false
-                * positives when checking the sequence number.  The
-                * sequence numbers could land exactly on the same spot.
-                * E.g. a rcd restart before the receive header wrapped.
-                */
-               memset(rcd->rcvhdrq, 0, rcd->rcvhdrq_size);
-
-               /* starting timeout */
-               rcd->rcvavail_timeout = dd->rcv_intr_timeout_csr;
-
-               /* enable the context */
-               rcvctrl |= RCV_CTXT_CTRL_ENABLE_SMASK;
-
-               /* clean the egr buffer size first */
-               rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK;
-               rcvctrl |= ((u64)encoded_size(rcd->egrbufs.rcvtid_size)
-                               & RCV_CTXT_CTRL_EGR_BUF_SIZE_MASK)
-                                       << RCV_CTXT_CTRL_EGR_BUF_SIZE_SHIFT;
-
-               /* zero RcvHdrHead - set RcvHdrHead.Counter after enable */
-               write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0);
-               did_enable = 1;
-
-               /* zero RcvEgrIndexHead */
-               write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, 0);
-
-               /* set eager count and base index */
-               reg = (((u64)(rcd->egrbufs.alloced >> RCV_SHIFT)
-                       & RCV_EGR_CTRL_EGR_CNT_MASK)
-                      << RCV_EGR_CTRL_EGR_CNT_SHIFT) |
-                       (((rcd->eager_base >> RCV_SHIFT)
-                         & RCV_EGR_CTRL_EGR_BASE_INDEX_MASK)
-                        << RCV_EGR_CTRL_EGR_BASE_INDEX_SHIFT);
-               write_kctxt_csr(dd, ctxt, RCV_EGR_CTRL, reg);
-
-               /*
-                * Set TID (expected) count and base index.
-                * rcd->expected_count is set to individual RcvArray entries,
-                * not pairs, and the CSR takes a pair-count in groups of
-                * four, so divide by 8.
-                */
-               reg = (((rcd->expected_count >> RCV_SHIFT)
-                                       & RCV_TID_CTRL_TID_PAIR_CNT_MASK)
-                               << RCV_TID_CTRL_TID_PAIR_CNT_SHIFT) |
-                     (((rcd->expected_base >> RCV_SHIFT)
-                                       & RCV_TID_CTRL_TID_BASE_INDEX_MASK)
-                               << RCV_TID_CTRL_TID_BASE_INDEX_SHIFT);
-               write_kctxt_csr(dd, ctxt, RCV_TID_CTRL, reg);
-               /* the control context is also programmed into RCV_VL15 */
-               if (ctxt == HFI1_CTRL_CTXT)
-                       write_csr(dd, RCV_VL15, HFI1_CTRL_CTXT);
-       }
-       if (op & HFI1_RCVCTRL_CTXT_DIS) {
-               /* NOTE(review): RCV_VL15 is cleared for any ctxt, not only HFI1_CTRL_CTXT */
-               write_csr(dd, RCV_VL15, 0);
-               /*
-                * When receive context is being disabled turn on tail
-                * update with a dummy tail address and then disable
-                * receive context.
-                */
-               if (dd->rcvhdrtail_dummy_physaddr) {
-                       write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR,
-                                       dd->rcvhdrtail_dummy_physaddr);
-                       /* Enabling RcvCtxtCtrl.TailUpd is intentional. */
-                       rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK;
-               }
-
-               rcvctrl &= ~RCV_CTXT_CTRL_ENABLE_SMASK;
-       }
-       /* the remaining op bits each set or clear one RCV_CTXT_CTRL field */
-       if (op & HFI1_RCVCTRL_INTRAVAIL_ENB)
-               rcvctrl |= RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
-       if (op & HFI1_RCVCTRL_INTRAVAIL_DIS)
-               rcvctrl &= ~RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
-       /* tail update is only enabled when a tail address exists */
-       if (op & HFI1_RCVCTRL_TAILUPD_ENB && rcd->rcvhdrqtailaddr_phys)
-               rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK;
-       if (op & HFI1_RCVCTRL_TAILUPD_DIS) {
-               /* See comment on RcvCtxtCtrl.TailUpd above */
-               if (!(op & HFI1_RCVCTRL_CTXT_DIS))
-                       rcvctrl &= ~RCV_CTXT_CTRL_TAIL_UPD_SMASK;
-       }
-       if (op & HFI1_RCVCTRL_TIDFLOW_ENB)
-               rcvctrl |= RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK;
-       if (op & HFI1_RCVCTRL_TIDFLOW_DIS)
-               rcvctrl &= ~RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK;
-       if (op & HFI1_RCVCTRL_ONE_PKT_EGR_ENB) {
-               /*
-                * In one-packet-per-eager mode, the size comes from
-                * the RcvArray entry.
-                */
-               rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK;
-               rcvctrl |= RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK;
-       }
-       if (op & HFI1_RCVCTRL_ONE_PKT_EGR_DIS)
-               rcvctrl &= ~RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK;
-       if (op & HFI1_RCVCTRL_NO_RHQ_DROP_ENB)
-               rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK;
-       if (op & HFI1_RCVCTRL_NO_RHQ_DROP_DIS)
-               rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK;
-       if (op & HFI1_RCVCTRL_NO_EGR_DROP_ENB)
-               rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK;
-       if (op & HFI1_RCVCTRL_NO_EGR_DROP_DIS)
-               rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK;
-       /* cache the final value, then commit it to the hardware */
-       rcd->rcvctrl = rcvctrl;
-       hfi1_cdbg(RCVCTRL, "ctxt %d rcvctrl 0x%llx\n", ctxt, rcvctrl);
-       write_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL, rcd->rcvctrl);
-
-       /* work around sticky RcvCtxtStatus.BlockedRHQFull */
-       if (did_enable &&
-           (rcvctrl & RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK)) {
-               reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS);
-               if (reg != 0) {
-                       dd_dev_info(dd, "ctxt %d status %lld (blocked)\n",
-                                   ctxt, reg);
-                       /* toggle RcvHdrHead, then re-read the status */
-                       read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD);
-                       write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x10);
-                       write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x00);
-                       read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD);
-                       reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS);
-                       dd_dev_info(dd, "ctxt %d status %lld (%s blocked)\n",
-                                   ctxt, reg, reg == 0 ? "not" : "still");
-               }
-       }
-
-       if (did_enable) {
-               /*
-                * The interrupt timeout and count must be set after
-                * the context is enabled to take effect.
-                */
-               /* set interrupt timeout */
-               write_kctxt_csr(dd, ctxt, RCV_AVAIL_TIME_OUT,
-                               (u64)rcd->rcvavail_timeout <<
-                               RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT);
-
-               /* set RcvHdrHead.Counter, zero RcvHdrHead.Head (again) */
-               reg = (u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT;
-               write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg);
-       }
-
-       /*
-        * NOTE(review): this runs when EITHER TAILUPD_DIS or CTXT_DIS is
-        * requested, and - unlike the write in the CTXT_DIS branch above -
-        * it is not guarded by dd->rcvhdrtail_dummy_physaddr being non-zero.
-        */
-       if (op & (HFI1_RCVCTRL_TAILUPD_DIS | HFI1_RCVCTRL_CTXT_DIS))
-               /*
-                * If the context has been disabled and the Tail Update has
-                * been cleared, set the RCV_HDR_TAIL_ADDR CSR to dummy address
-                * so it doesn't contain an address that is invalid.
-                */
-               write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR,
-                               dd->rcvhdrtail_dummy_physaddr);
-}
-
-/*
- * Return either the device counter names or a freshly read block of
- * device counter values.
- *
- * If namep is non-NULL, *namep is pointed at dd->cntrnames and
- * dd->cntrnameslen is returned; cntrp is not used.  Otherwise every
- * enabled entry of dev_cntrs[] is read into dd->cntrs[], *cntrp is
- * pointed at that array, and dd->ndevcntrs * sizeof(u64) is returned.
- */
-u32 hfi1_read_cntrs(struct hfi1_devdata *dd, char **namep, u64 **cntrp)
-{
-       const struct cntr_entry *entry;
-       u64 val = 0;
-       int ret;
-       int idx, sub;
-
-       if (namep) {
-               ret = dd->cntrnameslen;
-               *namep = dd->cntrnames;
-               return ret;
-       }
-
-       ret = (dd->ndevcntrs) * sizeof(u64);
-
-       /* Get the start of the block of counters */
-       *cntrp = dd->cntrs;
-
-       /* Now go and fill in each counter in the block. */
-       for (idx = 0; idx < DEV_CNTR_LAST; idx++) {
-               entry = &dev_cntrs[idx];
-               hfi1_cdbg(CNTR, "reading %s", entry->name);
-
-               if (entry->flags & CNTR_DISABLED) {
-                       /* Nothing to read */
-                       hfi1_cdbg(CNTR, "\tDisabled\n");
-                       continue;
-               }
-
-               if (entry->flags & CNTR_VL) {
-                       /* one slot per VL, starting at entry->offset */
-                       hfi1_cdbg(CNTR, "\tPer VL\n");
-                       for (sub = 0; sub < C_VL_COUNT; sub++) {
-                               val = entry->rw_cntr(entry, dd, sub,
-                                                    CNTR_MODE_R, 0);
-                               hfi1_cdbg(CNTR, "\t\tRead 0x%llx for %d\n",
-                                         val, sub);
-                               dd->cntrs[entry->offset + sub] = val;
-                       }
-                       continue;
-               }
-
-               if (entry->flags & CNTR_SDMA) {
-                       /* one slot per SDMA engine, starting at entry->offset */
-                       hfi1_cdbg(CNTR, "\t Per SDMA Engine\n");
-                       for (sub = 0; sub < dd->chip_sdma_engines; sub++) {
-                               val = entry->rw_cntr(entry, dd, sub,
-                                                    CNTR_MODE_R, 0);
-                               hfi1_cdbg(CNTR, "\t\tRead 0x%llx for %d\n",
-                                         val, sub);
-                               dd->cntrs[entry->offset + sub] = val;
-                       }
-                       continue;
-               }
-
-               /* plain counter: a single slot */
-               val = entry->rw_cntr(entry, dd, CNTR_INVALID_VL,
-                                    CNTR_MODE_R, 0);
-               dd->cntrs[entry->offset] = val;
-               hfi1_cdbg(CNTR, "\tRead 0x%llx", val);
-       }
-
-       return ret;
-}
-
-/*
- * Used by sysfs to create files for hfi stats to read
- *
- * If namep is non-NULL, *namep is pointed at the port counter names and
- * their length is returned; cntrp is not used.  Otherwise every enabled
- * entry of port_cntrs[] is read into ppd->cntrs[], *cntrp is pointed at
- * that array, and nportcntrs * sizeof(u64) is returned.
- */
-u32 hfi1_read_portcntrs(struct hfi1_pportdata *ppd, char **namep, u64 **cntrp)
-{
-       const struct cntr_entry *entry;
-       u64 val = 0;
-       int ret;
-       int idx, vl;
-
-       if (namep) {
-               ret = ppd->dd->portcntrnameslen;
-               *namep = ppd->dd->portcntrnames;
-               return ret;
-       }
-
-       ret = ppd->dd->nportcntrs * sizeof(u64);
-       *cntrp = ppd->cntrs;
-
-       for (idx = 0; idx < PORT_CNTR_LAST; idx++) {
-               entry = &port_cntrs[idx];
-               hfi1_cdbg(CNTR, "reading %s", entry->name);
-
-               if (entry->flags & CNTR_DISABLED) {
-                       /* Nothing to read */
-                       hfi1_cdbg(CNTR, "\tDisabled\n");
-                       continue;
-               }
-
-               if (!(entry->flags & CNTR_VL)) {
-                       /* plain counter: a single slot */
-                       val = entry->rw_cntr(entry, ppd, CNTR_INVALID_VL,
-                                            CNTR_MODE_R, 0);
-                       ppd->cntrs[entry->offset] = val;
-                       hfi1_cdbg(CNTR, "\tRead 0x%llx", val);
-                       continue;
-               }
-
-               /* one slot per VL, starting at entry->offset */
-               hfi1_cdbg(CNTR, "\tPer VL");
-               for (vl = 0; vl < C_VL_COUNT; vl++) {
-                       val = entry->rw_cntr(entry, ppd, vl, CNTR_MODE_R, 0);
-                       hfi1_cdbg(CNTR, "\t\tRead 0x%llx for %d", val, vl);
-                       ppd->cntrs[entry->offset + vl] = val;
-               }
-       }
-
-       return ret;
-}
-
-/*
- * Stop the synthetic stats timer (if its data field is set) and release
- * all counter storage: the per-port arrays and per-cpu counters first,
- * then the device level arrays and name buffers.  Every freed pointer is
- * reset to NULL.
- */
-static void free_cntrs(struct hfi1_devdata *dd)
-{
-       /* the port data array starts right after the devdata structure */
-       struct hfi1_pportdata *port = (struct hfi1_pportdata *)(dd + 1);
-       int n;
-
-       if (dd->synth_stats_timer.data)
-               del_timer_sync(&dd->synth_stats_timer);
-       dd->synth_stats_timer.data = 0;
-
-       for (n = 0; n < dd->num_pports; n++, port++) {
-               kfree(port->cntrs);
-               port->cntrs = NULL;
-               kfree(port->scntrs);
-               port->scntrs = NULL;
-
-               free_percpu(port->ibport_data.rvp.rc_acks);
-               port->ibport_data.rvp.rc_acks = NULL;
-               free_percpu(port->ibport_data.rvp.rc_qacks);
-               port->ibport_data.rvp.rc_qacks = NULL;
-               free_percpu(port->ibport_data.rvp.rc_delayed_comp);
-               port->ibport_data.rvp.rc_delayed_comp = NULL;
-       }
-
-       kfree(dd->portcntrnames);
-       dd->portcntrnames = NULL;
-       kfree(dd->cntrs);
-       dd->cntrs = NULL;
-       kfree(dd->scntrs);
-       dd->scntrs = NULL;
-       kfree(dd->cntrnames);
-       dd->cntrnames = NULL;
-}
-
-#define CNTR_MAX 0xFFFFFFFFFFFFFFFFULL
-#define CNTR_32BIT_MAX 0x00000000FFFFFFFF
-
-/*
- * Read one device or port counter and fold it into its saved value.
- *
- * dd:      device, used for error reporting
- * entry:   counter table entry to read
- * psval:   saved (software) value of the counter; updated on return
- * context: opaque pointer handed to entry->rw_cntr()
- * vl:      VL index handed to entry->rw_cntr()
- *
- * Returns 0 for a disabled counter (and leaves *psval alone).  For a
- * synthetic counter the returned and saved value is a saturating 64-bit
- * count built on top of the raw reading; otherwise it is the raw reading.
- */
-static u64 read_dev_port_cntr(struct hfi1_devdata *dd, struct cntr_entry *entry,
-                             u64 *psval, void *context, int vl)
-{
-       u64 val;
-       u64 sval = *psval;
-
-       if (entry->flags & CNTR_DISABLED) {
-               dd_dev_err(dd, "Counter %s not enabled", entry->name);
-               return 0;
-       }
-
-       hfi1_cdbg(CNTR, "cntr: %s vl %d psval 0x%llx", entry->name, vl, *psval);
-
-       val = entry->rw_cntr(entry, context, vl, CNTR_MODE_R, 0);
-
-       /* If it's a synthetic counter there is more work we need to do */
-       if (entry->flags & CNTR_SYNTH) {
-               if (sval == CNTR_MAX) {
-                       /* Already saturated; the saved value stays as is */
-                       return CNTR_MAX;
-               }
-
-               if (entry->flags & CNTR_32BIT) {
-                       /*
-                        * The saved value holds a software wrap count in its
-                        * upper half and the last h/w reading in its lower
-                        * half.  At most one wrap is detected per read.
-                        */
-                       u64 upper = sval >> 32;
-                       u64 lower = sval & CNTR_32BIT_MAX;
-
-                       if (lower > val) { /* hw wrapped */
-                               if (upper == CNTR_32BIT_MAX)
-                                       val = CNTR_MAX;
-                               else
-                                       upper++;
-                       }
-
-                       if (val != CNTR_MAX)
-                               val = (upper << 32) | val;
-
-               } else {
-                       /*
-                        * If we rolled we are saturated.  A u64 can never
-                        * exceed CNTR_MAX, so going backwards is the only
-                        * condition that needs checking.
-                        */
-                       if (val < sval)
-                               val = CNTR_MAX;
-               }
-       }
-
-       *psval = val;
-
-       hfi1_cdbg(CNTR, "\tNew val=0x%llx", val);
-
-       return val;
-}
-
-/*
- * Write one device or port counter and update its saved value.
- *
- * dd:      device, used for error reporting
- * entry:   counter table entry to write
- * psval:   saved (software) value of the counter; updated on return
- * context: opaque pointer handed to entry->rw_cntr()
- * vl:      VL index handed to entry->rw_cntr()
- * data:    value to write
- *
- * Returns 0 for a disabled counter (and leaves *psval alone).  For a
- * synthetic 32-bit counter only the low 32 bits of data go to
- * entry->rw_cntr() while the full data value is saved and returned; in
- * every other case the value returned by entry->rw_cntr() is saved and
- * returned.
- */
-static u64 write_dev_port_cntr(struct hfi1_devdata *dd,
-                              struct cntr_entry *entry,
-                              u64 *psval, void *context, int vl, u64 data)
-{
-       int synth32;
-       u64 hw_data;
-       u64 val;
-
-       if (entry->flags & CNTR_DISABLED) {
-               dd_dev_err(dd, "Counter %s not enabled", entry->name);
-               return 0;
-       }
-
-       hfi1_cdbg(CNTR, "cntr: %s vl %d psval 0x%llx", entry->name, vl, *psval);
-
-       synth32 = (entry->flags & CNTR_SYNTH) && (entry->flags & CNTR_32BIT);
-
-       /* synthetic counters take the new value before the h/w write */
-       if (entry->flags & CNTR_SYNTH)
-               *psval = data;
-
-       hw_data = synth32 ? (data & 0xFFFFFFFFULL) : data;
-       val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W, hw_data);
-       if (synth32)
-               val = data; /* return the full 64bit value */
-
-       *psval = val;
-
-       hfi1_cdbg(CNTR, "\tNew val=0x%llx", val);
-
-       return val;
-}
-
-/*
- * Read device counter dev_cntrs[index].  The saved value lives in
- * dd->scntrs at the entry's offset, advanced by vl unless vl is
- * CNTR_INVALID_VL.
- */
-u64 read_dev_cntr(struct hfi1_devdata *dd, int index, int vl)
-{
-       struct cntr_entry *entry = &dev_cntrs[index];
-       u64 *saved = dd->scntrs + entry->offset;
-
-       if (vl != CNTR_INVALID_VL)
-               saved += vl;
-
-       return read_dev_port_cntr(dd, entry, saved, dd, vl);
-}
-
-/*
- * Write data to device counter dev_cntrs[index].  The saved value lives
- * in dd->scntrs at the entry's offset, advanced by vl unless vl is
- * CNTR_INVALID_VL.
- */
-u64 write_dev_cntr(struct hfi1_devdata *dd, int index, int vl, u64 data)
-{
-       struct cntr_entry *entry = &dev_cntrs[index];
-       u64 *saved = dd->scntrs + entry->offset;
-
-       if (vl != CNTR_INVALID_VL)
-               saved += vl;
-
-       return write_dev_port_cntr(dd, entry, saved, dd, vl, data);
-}
-
-/*
- * Read port counter port_cntrs[index].  The saved value lives in
- * ppd->scntrs at the entry's offset, advanced by vl unless vl is
- * CNTR_INVALID_VL.
- *
- * Receive header overflow counters whose index lies at or beyond
- * C_RCV_HDR_OVF_FIRST + num_rcv_contexts (up to C_RCV_HDR_OVF_LAST) are
- * not read at all; 0 is returned for them.
- */
-u64 read_port_cntr(struct hfi1_pportdata *ppd, int index, int vl)
-{
-       struct cntr_entry *entry;
-       u64 *saved;
-
-       /* We do not want to bother for disabled contexts */
-       if ((index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts) &&
-           (index <= C_RCV_HDR_OVF_LAST))
-               return 0;
-
-       entry = &port_cntrs[index];
-       saved = ppd->scntrs + entry->offset;
-       if (vl != CNTR_INVALID_VL)
-               saved += vl;
-
-       return read_dev_port_cntr(ppd->dd, entry, saved, ppd, vl);
-}
-
-/*
- * Write data to port counter port_cntrs[index].  The saved value lives
- * in ppd->scntrs at the entry's offset, advanced by vl unless vl is
- * CNTR_INVALID_VL.
- *
- * Receive header overflow counters whose index lies at or beyond
- * C_RCV_HDR_OVF_FIRST + num_rcv_contexts (up to C_RCV_HDR_OVF_LAST) are
- * not written at all; 0 is returned for them.
- */
-u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data)
-{
-       struct cntr_entry *entry;
-       u64 *saved;
-
-       /* We do not want to bother for disabled contexts */
-       if ((index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts) &&
-           (index <= C_RCV_HDR_OVF_LAST))
-               return 0;
-
-       entry = &port_cntrs[index];
-       saved = ppd->scntrs + entry->offset;
-       if (vl != CNTR_INVALID_VL)
-               saved += vl;
-
-       return write_dev_port_cntr(ppd->dd, entry, saved, ppd, vl, data);
-}
-
-/*
- * Timer callback that keeps the saved counter values up to date.
- *
- * opaque is the struct hfi1_devdata pointer the timer was set up with.
- * The two flit counters are sampled as a tripwire; when either went
- * backwards, or when their combined advance since the last update
- * reaches CNTR_32BIT_MAX, every device and port counter is read so that
- * its saved value is refreshed.  The timer is always re-armed for
- * HZ * SYNTH_CNT_TIME jiffies from now.
- */
-static void update_synth_timer(unsigned long opaque)
-{
-       struct hfi1_devdata *dd = (struct hfi1_devdata *)opaque;
-       struct hfi1_pportdata *ppd;
-       struct cntr_entry *entry;
-       u64 cur_rx, cur_tx, total_flits;
-       int need_update = 0;
-       int c, p, vl;
-
-       /*
-        * Rather than keep beating on the CSRs pick a minimal set that we can
-        * check to watch for potential roll over. We can do this by looking at
-        * the number of flits sent/recv. If the total flits exceeds 32bits then
-        * we have to iterate all the counters and update.
-        */
-       entry = &dev_cntrs[C_DC_RCV_FLITS];
-       cur_rx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0);
-
-       entry = &dev_cntrs[C_DC_XMIT_FLITS];
-       cur_tx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0);
-
-       hfi1_cdbg(CNTR,
-                 "[%d] curr tx=0x%llx rx=0x%llx :: last tx=0x%llx rx=0x%llx\n",
-                 dd->unit, cur_tx, cur_rx, dd->last_tx, dd->last_rx);
-
-       if ((cur_tx < dd->last_tx) || (cur_rx < dd->last_rx)) {
-               /*
-                * May not be strictly necessary to update but it won't hurt and
-                * simplifies the logic here.
-                */
-               need_update = 1;
-               hfi1_cdbg(CNTR, "[%d] Tripwire counter rolled, updating",
-                         dd->unit);
-       } else {
-               total_flits = (cur_tx - dd->last_tx) + (cur_rx - dd->last_rx);
-               hfi1_cdbg(CNTR,
-                         "[%d] total flits 0x%llx limit 0x%llx\n", dd->unit,
-                         total_flits, (u64)CNTR_32BIT_MAX);
-               if (total_flits >= CNTR_32BIT_MAX) {
-                       hfi1_cdbg(CNTR, "[%d] 32bit limit hit, updating",
-                                 dd->unit);
-                       need_update = 1;
-               }
-       }
-
-       if (!need_update) {
-               hfi1_cdbg(CNTR, "[%d] No update necessary", dd->unit);
-               mod_timer(&dd->synth_stats_timer,
-                         jiffies + HZ * SYNTH_CNT_TIME);
-               return;
-       }
-
-       hfi1_cdbg(CNTR, "[%d] Updating dd and ppd counters", dd->unit);
-
-       /* device counters; the return values are not needed here */
-       for (c = 0; c < DEV_CNTR_LAST; c++) {
-               entry = &dev_cntrs[c];
-               if (!(entry->flags & CNTR_VL)) {
-                       read_dev_cntr(dd, c, CNTR_INVALID_VL);
-                       continue;
-               }
-               for (vl = 0; vl < C_VL_COUNT; vl++)
-                       read_dev_cntr(dd, c, vl);
-       }
-
-       /* port counters; the port data array follows the devdata */
-       ppd = (struct hfi1_pportdata *)(dd + 1);
-       for (p = 0; p < dd->num_pports; p++, ppd++) {
-               for (c = 0; c < PORT_CNTR_LAST; c++) {
-                       entry = &port_cntrs[c];
-                       if (!(entry->flags & CNTR_VL)) {
-                               read_port_cntr(ppd, c, CNTR_INVALID_VL);
-                               continue;
-                       }
-                       for (vl = 0; vl < C_VL_COUNT; vl++)
-                               read_port_cntr(ppd, c, vl);
-               }
-       }
-
-       /*
-        * We want the value in the register. The goal is to keep track
-        * of the number of "ticks" not the counter value. In other
-        * words if the register rolls we want to notice it and go ahead
-        * and force an update.
-        */
-       entry = &dev_cntrs[C_DC_XMIT_FLITS];
-       dd->last_tx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL,
-                                    CNTR_MODE_R, 0);
-
-       entry = &dev_cntrs[C_DC_RCV_FLITS];
-       dd->last_rx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL,
-                                    CNTR_MODE_R, 0);
-
-       hfi1_cdbg(CNTR, "[%d] setting last tx/rx to 0x%llx 0x%llx",
-                 dd->unit, dd->last_tx, dd->last_rx);
-
-       mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME);
-}
-
-#define C_MAX_NAME 13 /* 12 chars + one for \0 */
-static int init_cntrs(struct hfi1_devdata *dd)
-{
-       int i, rcv_ctxts, j;
-       size_t sz;
-       char *p;
-       char name[C_MAX_NAME];
-       struct hfi1_pportdata *ppd;
-       const char *bit_type_32 = ",32";
-       const int bit_type_32_sz = strlen(bit_type_32);
-
-       /*
-        * Overview: build the device and port counter bookkeeping for dd.
-        * For each of the two counter tables (dev_cntrs[], port_cntrs[])
-        * the code makes two passes that must stay in lock-step: the
-        * first pass counts the enabled counters, records each table
-        * entry's starting offset and adds up the bytes needed for the
-        * name list; the second pass writes the names into a buffer of
-        * exactly that size.  Each name is followed by an optional ",32"
-        * (CNTR_32BIT) and a '\n'; no terminating NUL is stored, the
-        * length is kept in cntrnameslen / portcntrnameslen instead.
-        * Expanded names are formatted through name[], so they are
-        * truncated to C_MAX_NAME - 1 characters in both passes.
-        *
-        * Returns 0 on success.  Every failure goes through the bail
-        * label, which calls free_cntrs() and returns -ENOMEM.
-        *
-        * set up the stats timer; the add_timer is done at the end
-        */
-       setup_timer(&dd->synth_stats_timer, update_synth_timer,
-                   (unsigned long)dd);
-
-       /***********************/
-       /* per device counters */
-       /***********************/
-
-       /* pass 1: size the names and determine how many counters we have */
-       dd->ndevcntrs = 0;
-       sz = 0;
-
-       for (i = 0; i < DEV_CNTR_LAST; i++) {
-               if (dev_cntrs[i].flags & CNTR_DISABLED) {
-                       hfi1_dbg_early("\tSkipping %s\n", dev_cntrs[i].name);
-                       continue;
-               }
-
-               if (dev_cntrs[i].flags & CNTR_VL) {
-                       dev_cntrs[i].offset = dd->ndevcntrs;
-                       for (j = 0; j < C_VL_COUNT; j++) { /* one slot per VL index */
-                               snprintf(name, C_MAX_NAME, "%s%d",
-                                        dev_cntrs[i].name, vl_from_idx(j));
-                               sz += strlen(name);
-                               /* Add ",32" for 32-bit counters */
-                               if (dev_cntrs[i].flags & CNTR_32BIT)
-                                       sz += bit_type_32_sz;
-                               sz++; /* newline */
-                               dd->ndevcntrs++;
-                       }
-               } else if (dev_cntrs[i].flags & CNTR_SDMA) {
-                       dev_cntrs[i].offset = dd->ndevcntrs;
-                       for (j = 0; j < dd->chip_sdma_engines; j++) { /* one slot per SDMA engine */
-                               snprintf(name, C_MAX_NAME, "%s%d",
-                                        dev_cntrs[i].name, j);
-                               sz += strlen(name);
-                               /* Add ",32" for 32-bit counters */
-                               if (dev_cntrs[i].flags & CNTR_32BIT)
-                                       sz += bit_type_32_sz;
-                               sz++; /* newline */
-                               dd->ndevcntrs++;
-                       }
-               } else {
-                       /* single counter: full name, +1 for newline. */
-                       sz += strlen(dev_cntrs[i].name) + 1;
-                       /* Add ",32" for 32-bit counters */
-                       if (dev_cntrs[i].flags & CNTR_32BIT)
-                               sz += bit_type_32_sz;
-                       dev_cntrs[i].offset = dd->ndevcntrs;
-                       dd->ndevcntrs++;
-               }
-       }
-
-       /* allocate zeroed space for the counter values (cntrs and scntrs) */
-       dd->cntrs = kcalloc(dd->ndevcntrs, sizeof(u64), GFP_KERNEL);
-       if (!dd->cntrs)
-               goto bail;
-
-       dd->scntrs = kcalloc(dd->ndevcntrs, sizeof(u64), GFP_KERNEL);
-       if (!dd->scntrs)
-               goto bail;
-
-       /* allocate space for the counter names: exactly sz bytes, no NUL */
-       dd->cntrnameslen = sz;
-       dd->cntrnames = kmalloc(sz, GFP_KERNEL);
-       if (!dd->cntrnames)
-               goto bail;
-
-       /* pass 2: fill in the names, mirroring the sizing pass above */
-       for (p = dd->cntrnames, i = 0; i < DEV_CNTR_LAST; i++) {
-               if (dev_cntrs[i].flags & CNTR_DISABLED) {
-                       /* Nothing: disabled counters were not sized either */
-               } else if (dev_cntrs[i].flags & CNTR_VL) {
-                       for (j = 0; j < C_VL_COUNT; j++) {
-                               snprintf(name, C_MAX_NAME, "%s%d",
-                                        dev_cntrs[i].name,
-                                        vl_from_idx(j));
-                               memcpy(p, name, strlen(name));
-                               p += strlen(name);
-
-                               /* Counter is 32 bits */
-                               if (dev_cntrs[i].flags & CNTR_32BIT) {
-                                       memcpy(p, bit_type_32, bit_type_32_sz);
-                                       p += bit_type_32_sz;
-                               }
-
-                               *p++ = '\n';
-                       }
-               } else if (dev_cntrs[i].flags & CNTR_SDMA) {
-                       for (j = 0; j < dd->chip_sdma_engines; j++) {
-                               snprintf(name, C_MAX_NAME, "%s%d",
-                                        dev_cntrs[i].name, j);
-                               memcpy(p, name, strlen(name));
-                               p += strlen(name);
-
-                               /* Counter is 32 bits */
-                               if (dev_cntrs[i].flags & CNTR_32BIT) {
-                                       memcpy(p, bit_type_32, bit_type_32_sz);
-                                       p += bit_type_32_sz;
-                               }
-
-                               *p++ = '\n';
-                       }
-               } else {
-                       memcpy(p, dev_cntrs[i].name, strlen(dev_cntrs[i].name));
-                       p += strlen(dev_cntrs[i].name);
-
-                       /* Counter is 32 bits */
-                       if (dev_cntrs[i].flags & CNTR_32BIT) {
-                               memcpy(p, bit_type_32, bit_type_32_sz);
-                               p += bit_type_32_sz;
-                       }
-
-                       *p++ = '\n';
-               }
-       }
-
-       /*********************/
-       /* per port counters */
-       /*********************/
-
-       /*
-        * Go through the counters for the overflows and disable the ones we
-        * don't need. This varies based on platform so we need to do it
-        * dynamically here.
-        *
-        * Entries from C_RCV_HDR_OVF_FIRST + num_rcv_contexts through
-        * C_RCV_HDR_OVF_LAST are marked CNTR_DISABLED in port_cntrs[].
-        * NOTE(review): port_cntrs[] is not reached through dd here, so
-        * the flag presumably persists for any other device that uses
-        * the same table - confirm.
-        */
-       rcv_ctxts = dd->num_rcv_contexts;
-       for (i = C_RCV_HDR_OVF_FIRST + rcv_ctxts;
-            i <= C_RCV_HDR_OVF_LAST; i++) {
-               port_cntrs[i].flags |= CNTR_DISABLED;
-       }
-
-       /* pass 1: size port counter names and determine how many we have */
-       sz = 0;
-       dd->nportcntrs = 0;
-       for (i = 0; i < PORT_CNTR_LAST; i++) {
-               if (port_cntrs[i].flags & CNTR_DISABLED) {
-                       hfi1_dbg_early("\tSkipping %s\n", port_cntrs[i].name);
-                       continue;
-               }
-
-               if (port_cntrs[i].flags & CNTR_VL) {
-                       port_cntrs[i].offset = dd->nportcntrs;
-                       for (j = 0; j < C_VL_COUNT; j++) {
-                               snprintf(name, C_MAX_NAME, "%s%d",
-                                        port_cntrs[i].name, vl_from_idx(j));
-                               sz += strlen(name);
-                               /* Add ",32" for 32-bit counters */
-                               if (port_cntrs[i].flags & CNTR_32BIT)
-                                       sz += bit_type_32_sz;
-                               sz++; /* newline */
-                               dd->nportcntrs++;
-                       }
-               } else {
-                       /* +1 for newline */
-                       sz += strlen(port_cntrs[i].name) + 1;
-                       /* Add ",32" for 32-bit counters */
-                       if (port_cntrs[i].flags & CNTR_32BIT)
-                               sz += bit_type_32_sz;
-                       port_cntrs[i].offset = dd->nportcntrs;
-                       dd->nportcntrs++;
-               }
-       }
-
-       /* allocate space for the port counter names: exactly sz bytes */
-       dd->portcntrnameslen = sz;
-       dd->portcntrnames = kmalloc(sz, GFP_KERNEL);
-       if (!dd->portcntrnames)
-               goto bail;
-
-       /* pass 2: fill in port cntr names, mirroring the sizing pass */
-       for (p = dd->portcntrnames, i = 0; i < PORT_CNTR_LAST; i++) {
-               if (port_cntrs[i].flags & CNTR_DISABLED)
-                       continue;
-
-               if (port_cntrs[i].flags & CNTR_VL) {
-                       for (j = 0; j < C_VL_COUNT; j++) {
-                               snprintf(name, C_MAX_NAME, "%s%d",
-                                        port_cntrs[i].name, vl_from_idx(j));
-                               memcpy(p, name, strlen(name));
-                               p += strlen(name);
-
-                               /* Counter is 32 bits */
-                               if (port_cntrs[i].flags & CNTR_32BIT) {
-                                       memcpy(p, bit_type_32, bit_type_32_sz);
-                                       p += bit_type_32_sz;
-                               }
-
-                               *p++ = '\n';
-                       }
-               } else {
-                       memcpy(p, port_cntrs[i].name,
-                              strlen(port_cntrs[i].name));
-                       p += strlen(port_cntrs[i].name);
-
-                       /* Counter is 32 bits */
-                       if (port_cntrs[i].flags & CNTR_32BIT) {
-                               memcpy(p, bit_type_32, bit_type_32_sz);
-                               p += bit_type_32_sz;
-                       }
-
-                       *p++ = '\n';
-               }
-       }
-
-       /*
-        * allocate per port storage for counter values; the port
-        * structures are laid out directly after the devdata (dd + 1)
-        */
-       ppd = (struct hfi1_pportdata *)(dd + 1);
-       for (i = 0; i < dd->num_pports; i++, ppd++) {
-               ppd->cntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL);
-               if (!ppd->cntrs)
-                       goto bail;
-
-               ppd->scntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL);
-               if (!ppd->scntrs)
-                       goto bail;
-       }
-
-       /*
-        * CPU counters need to be allocated and zeroed; any non-zero
-        * return is reported to the caller as -ENOMEM via bail
-        */
-       if (init_cpu_counters(dd))
-               goto bail;
-
-       mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME); /* arm the timer set up at the top */
-       return 0;
-bail:
-       free_cntrs(dd); /* releases whatever was allocated above */
-       return -ENOMEM;
-}
-
-/*
- * Translate a chip logical link state (LSTATE_*) into the matching
- * IB_PORT_* value.  A value that is none of the known LSTATE_* codes is
- * logged and reported as IB_PORT_DOWN.
- */
-static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate)
-{
-       if (chip_lstate == LSTATE_ACTIVE)
-               return IB_PORT_ACTIVE;
-       if (chip_lstate == LSTATE_ARMED)
-               return IB_PORT_ARMED;
-       if (chip_lstate == LSTATE_INIT)
-               return IB_PORT_INIT;
-
-       /* everything else is DOWN; complain only if it was not LSTATE_DOWN */
-       if (chip_lstate != LSTATE_DOWN)
-               dd_dev_err(dd,
-                          "Unknown logical state 0x%x, reporting IB_PORT_DOWN\n",
-                          chip_lstate);
-       return IB_PORT_DOWN;
-}
-
-/*
- * Translate a chip physical link state into the IB/OPA physical port
- * state.  Only the bits selected by 0xf0 (the HFI meta-state) are
- * examined.  An unrecognized meta-state is logged, with the full
- * chip_pstate value, and reported as IB_PORTPHYSSTATE_DISABLED.
- */
-u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate)
-{
-       u32 meta_state = chip_pstate & 0xf0;
-
-       switch (meta_state) {
-       case PLS_OFFLINE:
-               return OPA_PORTPHYSSTATE_OFFLINE;
-       case PLS_POLLING:
-               return IB_PORTPHYSSTATE_POLLING;
-       case PLS_CONFIGPHY:
-               return IB_PORTPHYSSTATE_TRAINING;
-       case PLS_LINKUP:
-               return IB_PORTPHYSSTATE_LINKUP;
-       case PLS_PHYTEST:
-               return IB_PORTPHYSSTATE_PHY_TEST;
-       case PLS_DISABLED:
-               break;
-       default:
-               dd_dev_err(dd, "Unexpected chip physical state of 0x%x\n",
-                          chip_pstate);
-               break;
-       }
-
-       return IB_PORTPHYSSTATE_DISABLED;
-}
-
-/*
- * Return a printable name for an OPA port logical state, or "unknown"
- * when lstate is beyond the end of the name table.
- */
-const char *opa_lstate_name(u32 lstate)
-{
-       static const char * const lstate_names[] = {
-               [0] = "PORT_NOP",
-               [1] = "PORT_DOWN",
-               [2] = "PORT_INIT",
-               [3] = "PORT_ARMED",
-               [4] = "PORT_ACTIVE",
-               [5] = "PORT_ACTIVE_DEFER",
-       };
-
-       return lstate >= ARRAY_SIZE(lstate_names) ?
-               "unknown" : lstate_names[lstate];
-}
-
-/*
- * Return a printable name for an OPA port physical state, or "unknown"
- * when pstate is beyond the end of the name table.
- */
-const char *opa_pstate_name(u32 pstate)
-{
-       static const char * const pstate_names[] = {
-               [0]  = "PHYS_NOP",
-               [1]  = "reserved1",
-               [2]  = "PHYS_POLL",
-               [3]  = "PHYS_DISABLED",
-               [4]  = "PHYS_TRAINING",
-               [5]  = "PHYS_LINKUP",
-               [6]  = "PHYS_LINK_ERR_RECOVER",
-               [7]  = "PHYS_PHY_TEST",
-               [8]  = "reserved8",
-               [9]  = "PHYS_OFFLINE",
-               [10] = "PHYS_GANGED",
-               [11] = "PHYS_TEST",
-       };
-
-       return pstate >= ARRAY_SIZE(pstate_names) ?
-               "unknown" : pstate_names[pstate];
-}
-
-/*
- * Read the logical link state from hardware, update the cached copy in
- * ppd->lstate (logging any change) and return the cached value.
- *
- * When ppd->statusp is set, the status word it points at is refreshed
- * on every call, not only on a change, because the cached state may have
- * been modified outside of this function.
- */
-u32 get_logical_state(struct hfi1_pportdata *ppd)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       u32 hw_state = chip_to_opa_lstate(dd, read_logical_state(dd));
-
-       if (ppd->lstate != hw_state) {
-               dd_dev_info(dd, "logical state changed to %s (0x%x)\n",
-                           opa_lstate_name(hw_state), hw_state);
-               ppd->lstate = hw_state;
-       }
-
-       /* no status word to maintain */
-       if (!ppd->statusp)
-               return ppd->lstate;
-
-       /*
-        * DOWN/INIT clear both flags, ARMED sets CONF, ACTIVE sets READY;
-        * any other cached state leaves the status word untouched.
-        */
-       switch (ppd->lstate) {
-       case IB_PORT_ACTIVE:
-               *ppd->statusp |= HFI1_STATUS_IB_READY;
-               break;
-       case IB_PORT_ARMED:
-               *ppd->statusp |= HFI1_STATUS_IB_CONF;
-               break;
-       case IB_PORT_INIT:
-       case IB_PORT_DOWN:
-               *ppd->statusp &= ~(HFI1_STATUS_IB_CONF |
-                                  HFI1_STATUS_IB_READY);
-               break;
-       }
-
-       return ppd->lstate;
-}
-
-/**
- * wait_logical_linkstate - poll until the logical link state matches
- * @ppd: port device
- * @state: the logical state to wait for
- * @msecs: how long to keep polling, in milliseconds
- *
- * The state is sampled with get_logical_state() every 20 ms.  It is
- * always sampled at least once, and the deadline is only checked after a
- * failed sample.
- *
- * Returns 0 if state reached, otherwise -ETIMEDOUT (after logging).
- */
-static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
-                                 int msecs)
-{
-       unsigned long deadline = jiffies + msecs_to_jiffies(msecs);
-
-       for (;;) {
-               if (get_logical_state(ppd) == state)
-                       return 0;
-
-               if (time_after(jiffies, deadline)) {
-                       dd_dev_err(ppd->dd,
-                                  "timeout waiting for link state 0x%x\n",
-                                  state);
-                       return -ETIMEDOUT;
-               }
-
-               msleep(20);
-       }
-}
-
-/*
- * Read the chip physical link state, convert it with
- * chip_to_opa_pstate() and return the result.  A change relative to
- * ppd->last_pstate is logged and recorded there.
- */
-u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       u32 chip_state = read_physical_state(dd);
-       u32 opa_state = chip_to_opa_pstate(dd, chip_state);
-
-       if (ppd->last_pstate == opa_state)
-               return opa_state;
-
-       dd_dev_info(dd,
-                   "%s: physical state changed to %s (0x%x), phy 0x%x\n",
-                   __func__, opa_pstate_name(opa_state), opa_state,
-                   chip_state);
-       ppd->last_pstate = opa_state;
-
-       return opa_state;
-}
-
-/*
- * Read/modify/write the ASIC_QSFP output-enable register bits selected
- * by mask, then return the current value of the matching input register.
- * target: 0 selects the QSFP1 registers, non-zero selects QSFP2
- * data: 0 or 1 in the positions depending on what needs to be written
- * dir: 0 for read, 1 for write
- * mask: select by setting
- *      I2CCLK  (bit 0)
- *      I2CDATA (bit 1)
- *
- * NOTE(review): data is masked below but is not otherwise used in this
- * function; only the output-enable (direction) register is written.
- */
-u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir,
-                 u32 mask)
-{
-       u64 oe_csr, in_csr, oe_val;
-
-       if (target) {
-               oe_csr = ASIC_QSFP2_OE;
-               in_csr = ASIC_QSFP2_IN;
-       } else {
-               oe_csr = ASIC_QSFP1_OE;
-               in_csr = ASIC_QSFP1_IN;
-       }
-
-       if (mask) {
-               /*
-                * A non-zero mask means register bits get rewritten.
-                * NOTE(review): no lock is taken in this function;
-                * presumably the caller serializes access - confirm.
-                */
-               dir &= mask;
-               data &= mask;
-
-               oe_val = read_csr(dd, oe_csr);
-               oe_val &= ~(u64)mask;
-               oe_val |= (u64)dir;
-               write_csr(dd, oe_csr, oe_val);
-       }
-
-       /*
-        * The input register is read unconditionally.  It is unlikely to
-        * hold valid data when the pin direction was changed in this same
-        * call, so a reader should call this function again to get valid
-        * data.
-        */
-       return read_csr(dd, in_csr);
-}
-
-/* clear / set the "disallow PBC static rate control" check bit in r */
-#define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
-(r &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
-
-#define SET_STATIC_RATE_CONTROL_SMASK(r) \
-(r |= SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
-
-/*
- * Program the static-rate-control check of a send context.
- *
- * The STATIC_RATE_CTRL capability is looked up with the user variant
- * for SC_USER contexts and the kernel variant for all others.  When the
- * capability is set, the "disallow" bit in SEND_CTXT_CHECK_ENABLE is
- * cleared; otherwise it is set.  A NULL sc is accepted and ignored.
- *
- * Always returns 0.
- */
-int hfi1_init_ctxt(struct send_context *sc)
-{
-       struct hfi1_devdata *dd;
-       u64 check_enable;
-       u8 static_rate_ok;
-
-       if (!sc)
-               return 0;
-
-       dd = sc->dd;
-       if (sc->type == SC_USER)
-               static_rate_ok = HFI1_CAP_IS_USET(STATIC_RATE_CTRL);
-       else
-               static_rate_ok = HFI1_CAP_IS_KSET(STATIC_RATE_CTRL);
-
-       check_enable = read_kctxt_csr(dd, sc->hw_context,
-                                     SEND_CTXT_CHECK_ENABLE);
-       if (static_rate_ok)
-               CLEAR_STATIC_RATE_CONTROL_SMASK(check_enable);
-       else
-               SET_STATIC_RATE_CONTROL_SMASK(check_enable);
-       write_kctxt_csr(dd, sc->hw_context,
-                       SEND_CTXT_CHECK_ENABLE, check_enable);
-
-       return 0;
-}
-
-/*
- * Read ASIC_STS_THERM and unpack it into *temp: current temperature,
- * low/high/critical limits and the trigger bits.
- *
- * Returns 0 on success.  When dd->icode is not ICODE_RTL_SILICON the
- * register is not read, *temp is left untouched and -EINVAL is returned
- * (with an info message if the PRINT_UNIMPL kernel capability is set).
- */
-int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp)
-{
-       u64 therm;
-
-       if (dd->icode != ICODE_RTL_SILICON) {
-               if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
-                       dd_dev_info(dd, "%s: tempsense not supported by HW\n",
-                                   __func__);
-               return -EINVAL;
-       }
-
-       therm = read_csr(dd, ASIC_STS_THERM);
-
-       temp->curr = (therm >> ASIC_STS_THERM_CURR_TEMP_SHIFT) &
-                    ASIC_STS_THERM_CURR_TEMP_MASK;
-       temp->lo_lim = (therm >> ASIC_STS_THERM_LO_TEMP_SHIFT) &
-                      ASIC_STS_THERM_LO_TEMP_MASK;
-       temp->hi_lim = (therm >> ASIC_STS_THERM_HI_TEMP_SHIFT) &
-                      ASIC_STS_THERM_HI_TEMP_MASK;
-       temp->crit_lim = (therm >> ASIC_STS_THERM_CRIT_TEMP_SHIFT) &
-                        ASIC_STS_THERM_CRIT_TEMP_MASK;
-       /* three trigger bits, one per trigger, starting at the LOW bit */
-       temp->triggers = (u8)((therm >> ASIC_STS_THERM_LOW_SHIFT) & 0x7);
-
-       return 0;
-}
-
-/* ========================================================================= */
-
-/*
- * Enable or disable delivery of all chip interrupts.
- *
- * In HFI a mask bit of 1 allows the interrupt, so enabling writes all
- * ones to every CCE_INT_MASK CSR and disabling writes zero.  After an
- * enable, init_qsfp_int() is called as well.
- */
-void set_intr_state(struct hfi1_devdata *dd, u32 enable)
-{
-       u64 mask_val = enable ? ~(u64)0 : 0ull;
-       int csr;
-
-       for (csr = 0; csr < CCE_NUM_INT_CSRS; csr++)
-               write_csr(dd, CCE_INT_MASK + (8 * csr), mask_val);
-
-       if (enable)
-               init_qsfp_int(dd);
-}
-
-/*
- * Clear all interrupt sources on the chip by writing all ones to every
- * interrupt-clear and error-clear CSR: the CCE interrupt clears, the
- * global error clears, the per-send-context and per-SDMA-engine error
- * clears, and finally the DCC, LCB and 8051 error clears.
- */
-static void clear_all_interrupts(struct hfi1_devdata *dd)
-{
-       const u64 all_bits = ~(u64)0;
-       int n;
-
-       for (n = 0; n < CCE_NUM_INT_CSRS; n++)
-               write_csr(dd, CCE_INT_CLEAR + (8 * n), all_bits);
-
-       /* chip-wide error sources */
-       write_csr(dd, CCE_ERR_CLEAR, all_bits);
-       write_csr(dd, MISC_ERR_CLEAR, all_bits);
-       write_csr(dd, RCV_ERR_CLEAR, all_bits);
-       write_csr(dd, SEND_ERR_CLEAR, all_bits);
-       write_csr(dd, SEND_PIO_ERR_CLEAR, all_bits);
-       write_csr(dd, SEND_DMA_ERR_CLEAR, all_bits);
-       write_csr(dd, SEND_EGRESS_ERR_CLEAR, all_bits);
-
-       /* per send context, then per SDMA engine */
-       for (n = 0; n < dd->chip_send_contexts; n++)
-               write_kctxt_csr(dd, n, SEND_CTXT_ERR_CLEAR, all_bits);
-       for (n = 0; n < dd->chip_sdma_engines; n++)
-               write_kctxt_csr(dd, n, SEND_DMA_ENG_ERR_CLEAR, all_bits);
-
-       /* DC block error sources */
-       write_csr(dd, DCC_ERR_FLG_CLR, all_bits);
-       write_csr(dd, DC_LCB_ERR_CLR, all_bits);
-       write_csr(dd, DC_DC8051_ERR_CLR, all_bits);
-}
-
-/*
- * Disable legacy INTx for @pdev: a thin wrapper that calls pci_intx()
- * with an enable argument of 0.
- *
- * Move to pcie.c?
- */
-static void disable_intx(struct pci_dev *pdev)
-{
-       pci_intx(pdev, 0);
-}
-
-/*
- * Undo interrupt setup: release every requested IRQ, turn the interrupt
- * mechanism off (MSI-X or INTx, chosen by dd->num_msix_entries), then
- * free and reset the MSI-X entry table.
- *
- * IRQs must be released before the mechanism is disabled.
- */
-static void clean_up_interrupts(struct hfi1_devdata *dd)
-{
-       int n;
-
-       if (dd->num_msix_entries) {
-               /* MSI-X: drop affinity and free each vector that was requested */
-               for (n = 0; n < dd->num_msix_entries; n++) {
-                       struct hfi1_msix_entry *me = &dd->msix_entries[n];
-
-                       /* arg is only set once the irq was requested */
-                       if (!me->arg)
-                               continue;
-                       hfi1_put_irq_affinity(dd, me);
-                       free_irq(me->msix.vector, me->arg);
-               }
-
-               pci_disable_msix(dd->pcidev);
-       } else {
-               /* INTx: free the line only if we actually requested it */
-               if (dd->requested_intx_irq) {
-                       free_irq(dd->pcidev->irq, dd);
-                       dd->requested_intx_irq = 0;
-               }
-
-               disable_intx(dd->pcidev);
-       }
-
-       /* forget the MSI-X table */
-       kfree(dd->msix_entries);
-       dd->msix_entries = NULL;
-       dd->num_msix_entries = 0;
-}
-
-/*
- * Route chip interrupt source isrc to MSI-X interrupt msix_intr instead
- * of the general handler.
- *
- * Two things change: the source's bit is removed from dd->gi_mask (64
- * sources per word), and the source's 8-bit field in the CCE_INT_MAP
- * CSRs (8 sources per 64-bit CSR) is set to the low byte of msix_intr.
- */
-static void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr)
-{
-       int gi_word = isrc / 64;
-       int gi_bit = isrc % 64;
-       int map_csr = isrc / 8;
-       int map_shift = 8 * (isrc % 8);
-       u64 map_val;
-
-       /* the general handler no longer owns this source */
-       dd->gi_mask[gi_word] &= ~((u64)1 << gi_bit);
-
-       /* replace this source's byte in its map CSR */
-       map_val = read_csr(dd, CCE_INT_MAP + (8 * map_csr));
-       map_val &= ~((u64)0xff << map_shift);
-       map_val |= ((u64)msix_intr & 0xff) << map_shift;
-       write_csr(dd, CCE_INT_MAP + (8 * map_csr), map_val);
-}
-
-/*
- * Route all three interrupt sources of one SDMA engine to msix_intr.
- *
- * SDMA engine interrupt sources are grouped by type, rather than by
- * engine, with TXE_NUM_SDMA_ENGINES sources per group.  The groups are,
- * in order:
- *      SDMA
- *      SDMAProgress
- *      SDMAIdle
- */
-static void remap_sdma_interrupts(struct hfi1_devdata *dd,
-                                 int engine, int msix_intr)
-{
-       int group;
-
-       for (group = 0; group < 3; group++)
-               remap_intr(dd,
-                          IS_SDMA_START + group * TXE_NUM_SDMA_ENGINES +
-                          engine,
-                          msix_intr);
-}
-
-/*
- * Request the device's INTx interrupt as a shared IRQ handled by
- * general_interrupt(), named DRIVER_NAME "_<unit>".
- *
- * Returns 0 and sets dd->requested_intx_irq on success; otherwise logs
- * and returns the request_irq() error.
- */
-static int request_intx_irq(struct hfi1_devdata *dd)
-{
-       int err;
-
-       snprintf(dd->intx_name, sizeof(dd->intx_name), DRIVER_NAME "_%d",
-                dd->unit);
-
-       err = request_irq(dd->pcidev->irq, general_interrupt,
-                         IRQF_SHARED, dd->intx_name, dd);
-       if (err) {
-               dd_dev_err(dd, "unable to request INTx interrupt, err %d\n",
-                          err);
-               return err;
-       }
-
-       dd->requested_intx_irq = 1;
-       return 0;
-}
-
-/*
- * Request an IRQ for every MSI-X entry in dd->msix_entries.
- *
- * The entries are consumed in three consecutive ranges:
- *      [0, 1)                        the general ("slow path") interrupt
- *      next dd->num_sdma entries     one per SDMA engine
- *      next dd->n_krcv_queues        one per kernel receive context
- * SDMA and receive sources are remapped from the general handler to
- * their own vector before the IRQ is requested.  Receive contexts
- * without an rcd, and entries beyond the expected ranges, are skipped.
- *
- * Returns 0 on success or the request_threaded_irq() error.  On error
- * the vectors requested so far keep me->arg set, so
- * clean_up_interrupts() can release them.
- *
- * A failure to pin a vector's affinity is logged but never fails the
- * setup.  Previously the affinity result of the last vector processed
- * leaked out as the return value, so a pin failure was fatal for that
- * one vector only and ignored for all the others.
- */
-static int request_msix_irqs(struct hfi1_devdata *dd)
-{
-       int first_general, last_general;
-       int first_sdma, last_sdma;
-       int first_rx, last_rx;
-       int i, ret;
-
-       /* calculate the ranges we are going to use */
-       first_general = 0;
-       last_general = first_general + 1;
-       first_sdma = last_general;
-       last_sdma = first_sdma + dd->num_sdma;
-       first_rx = last_sdma;
-       last_rx = first_rx + dd->n_krcv_queues;
-
-       /*
-        * Sanity check - the code expects all SDMA chip source
-        * interrupts to be in the same CSR, starting at bit 0.  Verify
-        * that this is true by checking the bit location of the start.
-        */
-       BUILD_BUG_ON(IS_SDMA_START % 64);
-
-       for (i = 0; i < dd->num_msix_entries; i++) {
-               struct hfi1_msix_entry *me = &dd->msix_entries[i];
-               const char *err_info;
-               irq_handler_t handler;
-               irq_handler_t thread = NULL;
-               void *arg;
-               int idx;
-               int pin_ret;
-               struct hfi1_ctxtdata *rcd = NULL;
-               struct sdma_engine *sde = NULL;
-
-               /* obtain the arguments to request_irq */
-               if (first_general <= i && i < last_general) {
-                       idx = i - first_general;
-                       handler = general_interrupt;
-                       arg = dd;
-                       snprintf(me->name, sizeof(me->name),
-                                DRIVER_NAME "_%d", dd->unit);
-                       err_info = "general";
-                       me->type = IRQ_GENERAL;
-               } else if (first_sdma <= i && i < last_sdma) {
-                       idx = i - first_sdma;
-                       sde = &dd->per_sdma[idx];
-                       handler = sdma_interrupt;
-                       arg = sde;
-                       snprintf(me->name, sizeof(me->name),
-                                DRIVER_NAME "_%d sdma%d", dd->unit, idx);
-                       err_info = "sdma";
-                       remap_sdma_interrupts(dd, idx, i);
-                       me->type = IRQ_SDMA;
-               } else if (first_rx <= i && i < last_rx) {
-                       idx = i - first_rx;
-                       rcd = dd->rcd[idx];
-                       /* no interrupt if no rcd */
-                       if (!rcd)
-                               continue;
-                       /*
-                        * Set the interrupt register and mask for this
-                        * context's interrupt.
-                        */
-                       rcd->ireg = (IS_RCVAVAIL_START + idx) / 64;
-                       rcd->imask = ((u64)1) <<
-                                       ((IS_RCVAVAIL_START + idx) % 64);
-                       handler = receive_context_interrupt;
-                       thread = receive_context_thread;
-                       arg = rcd;
-                       snprintf(me->name, sizeof(me->name),
-                                DRIVER_NAME "_%d kctxt%d", dd->unit, idx);
-                       err_info = "receive context";
-                       remap_intr(dd, IS_RCVAVAIL_START + idx, i);
-                       me->type = IRQ_RCVCTXT;
-               } else {
-                       /* not in our expected range - complain, then
-                        * ignore it
-                        */
-                       dd_dev_err(dd,
-                                  "Unexpected extra MSI-X interrupt %d\n", i);
-                       continue;
-               }
-               /* no argument, no interrupt */
-               if (!arg)
-                       continue;
-               /* make sure the name is terminated */
-               me->name[sizeof(me->name) - 1] = 0;
-
-               ret = request_threaded_irq(me->msix.vector, handler, thread, 0,
-                                          me->name, arg);
-               if (ret) {
-                       dd_dev_err(dd,
-                                  "unable to allocate %s interrupt, vector %d, index %d, err %d\n",
-                                  err_info, me->msix.vector, idx, ret);
-                       return ret;
-               }
-               /*
-                * assign arg after request_irq call, so it will be
-                * cleaned up
-                */
-               me->arg = arg;
-
-               /*
-                * Pinning is best effort: report a failure, but keep it
-                * out of the function's return value.
-                */
-               pin_ret = hfi1_get_irq_affinity(dd, me);
-               if (pin_ret)
-                       dd_dev_err(dd,
-                                  "unable to pin IRQ %d\n", pin_ret);
-       }
-
-       return 0;
-}
-
-/*
- * Return interrupt routing to its default: the general handler owns
- * every source (all gi_mask bits set) and every CCE_INT_MAP CSR is
- * zeroed, which maps all chip interrupts back to MSI-X 0.
- */
-static void reset_interrupts(struct hfi1_devdata *dd)
-{
-       int word, csr;
-
-       /* general handler mask: claim everything */
-       for (word = 0; word < CCE_NUM_INT_CSRS; word++)
-               dd->gi_mask[word] = ~(u64)0;
-
-       /* chip map: every source points at MSI-X 0 */
-       for (csr = 0; csr < CCE_NUM_INT_MAP_CSRS; csr++)
-               write_csr(dd, CCE_INT_MAP + (8 * csr), 0);
-}
-
-/*
- * Allocate and request the device's interrupts.
- *
- * Interrupts wanted:
- *      1 general, "slow path" interrupt (includes the SDMA engines
- *              slow source, SDMACleanupDone)
- *      N interrupts - one per used SDMA engine
- *      M interrupts - one per kernel receive context
- *
- * MSI-X is tried first.  If request_msix() grants nothing, the driver
- * falls back to a single INTx interrupt; if it grants fewer vectors than
- * wanted, setup fails with -EINVAL.  All chip interrupts are masked and
- * cleared, and routing is reset, before any handler is requested.
- *
- * Returns 0 on success.  On any failure clean_up_interrupts() is called
- * and the error is returned.
- */
-static int set_up_interrupts(struct hfi1_devdata *dd)
-{
-       struct hfi1_msix_entry *vectors;
-       u32 wanted, granted;
-       int n, ret;
-       int use_intx = 0; /* we expect to have all the interrupts */
-
-       wanted = 1 + dd->num_sdma + dd->n_krcv_queues;
-
-       vectors = kcalloc(wanted, sizeof(*vectors), GFP_KERNEL);
-       if (!vectors) {
-               ret = -ENOMEM;
-               goto fail;
-       }
-
-       /* 1-1 MSI-X entry assignment */
-       for (n = 0; n < wanted; n++)
-               vectors[n].msix.entry = n;
-
-       /* ask for MSI-X interrupts; granted is updated with the result */
-       granted = wanted;
-       request_msix(dd, &granted, vectors);
-
-       if (!granted) {
-               /* INTx fallback; dd->num_msix_entries is already zero */
-               kfree(vectors);
-               use_intx = 1;
-               dd_dev_err(dd, "MSI-X failed, using INTx interrupts\n");
-       } else {
-               /* hand the table to dd first so the fail path frees it */
-               dd->num_msix_entries = granted;
-               dd->msix_entries = vectors;
-
-               if (granted != wanted) {
-                       /* a reduced vector count is not supported */
-                       dd_dev_err(
-                               dd,
-                               "cannot handle reduced interrupt case, want %u, got %u\n",
-                               wanted, granted);
-                       ret = -EINVAL;
-                       goto fail;
-               }
-               dd_dev_info(dd, "%u MSI-X interrupts allocated\n", wanted);
-       }
-
-       /* quiesce the chip: mask, clear pending, reset handler routing */
-       set_intr_state(dd, 0);
-       clear_all_interrupts(dd);
-       reset_interrupts(dd);
-
-       ret = use_intx ? request_intx_irq(dd) : request_msix_irqs(dd);
-       if (!ret)
-               return 0;
-
-fail:
-       clean_up_interrupts(dd);
-       return ret;
-}
-
-/*
- * Set up context values in dd.  Sets:
- *
- *     num_rcv_contexts - number of contexts being used
- *     n_krcv_queues - number of kernel contexts
- *     first_user_ctxt - first non-kernel context in array of contexts
- *     freectxts  - number of free user contexts
- *     num_send_contexts - number of PIO send contexts being used
- *
- * Also fills in dd->num_user_contexts and dd->rcv_entries, and overwrites
- * num_user_contexts (declared outside this function) when it is negative
- * or has to be reduced to fit the chip or RMT limits.
- *
- * Returns 0 on success, otherwise the negative value returned by
- * init_sc_pools_and_sizes().
- */
-static int set_up_context_variables(struct hfi1_devdata *dd)
-{
-       int num_kernel_contexts;
-       int total_contexts;
-       int ret;
-       unsigned ngroups;
-       int qos_rmt_count;
-       int user_rmt_reduced;
-
-       /*
-        * Kernel receive contexts:
-        * - min of 2 or 1 context/numa (excluding control context)
-        * - Context 0 - control context (VL15/multicast/error)
-        * - Context 1 - first kernel context
-        * - Context 2 - second kernel context
-        * ...
-        */
-       if (n_krcvqs)
-               /*
-                * n_krcvqs is the sum of module parameter kernel receive
-                * contexts, krcvqs[].  It does not include the control
-                * context, so add that.
-                */
-               num_kernel_contexts = n_krcvqs + 1;
-       else
-               num_kernel_contexts = num_online_nodes() + 1;
-       /* never use fewer than MIN_KERNEL_KCTXTS kernel contexts */
-       num_kernel_contexts =
-               max_t(int, MIN_KERNEL_KCTXTS, num_kernel_contexts);
-       /*
-        * Every kernel receive context needs an ACK send context.
-        * one send context is allocated for each VL{0-7} and VL15
-        */
-       if (num_kernel_contexts > (dd->chip_send_contexts - num_vls - 1)) {
-               dd_dev_err(dd,
-                          "Reducing # kernel rcv contexts to: %d, from %d\n",
-                          (int)(dd->chip_send_contexts - num_vls - 1),
-                          (int)num_kernel_contexts);
-               num_kernel_contexts = dd->chip_send_contexts - num_vls - 1;
-       }
-       /*
-        * User contexts:
-        *      - default to 1 user context per real (non-HT) CPU core if
-        *        num_user_contexts is negative
-        */
-       if (num_user_contexts < 0)
-               num_user_contexts =
-                       cpumask_weight(&dd->affinity->real_cpu_mask);
-
-       total_contexts = num_kernel_contexts + num_user_contexts;
-
-       /*
-        * Adjust the counts given a global max.
-        * Only the user context count is reduced here; the kernel count
-        * is left as computed above.
-        */
-       if (total_contexts > dd->chip_rcv_contexts) {
-               dd_dev_err(dd,
-                          "Reducing # user receive contexts to: %d, from %d\n",
-                          (int)(dd->chip_rcv_contexts - num_kernel_contexts),
-                          (int)num_user_contexts);
-               num_user_contexts = dd->chip_rcv_contexts - num_kernel_contexts;
-               /* recalculate */
-               total_contexts = num_kernel_contexts + num_user_contexts;
-       }
-
-       /* each user context requires an entry in the RMT */
-       qos_rmt_count = qos_rmt_entries(dd, NULL, NULL);
-       if (qos_rmt_count + num_user_contexts > NUM_MAP_ENTRIES) {
-               /* user contexts get whatever RMT entries QOS does not use */
-               user_rmt_reduced = NUM_MAP_ENTRIES - qos_rmt_count;
-               dd_dev_err(dd,
-                          "RMT size is reducing the number of user receive contexts from %d to %d\n",
-                          (int)num_user_contexts,
-                          user_rmt_reduced);
-               /* recalculate */
-               num_user_contexts = user_rmt_reduced;
-               total_contexts = num_kernel_contexts + num_user_contexts;
-       }
-
-       /* the first N are kernel contexts, the rest are user contexts */
-       dd->num_rcv_contexts = total_contexts;
-       dd->n_krcv_queues = num_kernel_contexts;
-       dd->first_user_ctxt = num_kernel_contexts;
-       dd->num_user_contexts = num_user_contexts;
-       dd->freectxts = num_user_contexts;
-       dd_dev_info(dd,
-                   "rcv contexts: chip %d, used %d (kernel %d, user %d)\n",
-                   (int)dd->chip_rcv_contexts,
-                   (int)dd->num_rcv_contexts,
-                   (int)dd->n_krcv_queues,
-                   (int)dd->num_rcv_contexts - dd->n_krcv_queues);
-
-       /*
-        * Receive array allocation:
-        *   All RcvArray entries are divided into groups of 8. This
-        *   is required by the hardware and will speed up writes to
-        *   consecutive entries by using write-combining of the entire
-        *   cacheline.
-        *
-        *   The number of groups are evenly divided among all contexts.
-        *   any left over groups will be given to the first N user
-        *   contexts.
-        */
-       dd->rcv_entries.group_size = RCV_INCREMENT;
-       ngroups = dd->chip_rcv_array_count / dd->rcv_entries.group_size;
-       dd->rcv_entries.ngroups = ngroups / dd->num_rcv_contexts;
-       /* groups left over after the even split */
-       dd->rcv_entries.nctxt_extra = ngroups -
-               (dd->num_rcv_contexts * dd->rcv_entries.ngroups);
-       dd_dev_info(dd, "RcvArray groups %u, ctxts extra %u\n",
-                   dd->rcv_entries.ngroups,
-                   dd->rcv_entries.nctxt_extra);
-       /*
-        * Cap the per-context share at MAX_EAGER_ENTRIES * 2 entries; when
-        * the cap applies no extra groups are handed out.
-        */
-       if (dd->rcv_entries.ngroups * dd->rcv_entries.group_size >
-           MAX_EAGER_ENTRIES * 2) {
-               dd->rcv_entries.ngroups = (MAX_EAGER_ENTRIES * 2) /
-                       dd->rcv_entries.group_size;
-               dd_dev_info(dd,
-                           "RcvArray group count too high, change to %u\n",
-                           dd->rcv_entries.ngroups);
-               dd->rcv_entries.nctxt_extra = 0;
-       }
-       /*
-        * PIO send contexts
-        */
-       ret = init_sc_pools_and_sizes(dd);
-       if (ret >= 0) { /* success */
-               /* a non-negative return is the number of send contexts */
-               dd->num_send_contexts = ret;
-               dd_dev_info(
-                       dd,
-                       "send contexts: chip %d, used %d (kernel %d, ack %d, user %d, vl15 %d)\n",
-                       dd->chip_send_contexts,
-                       dd->num_send_contexts,
-                       dd->sc_sizes[SC_KERNEL].count,
-                       dd->sc_sizes[SC_ACK].count,
-                       dd->sc_sizes[SC_USER].count,
-                       dd->sc_sizes[SC_VL15].count);
-               ret = 0;        /* success */
-       }
-
-       /* on failure the negative value is passed through unchanged */
-       return ret;
-}
-
-/*
- * Program the port's partition key table into the RcvPartitionKey CSRs
- * and turn on the hardware PKey check.  The MAD code will ensure that,
- * at least, the partial management partition key is present in the table.
- *
- * Four keys are packed into each 64-bit register; a register is written
- * as soon as its fourth key has been merged in.
- */
-static void set_partition_keys(struct hfi1_pportdata *ppd)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       u64 packed = 0;
-       int idx;
-
-       dd_dev_info(dd, "Setting partition keys\n");
-       for (idx = 0; idx < hfi1_get_npkeys(dd); idx++) {
-               int slot = idx % 4;
-
-               packed |= (ppd->pkeys[idx] &
-                          RCV_PARTITION_KEY_PARTITION_KEY_A_MASK) <<
-                         (slot * RCV_PARTITION_KEY_PARTITION_KEY_B_SHIFT);
-
-               /* keep accumulating until the register is full */
-               if (slot != 3)
-                       continue;
-
-               /* (idx - 3) * 2: CSR offset advances 2 bytes per key */
-               write_csr(dd, RCV_PARTITION_KEY + ((idx - 3) * 2), packed);
-               packed = 0;
-       }
-
-       /* the HW pkey check is always enabled once the table is set */
-       add_rcvctrl(dd, RCV_CTRL_RCV_PARTITION_KEY_ENABLE_SMASK);
-}
-
-/*
- * Several CSRs and chip memories are uninitialized on reset; each one is
- * written once here, before any read, so that its ECC/parity bits are set.
- *
- * NOTE: user context CSRs that are not mmaped write-only (e.g. the TID
- * flows) have to be written as well, even if the driver never reads them.
- */
-static void write_uninitialized_csrs_and_memories(struct hfi1_devdata *dd)
-{
-       int idx, flow;
-
-       /* CceIntMap */
-       for (idx = 0; idx < CCE_NUM_INT_MAP_CSRS; idx++) {
-               write_csr(dd, CCE_INT_MAP + (8 * idx), 0);
-       }
-
-       /* SendCtxtCreditReturnAddr, one per chip send context */
-       for (idx = 0; idx < dd->chip_send_contexts; idx++) {
-               write_kctxt_csr(dd, idx, SEND_CTXT_CREDIT_RETURN_ADDR, 0);
-       }
-
-       /*
-        * The PIO and SDMA send buffers are skipped: they are not normally
-        * read, and (presently) have no method to be read.
-        */
-
-       /* RcvHdrAddr, RcvHdrTailAddr and RcvTidFlowTable, per rcv context */
-       for (idx = 0; idx < dd->chip_rcv_contexts; idx++) {
-               write_kctxt_csr(dd, idx, RCV_HDR_ADDR, 0);
-               write_kctxt_csr(dd, idx, RCV_HDR_TAIL_ADDR, 0);
-               for (flow = 0; flow < RXE_NUM_TID_FLOWS; flow++) {
-                       write_uctxt_csr(dd, idx,
-                                       RCV_TID_FLOW_TABLE + (8 * flow), 0);
-               }
-       }
-
-       /* RcvArray - every entry is written with the write-enable mask */
-       for (idx = 0; idx < dd->chip_rcv_array_count; idx++) {
-               write_csr(dd, RCV_ARRAY + (8 * idx),
-                         RCV_ARRAY_RT_WRITE_ENABLE_SMASK);
-       }
-
-       /* RcvQPMapTable */
-       for (idx = 0; idx < 32; idx++) {
-               write_csr(dd, RCV_QP_MAP_TABLE + (8 * idx), 0);
-       }
-}
-
-/*
- * Clear status_bits in CceStatus by writing ctrl_bits to CceCtrl.
- *
- * Returns immediately when none of status_bits is set.  Otherwise the
- * control write is issued once and CceStatus is polled until the bits
- * drop or CCE_STATUS_TIMEOUT milliseconds have passed; a timeout is
- * logged but not reported to the caller.
- */
-static void clear_cce_status(struct hfi1_devdata *dd, u64 status_bits,
-                            u64 ctrl_bits)
-{
-       unsigned long deadline;
-       u64 status;
-
-       /* nothing to do unless the condition is present */
-       status = read_csr(dd, CCE_STATUS);
-       if (!(status & status_bits))
-               return;
-
-       /* ask the chip to clear the condition */
-       write_csr(dd, CCE_CTRL, ctrl_bits);
-
-       /* poll until the bits are gone or the deadline passes */
-       deadline = jiffies + msecs_to_jiffies(CCE_STATUS_TIMEOUT);
-       for (;;) {
-               status = read_csr(dd, CCE_STATUS);
-               if (!(status & status_bits))
-                       break;
-               if (time_after(jiffies, deadline)) {
-                       dd_dev_err(dd,
-                                  "Timeout waiting for CceStatus to clear bits 0x%llx, remaining 0x%llx\n",
-                                  status_bits, status & status_bits);
-                       break;
-               }
-               udelay(1);
-       }
-}
-
-/*
- * Set CCE CSRs to chip reset defaults.
- *
- * Read-only CSRs and CSRs that are deliberately left alone are listed in
- * comments so the sequence can be checked against the register list.
- */
-static void reset_cce_csrs(struct hfi1_devdata *dd)
-{
-       int i;
-
-       /* CCE_REVISION read-only */
-       /* CCE_REVISION2 read-only */
-       /* CCE_CTRL - bits clear automatically */
-       /* CCE_STATUS read-only, use CceCtrl to clear */
-       clear_cce_status(dd, ALL_FROZE, CCE_CTRL_SPC_UNFREEZE_SMASK);
-       clear_cce_status(dd, ALL_TXE_PAUSE, CCE_CTRL_TXE_RESUME_SMASK);
-       clear_cce_status(dd, ALL_RXE_PAUSE, CCE_CTRL_RXE_RESUME_SMASK);
-       for (i = 0; i < CCE_NUM_SCRATCH; i++)
-               write_csr(dd, CCE_SCRATCH + (8 * i), 0);
-       /* CCE_ERR_STATUS read-only */
-       write_csr(dd, CCE_ERR_MASK, 0);
-       write_csr(dd, CCE_ERR_CLEAR, ~0ull);
-       /* CCE_ERR_FORCE leave alone */
-       for (i = 0; i < CCE_NUM_32_BIT_COUNTERS; i++)
-               write_csr(dd, CCE_COUNTER_ARRAY32 + (8 * i), 0);
-       write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_RESETCSR);
-       /* CCE_PCIE_CTRL leave alone */
-       for (i = 0; i < CCE_NUM_MSIX_VECTORS; i++) {
-               write_csr(dd, CCE_MSIX_TABLE_LOWER + (8 * i), 0);
-               write_csr(dd, CCE_MSIX_TABLE_UPPER + (8 * i),
-                         CCE_MSIX_TABLE_UPPER_RESETCSR);
-       }
-       /*
-        * NOTE(review): the two CSRs below are written without an index on
-        * every pass; confirm whether they are single registers or arrays
-        * that should be stepped by (8 * i).
-        */
-       for (i = 0; i < CCE_NUM_MSIX_PBAS; i++) {
-               /* CCE_MSIX_PBA read-only */
-               write_csr(dd, CCE_MSIX_INT_GRANTED, ~0ull);
-               write_csr(dd, CCE_MSIX_VEC_CLR_WITHOUT_INT, ~0ull);
-       }
-       /*
-        * CceIntMap is an array of CSRs spaced 8 bytes apart; step through
-        * every entry instead of rewriting the first one each pass.
-        */
-       for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
-               write_csr(dd, CCE_INT_MAP + (8 * i), 0);
-       for (i = 0; i < CCE_NUM_INT_CSRS; i++) {
-               /* CCE_INT_STATUS read-only */
-               write_csr(dd, CCE_INT_MASK + (8 * i), 0);
-               write_csr(dd, CCE_INT_CLEAR + (8 * i), ~0ull);
-               /* CCE_INT_FORCE leave alone */
-               /* CCE_INT_BLOCKED read-only */
-       }
-       for (i = 0; i < CCE_NUM_32_BIT_INT_COUNTERS; i++)
-               write_csr(dd, CCE_INT_COUNTER_ARRAY32 + (8 * i), 0);
-}
-
-/*
- * Set MISC CSRs to chip reset defaults.  Read-only CSRs and CSRs that
- * are left alone are noted in comments.
- */
-static void reset_misc_csrs(struct hfi1_devdata *dd)
-{
-       int n;
-
-       /* zero 32 entries of each RSA array, one 8-byte CSR at a time */
-       for (n = 0; n < 32; n++) {
-               int off = 8 * n;
-
-               write_csr(dd, MISC_CFG_RSA_R2 + off, 0);
-               write_csr(dd, MISC_CFG_RSA_SIGNATURE + off, 0);
-               write_csr(dd, MISC_CFG_RSA_MODULUS + off, 0);
-       }
-
-       /*
-        * MISC_CFG_SHA_PRELOAD leave alone - always reads 0 and can
-        * only be written 128-byte chunks
-        */
-
-       /* init RSA engine to clear lingering errors */
-       write_csr(dd, MISC_CFG_RSA_CMD, 1);
-       write_csr(dd, MISC_CFG_RSA_MU, 0);
-       write_csr(dd, MISC_CFG_FW_CTRL, 0);
-
-       /*
-        * Read-only: MISC_STS_8051_DIGEST, MISC_STS_SBM_DIGEST,
-        * MISC_STS_PCIE_DIGEST, MISC_STS_FAB_DIGEST, MISC_ERR_STATUS
-        */
-       write_csr(dd, MISC_ERR_MASK, 0);
-       write_csr(dd, MISC_ERR_CLEAR, ~0ull);
-       /* MISC_ERR_FORCE leave alone */
-}
-
-/*
- * Set TXE CSRs to chip reset defaults.
- *
- * Covers the TXE kernel CSRs, then the per-send-context CSRs for every
- * chip send context, then the per-engine CSRs for every chip SDMA engine.
- * Read-only CSRs and CSRs that are left alone are listed in comments.
- */
-static void reset_txe_csrs(struct hfi1_devdata *dd)
-{
-       int i;
-
-       /*
-        * TXE Kernel CSRs
-        */
-       write_csr(dd, SEND_CTRL, 0);
-       __cm_reset(dd, 0);      /* reset CM internal state */
-       /* SEND_CONTEXTS read-only */
-       /* SEND_DMA_ENGINES read-only */
-       /* SEND_PIO_MEM_SIZE read-only */
-       /* SEND_DMA_MEM_SIZE read-only */
-       write_csr(dd, SEND_HIGH_PRIORITY_LIMIT, 0);
-       pio_reset_all(dd);      /* SEND_PIO_INIT_CTXT */
-       /* SEND_PIO_ERR_STATUS read-only */
-       write_csr(dd, SEND_PIO_ERR_MASK, 0);
-       write_csr(dd, SEND_PIO_ERR_CLEAR, ~0ull);
-       /* SEND_PIO_ERR_FORCE leave alone */
-       /* SEND_DMA_ERR_STATUS read-only */
-       write_csr(dd, SEND_DMA_ERR_MASK, 0);
-       write_csr(dd, SEND_DMA_ERR_CLEAR, ~0ull);
-       /* SEND_DMA_ERR_FORCE leave alone */
-       /* SEND_EGRESS_ERR_STATUS read-only */
-       write_csr(dd, SEND_EGRESS_ERR_MASK, 0);
-       write_csr(dd, SEND_EGRESS_ERR_CLEAR, ~0ull);
-       /* SEND_EGRESS_ERR_FORCE leave alone */
-       write_csr(dd, SEND_BTH_QP, 0);
-       write_csr(dd, SEND_STATIC_RATE_CONTROL, 0);
-       write_csr(dd, SEND_SC2VLT0, 0);
-       write_csr(dd, SEND_SC2VLT1, 0);
-       write_csr(dd, SEND_SC2VLT2, 0);
-       write_csr(dd, SEND_SC2VLT3, 0);
-       write_csr(dd, SEND_LEN_CHECK0, 0);
-       write_csr(dd, SEND_LEN_CHECK1, 0);
-       /* SEND_ERR_STATUS read-only */
-       write_csr(dd, SEND_ERR_MASK, 0);
-       write_csr(dd, SEND_ERR_CLEAR, ~0ull);
-       /*
-        * SEND_ERR_FORCE read-only
-        * NOTE(review): the other *_ERR_FORCE CSRs in this function are
-        * marked "leave alone" - confirm which one applies here.
-        */
-       for (i = 0; i < VL_ARB_LOW_PRIO_TABLE_SIZE; i++)
-               write_csr(dd, SEND_LOW_PRIORITY_LIST + (8 * i), 0);
-       for (i = 0; i < VL_ARB_HIGH_PRIO_TABLE_SIZE; i++)
-               write_csr(dd, SEND_HIGH_PRIORITY_LIST + (8 * i), 0);
-       /* one SEND_CONTEXT_SET_CTRL CSR per NUM_CONTEXTS_PER_SET contexts */
-       for (i = 0; i < dd->chip_send_contexts / NUM_CONTEXTS_PER_SET; i++)
-               write_csr(dd, SEND_CONTEXT_SET_CTRL + (8 * i), 0);
-       for (i = 0; i < TXE_NUM_32_BIT_COUNTER; i++)
-               write_csr(dd, SEND_COUNTER_ARRAY32 + (8 * i), 0);
-       for (i = 0; i < TXE_NUM_64_BIT_COUNTER; i++)
-               write_csr(dd, SEND_COUNTER_ARRAY64 + (8 * i), 0);
-       write_csr(dd, SEND_CM_CTRL, SEND_CM_CTRL_RESETCSR);
-       write_csr(dd, SEND_CM_GLOBAL_CREDIT, SEND_CM_GLOBAL_CREDIT_RESETCSR);
-       /* SEND_CM_CREDIT_USED_STATUS read-only */
-       write_csr(dd, SEND_CM_TIMER_CTRL, 0);
-       write_csr(dd, SEND_CM_LOCAL_AU_TABLE0_TO3, 0);
-       write_csr(dd, SEND_CM_LOCAL_AU_TABLE4_TO7, 0);
-       write_csr(dd, SEND_CM_REMOTE_AU_TABLE0_TO3, 0);
-       write_csr(dd, SEND_CM_REMOTE_AU_TABLE4_TO7, 0);
-       for (i = 0; i < TXE_NUM_DATA_VL; i++)
-               write_csr(dd, SEND_CM_CREDIT_VL + (8 * i), 0);
-       write_csr(dd, SEND_CM_CREDIT_VL15, 0);
-       /* SEND_CM_CREDIT_USED_VL read-only */
-       /* SEND_CM_CREDIT_USED_VL15 read-only */
-       /* SEND_EGRESS_CTXT_STATUS read-only */
-       /* SEND_EGRESS_SEND_DMA_STATUS read-only */
-       write_csr(dd, SEND_EGRESS_ERR_INFO, ~0ull);
-       /*
-        * SEND_EGRESS_ERR_INFO read-only
-        * NOTE(review): this comment contradicts the write just above -
-        * confirm whether the CSR is write-to-clear or the comment names
-        * the wrong register.
-        */
-       /* SEND_EGRESS_ERR_SOURCE read-only */
-
-       /*
-        * TXE Per-Context CSRs
-        */
-       for (i = 0; i < dd->chip_send_contexts; i++) {
-               write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0);
-               write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_CTRL, 0);
-               write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0);
-               write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_FORCE, 0);
-               write_kctxt_csr(dd, i, SEND_CTXT_ERR_MASK, 0);
-               write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, ~0ull);
-               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_ENABLE, 0);
-               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_VL, 0);
-               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_JOB_KEY, 0);
-               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_PARTITION_KEY, 0);
-               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, 0);
-               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_OPCODE, 0);
-       }
-
-       /*
-        * TXE Per-SDMA CSRs
-        */
-       for (i = 0; i < dd->chip_sdma_engines; i++) {
-               write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0);
-               /* SEND_DMA_STATUS read-only */
-               write_kctxt_csr(dd, i, SEND_DMA_BASE_ADDR, 0);
-               write_kctxt_csr(dd, i, SEND_DMA_LEN_GEN, 0);
-               write_kctxt_csr(dd, i, SEND_DMA_TAIL, 0);
-               /* SEND_DMA_HEAD read-only */
-               write_kctxt_csr(dd, i, SEND_DMA_HEAD_ADDR, 0);
-               write_kctxt_csr(dd, i, SEND_DMA_PRIORITY_THLD, 0);
-               /* SEND_DMA_IDLE_CNT read-only */
-               write_kctxt_csr(dd, i, SEND_DMA_RELOAD_CNT, 0);
-               write_kctxt_csr(dd, i, SEND_DMA_DESC_CNT, 0);
-               /* SEND_DMA_DESC_FETCHED_CNT read-only */
-               /* SEND_DMA_ENG_ERR_STATUS read-only */
-               write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_MASK, 0);
-               write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, ~0ull);
-               /* SEND_DMA_ENG_ERR_FORCE leave alone */
-               write_kctxt_csr(dd, i, SEND_DMA_CHECK_ENABLE, 0);
-               write_kctxt_csr(dd, i, SEND_DMA_CHECK_VL, 0);
-               write_kctxt_csr(dd, i, SEND_DMA_CHECK_JOB_KEY, 0);
-               write_kctxt_csr(dd, i, SEND_DMA_CHECK_PARTITION_KEY, 0);
-               write_kctxt_csr(dd, i, SEND_DMA_CHECK_SLID, 0);
-               write_kctxt_csr(dd, i, SEND_DMA_CHECK_OPCODE, 0);
-               write_kctxt_csr(dd, i, SEND_DMA_MEMORY, 0);
-       }
-}
-
-/*
- * Start the receive buffer init (RcvCtrl.RxRbufInit) and wait for
- * RcvStatus.RxRbufInitDone.  Both waits are bounded; on a timeout an
- * error is logged and the routine carries on.
- *
- * Expect on entry:
- * o Packet ingress is disabled, i.e. RcvCtrl.RcvPortEnable == 0
- */
-static void init_rbufs(struct hfi1_devdata *dd)
-{
-       u64 reg;
-       int count;
-
-       /*
-        * Wait for DMA to stop: RxRbufPktPending and RxPktInProgress are
-        * clear.
-        */
-       count = 0;
-       while (1) {
-               reg = read_csr(dd, RCV_STATUS);
-               if ((reg & (RCV_STATUS_RX_RBUF_PKT_PENDING_SMASK
-                           | RCV_STATUS_RX_PKT_IN_PROGRESS_SMASK)) == 0)
-                       break;
-               /*
-                * Give up after 1ms - maximum wait time.
-                *
-                * RBuf size is 148KiB.  Slowest possible is PCIe Gen1 x1 at
-                * 250MB/s bandwidth.  Lower rate to 66% for overhead to get:
-                *      148 KB / (66% * 250MB/s) = 920us
-                */
-               if (count++ > 500) {
-                       dd_dev_err(dd,
-                                  "%s: in-progress DMA not clearing: RcvStatus 0x%llx, continuing\n",
-                                  __func__, reg);
-                       break;
-               }
-               udelay(2); /* do not busy-wait the CSR */
-       }
-
-       /* start the init - expect RcvCtrl to be 0 */
-       write_csr(dd, RCV_CTRL, RCV_CTRL_RX_RBUF_INIT_SMASK);
-
-       /*
-        * Read to force the write of RcvCtrl.RxRbufInit.  There is a brief
-        * period after the write before RcvStatus.RxRbufInitDone is valid.
-        * The delay in the first run through the loop below is sufficient and
-        * required before the first read of RcvStatus.RxRbufInitDone.
-        */
-       read_csr(dd, RCV_CTRL);
-
-       /* wait for the init to finish */
-       count = 0;
-       while (1) {
-               /* delay is required first time through - see above */
-               udelay(2); /* do not busy-wait the CSR */
-               reg = read_csr(dd, RCV_STATUS);
-               if (reg & (RCV_STATUS_RX_RBUF_INIT_DONE_SMASK))
-                       break;
-
-               /* give up after 100us - slowest possible at 33MHz is 73us */
-               if (count++ > 50) {
-                       dd_dev_err(dd,
-                                  "%s: RcvStatus.RxRbufInit not set, continuing\n",
-                                  __func__);
-                       break;
-               }
-       }
-}
-
-/*
- * Set RXE CSRs to chip reset defaults.
- *
- * RcvCtrl is zeroed first and the receive buffers are re-initialized,
- * then the RXE kernel CSRs and the kernel and user per-context CSRs of
- * every chip receive context are written.  Read-only CSRs and CSRs that
- * are left alone are listed in comments.
- */
-static void reset_rxe_csrs(struct hfi1_devdata *dd)
-{
-       int i, j;
-
-       /*
-        * RXE Kernel CSRs
-        */
-       write_csr(dd, RCV_CTRL, 0);
-       /* init_rbufs() expects RcvCtrl to be 0, so it follows the write */
-       init_rbufs(dd);
-       /* RCV_STATUS read-only */
-       /* RCV_CONTEXTS read-only */
-       /* RCV_ARRAY_CNT read-only */
-       /* RCV_BUF_SIZE read-only */
-       write_csr(dd, RCV_BTH_QP, 0);
-       write_csr(dd, RCV_MULTICAST, 0);
-       write_csr(dd, RCV_BYPASS, 0);
-       write_csr(dd, RCV_VL15, 0);
-       /* this is a clear-down */
-       write_csr(dd, RCV_ERR_INFO,
-                 RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK);
-       /* RCV_ERR_STATUS read-only */
-       write_csr(dd, RCV_ERR_MASK, 0);
-       write_csr(dd, RCV_ERR_CLEAR, ~0ull);
-       /* RCV_ERR_FORCE leave alone */
-       /* table sizes below (32, 4, 32) are hard-coded CSR counts */
-       for (i = 0; i < 32; i++)
-               write_csr(dd, RCV_QP_MAP_TABLE + (8 * i), 0);
-       for (i = 0; i < 4; i++)
-               write_csr(dd, RCV_PARTITION_KEY + (8 * i), 0);
-       for (i = 0; i < RXE_NUM_32_BIT_COUNTERS; i++)
-               write_csr(dd, RCV_COUNTER_ARRAY32 + (8 * i), 0);
-       for (i = 0; i < RXE_NUM_64_BIT_COUNTERS; i++)
-               write_csr(dd, RCV_COUNTER_ARRAY64 + (8 * i), 0);
-       for (i = 0; i < RXE_NUM_RSM_INSTANCES; i++) {
-               write_csr(dd, RCV_RSM_CFG + (8 * i), 0);
-               write_csr(dd, RCV_RSM_SELECT + (8 * i), 0);
-               write_csr(dd, RCV_RSM_MATCH + (8 * i), 0);
-       }
-       for (i = 0; i < 32; i++)
-               write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), 0);
-
-       /*
-        * RXE Kernel and User Per-Context CSRs
-        */
-       for (i = 0; i < dd->chip_rcv_contexts; i++) {
-               /* kernel */
-               write_kctxt_csr(dd, i, RCV_CTXT_CTRL, 0);
-               /* RCV_CTXT_STATUS read-only */
-               write_kctxt_csr(dd, i, RCV_EGR_CTRL, 0);
-               write_kctxt_csr(dd, i, RCV_TID_CTRL, 0);
-               write_kctxt_csr(dd, i, RCV_KEY_CTRL, 0);
-               write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0);
-               write_kctxt_csr(dd, i, RCV_HDR_CNT, 0);
-               write_kctxt_csr(dd, i, RCV_HDR_ENT_SIZE, 0);
-               write_kctxt_csr(dd, i, RCV_HDR_SIZE, 0);
-               write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0);
-               write_kctxt_csr(dd, i, RCV_AVAIL_TIME_OUT, 0);
-               write_kctxt_csr(dd, i, RCV_HDR_OVFL_CNT, 0);
-
-               /* user */
-               /* RCV_HDR_TAIL read-only */
-               write_uctxt_csr(dd, i, RCV_HDR_HEAD, 0);
-               /* RCV_EGR_INDEX_TAIL read-only */
-               write_uctxt_csr(dd, i, RCV_EGR_INDEX_HEAD, 0);
-               /* RCV_EGR_OFFSET_TAIL read-only */
-               for (j = 0; j < RXE_NUM_TID_FLOWS; j++) {
-                       write_uctxt_csr(dd, i,
-                                       RCV_TID_FLOW_TABLE + (8 * j), 0);
-               }
-       }
-}
-
-/*
- * Set sc2vl tables.
- *
- * They power on to zeros, so to avoid send context errors
- * they need to be set:
- *
- * SC 0-7 -> VL 0-7 (respectively)
- * SC 15  -> VL 15
- * otherwise
- *        -> VL 0
- */
-static void init_sc2vl_tables(struct hfi1_devdata *dd)
-{
-       int i;
-       /* init per architecture spec, constrained by hardware capability */
-
-       /* HFI maps sent packets */
-       write_csr(dd, SEND_SC2VLT0, SC2VL_VAL(
-               0,
-               0, 0, 1, 1,
-               2, 2, 3, 3,
-               4, 4, 5, 5,
-               6, 6, 7, 7));
-       write_csr(dd, SEND_SC2VLT1, SC2VL_VAL(
-               1,
-               8, 0, 9, 0,
-               10, 0, 11, 0,
-               12, 0, 13, 0,
-               14, 0, 15, 15));
-       write_csr(dd, SEND_SC2VLT2, SC2VL_VAL(
-               2,
-               16, 0, 17, 0,
-               18, 0, 19, 0,
-               20, 0, 21, 0,
-               22, 0, 23, 0));
-       write_csr(dd, SEND_SC2VLT3, SC2VL_VAL(
-               3,
-               24, 0, 25, 0,
-               26, 0, 27, 0,
-               28, 0, 29, 0,
-               30, 0, 31, 0));
-
-       /* DC maps received packets */
-       write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0, DC_SC_VL_VAL(
-               15_0,
-               0, 0, 1, 1,  2, 2,  3, 3,  4, 4,  5, 5,  6, 6,  7,  7,
-               8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 15));
-       write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16, DC_SC_VL_VAL(
-               31_16,
-               16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, 0,
-               24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31, 0));
-
-       /* initialize the cached sc2vl values consistently with h/w */
-       for (i = 0; i < 32; i++) {
-               if (i < 8 || i == 15)
-                       *((u8 *)(dd->sc2vl) + i) = (u8)i;
-               else
-                       *((u8 *)(dd->sc2vl) + i) = 0;
-       }
-}
-
-/*
- * Read chip sizes and then reset parts to sane, disabled, values.  We cannot
- * depend on the chip going through a power-on reset - a driver may be loaded
- * and unloaded many times.
- *
- * Do not write any CSR values to the chip in this routine - there may be
- * a reset following the (possible) FLR in this routine.
- *
- * Sequence: quiesce send, receive and interrupts; put the DC in reset;
- * reset the CSRs either with an FLR (use_flr) or with explicit writes;
- * release the DC reset; turn the LED off; release the QSFP reset pins;
- * finally call init_chip_resources().
- */
-static void init_chip(struct hfi1_devdata *dd)
-{
-       int i;
-
-       /*
-        * Put the HFI CSRs in a known state.
-        * Combine this with a DC reset.
-        *
-        * Stop the device from doing anything while we do a
-        * reset.  We know there are no other active users of
-        * the device since we are now in charge.  Turn off
-        * all outbound and inbound traffic and make sure
-        * the device does not generate any interrupts.
-        */
-
-       /* disable send contexts and SDMA engines */
-       write_csr(dd, SEND_CTRL, 0);
-       for (i = 0; i < dd->chip_send_contexts; i++)
-               write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0);
-       for (i = 0; i < dd->chip_sdma_engines; i++)
-               write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0);
-       /* disable port (turn off RXE inbound traffic) and contexts */
-       write_csr(dd, RCV_CTRL, 0);
-       /*
-        * RcvCtxtCtrl is a per-context CSR: address it through the context
-        * index so that every receive context is disabled, not just one.
-        */
-       for (i = 0; i < dd->chip_rcv_contexts; i++)
-               write_kctxt_csr(dd, i, RCV_CTXT_CTRL, 0);
-       /* mask all interrupt sources */
-       for (i = 0; i < CCE_NUM_INT_CSRS; i++)
-               write_csr(dd, CCE_INT_MASK + (8 * i), 0ull);
-
-       /*
-        * DC Reset: do a full DC reset before the register clear.
-        * A recommended length of time to hold is one CSR read,
-        * so reread the CceDcCtrl.  Then, hold the DC in reset
-        * across the clear.
-        */
-       write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK);
-       (void)read_csr(dd, CCE_DC_CTRL);
-
-       if (use_flr) {
-               /*
-                * A FLR will reset the SPC core and part of the PCIe.
-                * The parts that need to be restored have already been
-                * saved.
-                */
-               dd_dev_info(dd, "Resetting CSRs with FLR\n");
-
-               /* do the FLR, the DC reset will remain */
-               hfi1_pcie_flr(dd);
-
-               /* restore command and BARs */
-               restore_pci_variables(dd);
-
-               /* on Ax hardware the FLR and restore are done a second time */
-               if (is_ax(dd)) {
-                       dd_dev_info(dd, "Resetting CSRs with FLR\n");
-                       hfi1_pcie_flr(dd);
-                       restore_pci_variables(dd);
-               }
-       } else {
-               dd_dev_info(dd, "Resetting CSRs with writes\n");
-               reset_cce_csrs(dd);
-               reset_txe_csrs(dd);
-               reset_rxe_csrs(dd);
-               reset_misc_csrs(dd);
-       }
-       /* clear the DC reset */
-       write_csr(dd, CCE_DC_CTRL, 0);
-
-       /* Set the LED off */
-       setextled(dd, 0);
-
-       /*
-        * Clear the QSFP reset.
-        * An FLR enforces a 0 on all out pins. The driver does not touch
-        * ASIC_QSFPn_OUT otherwise.  This leaves RESET_N low and
-        * anything plugged constantly in reset, if it pays attention
-        * to RESET_N.
-        * Prime examples of this are optical cables. Set all pins high.
-        * I2CCLK and I2CDAT will change per direction, and INT_N and
-        * MODPRS_N are input only and their value is ignored.
-        */
-       write_csr(dd, ASIC_QSFP1_OUT, 0x1f);
-       write_csr(dd, ASIC_QSFP2_OUT, 0x1f);
-       init_chip_resources(dd);
-}
-
-/*
- * Early per-device setup: assign the link credit variables, write the
- * CSRs and memories that are uninitialized after reset, program the
- * partition keys of every port when the PKEY_CHECK capability is set,
- * and initialize the SC to VL tables.
- */
-static void init_early_variables(struct hfi1_devdata *dd)
-{
-       int port;
-
-       /* link credit variables */
-       dd->vau = CM_VAU;
-       dd->link_credits = CM_GLOBAL_CREDITS;
-       if (is_ax(dd)) {
-               /* Ax hardware gets one credit less */
-               dd->link_credits--;
-       }
-       dd->vcu = cu_to_vcu(hfi1_cu);
-
-       /* enough room for 8 MAD packets plus header - 17K */
-       dd->vl15_init = (8 * (2048 + 128)) / vau_to_au(dd->vau);
-       /* but never more than the link credits available */
-       if (dd->vl15_init > dd->link_credits) {
-               dd->vl15_init = dd->link_credits;
-       }
-
-       write_uninitialized_csrs_and_memories(dd);
-
-       if (HFI1_CAP_IS_KSET(PKEY_CHECK)) {
-               for (port = 0; port < dd->num_pports; port++) {
-                       set_partition_keys(&dd->pport[port]);
-               }
-       }
-
-       init_sc2vl_tables(dd);
-}
-
-/*
- * Validate the kdeth_qp setting and program it into the send and receive
- * BTH QP CSRs.
- *
- * A value of 0xff or above is rejected with an error message.  A value
- * of 0, whether never set or just rejected, is replaced by
- * DEFAULT_KDETH_QP.  kdeth_qp itself is updated with the value in use.
- */
-static void init_kdeth_qp(struct hfi1_devdata *dd)
-{
-       /*
-        * user changed the KDETH_QP: any value >= 0xff is out of range
-        * (such a value is necessarily non-zero, so no separate zero test)
-        */
-       if (kdeth_qp >= 0xff) {
-               /* out of range or illegal value */
-               dd_dev_err(dd, "Invalid KDETH queue pair prefix, ignoring\n");
-               kdeth_qp = 0;
-       }
-       if (kdeth_qp == 0)      /* not set, or failed range check */
-               kdeth_qp = DEFAULT_KDETH_QP;
-
-       write_csr(dd, SEND_BTH_QP,
-                 (kdeth_qp & SEND_BTH_QP_KDETH_QP_MASK) <<
-                 SEND_BTH_QP_KDETH_QP_SHIFT);
-
-       write_csr(dd, RCV_BTH_QP,
-                 (kdeth_qp & RCV_BTH_QP_KDETH_QP_MASK) <<
-                 RCV_BTH_QP_KDETH_QP_SHIFT);
-}
-
-/**
- * init_qpmap_table
- * @dd - device data
- * @first_ctxt - first context
- * @last_ctxt - last context
- *
- * This routine sets the qpn mapping table that
- * is indexed by qpn[8:1].
- *
- * The 256 table entries are filled round robin with the
- * contexts first_ctxt through last_ctxt.  Eight one-byte
- * entries are packed into each 64-bit CSR, so 32 CSRs are
- * written.  Afterwards the QP map and bypass enables are
- * added to RcvCtrl.
- *
- * The first/last looks ahead to having specialized
- * receive contexts for mgmt and bypass.  Normal
- * verbs traffic will assumed to be on a range
- * of receive contexts.
- */
-static void init_qpmap_table(struct hfi1_devdata *dd,
-                            u32 first_ctxt,
-                            u32 last_ctxt)
-{
-       u64 csr_addr = RCV_QP_MAP_TABLE;
-       u64 next_ctxt = first_ctxt;
-       u64 packed;
-       int csr, slot;
-
-       for (csr = 0; csr < 256 / 8; csr++) {
-               packed = 0;
-               for (slot = 0; slot < 8; slot++) {
-                       packed |= next_ctxt << (8 * slot);
-                       /* advance, wrapping back to the first context */
-                       next_ctxt++;
-                       if (next_ctxt > last_ctxt)
-                               next_ctxt = first_ctxt;
-               }
-               write_csr(dd, csr_addr, packed);
-               csr_addr += 8;
-       }
-
-       add_rcvctrl(dd, RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK
-                       | RCV_CTRL_RCV_BYPASS_ENABLE_SMASK);
-}
-
-/*
- * In-memory image of the RSM map table (RMT); it is built up first and
- * then written to the RCV_RSM_MAP_TABLE CSRs by complete_rsm_map_table().
- */
-struct rsm_map_table {
-       /* one u64 per RCV_RSM_MAP_TABLE CSR */
-       u64 map[NUM_MAP_REGS];
-       /* starts at 0; presumably the number of entries consumed - confirm */
-       unsigned int used;
-};
-
-/*
- * Parameters of one receive side mapping (RSM) rule.  add_rsm_rule()
- * shifts each field into place in the RcvRsmCfg, RcvRsmSelect and
- * RcvRsmMatch CSRs of the rule.
- */
-struct rsm_rule_data {
-       /* RcvRsmCfg fields */
-       u8 offset;
-       u8 pkt_type;
-       /* RcvRsmSelect fields */
-       u32 field1_off;
-       u32 field2_off;
-       u32 index1_off;
-       u32 index1_width;
-       u32 index2_off;
-       u32 index2_width;
-       /* RcvRsmMatch fields */
-       u32 mask1;
-       u32 value1;
-       u32 mask2;
-       u32 value2;
-};
-
-/*
- * Allocate an RMT map table for users to fill in and return it.
- *
- * Every byte of the map is preset to 0 on Ax hardware and to 0xff
- * otherwise, and the used count starts at 0.  Returns NULL when the
- * allocation fails; that is OK and simply indicates no table.
- */
-static struct rsm_map_table *alloc_rsm_map_table(struct hfi1_devdata *dd)
-{
-       /* 0 is default if a0 ver. */
-       u8 fill = is_ax(dd) ? 0 : 0xff;
-       struct rsm_map_table *table = kmalloc(sizeof(*table), GFP_KERNEL);
-
-       if (!table)
-               return NULL;
-
-       memset(table->map, fill, sizeof(table->map));
-       table->used = 0;
-       return table;
-}
-
-/*
- * Write the final RMT map table to the chip and enable RSM in RcvCtrl.
- * OK if table is NULL; nothing is written or enabled in that case.
- *
- * The table is not freed here.
- */
-static void complete_rsm_map_table(struct hfi1_devdata *dd,
-                                  struct rsm_map_table *rmt)
-{
-       int reg;
-
-       if (!rmt)
-               return;
-
-       /* copy every map register to the chip */
-       for (reg = 0; reg < NUM_MAP_REGS; reg++) {
-               write_csr(dd, RCV_RSM_MAP_TABLE + (8 * reg), rmt->map[reg]);
-       }
-
-       /* enable RSM */
-       add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK);
-}
-
-/*
- * Add a receive side mapping rule: program rule number rule_index from
- * *rrd by writing that rule's RCV_RSM_CFG, RCV_RSM_SELECT and
- * RCV_RSM_MATCH CSRs, in that order.
- */
-static void add_rsm_rule(struct hfi1_devdata *dd, u8 rule_index,
-                        struct rsm_rule_data *rrd)
-{
-       const int csr_off = 8 * rule_index;
-       u64 cfg, select, match;
-
-       cfg = (u64)rrd->offset << RCV_RSM_CFG_OFFSET_SHIFT;
-       cfg |= 1ull << rule_index; /* enable bit */
-       cfg |= (u64)rrd->pkt_type << RCV_RSM_CFG_PACKET_TYPE_SHIFT;
-       write_csr(dd, RCV_RSM_CFG + csr_off, cfg);
-
-       select = (u64)rrd->field1_off << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT;
-       select |= (u64)rrd->field2_off << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT;
-       select |= (u64)rrd->index1_off << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT;
-       select |= (u64)rrd->index1_width << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT;
-       select |= (u64)rrd->index2_off << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT;
-       select |= (u64)rrd->index2_width << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT;
-       write_csr(dd, RCV_RSM_SELECT + csr_off, select);
-
-       match = (u64)rrd->mask1 << RCV_RSM_MATCH_MASK1_SHIFT;
-       match |= (u64)rrd->value1 << RCV_RSM_MATCH_VALUE1_SHIFT;
-       match |= (u64)rrd->mask2 << RCV_RSM_MATCH_MASK2_SHIFT;
-       match |= (u64)rrd->value2 << RCV_RSM_MATCH_VALUE2_SHIFT;
-       write_csr(dd, RCV_RSM_MATCH + csr_off, match);
-}
-
-/*
- * Return the number of RSM map table entries that will be used for QOS,
- * or 0 when QOS is inactive or its index would need more than 7 bits.
- *
- * If non-NULL, *mp receives the number of qpn bits and *np the number of
- * vl bits; both are set to 0 whenever 0 is returned.
- */
-static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp,
-                          unsigned int *np)
-{
-       unsigned int qpn_bits = 0, vl_bits = 0;
-       unsigned int nvls, vl;
-       int entries = 0;
-       u8 widest = 0;
-
-       /* is QOS active at all? */
-       if (dd->n_krcv_queues <= MIN_KERNEL_KCTXTS ||
-           num_vls == 1 ||
-           krcvqsset <= 1)
-               goto out;
-
-       /* determine bits for qpn: sized by the largest per-VL queue count */
-       nvls = min_t(unsigned int, num_vls, krcvqsset);
-       for (vl = 0; vl < nvls; vl++) {
-               if (krcvqs[vl] > widest)
-                       widest = krcvqs[vl];
-       }
-       if (widest > 32)
-               goto out;
-       qpn_bits = ilog2(__roundup_pow_of_two(widest));
-
-       /* determine bits for vl */
-       vl_bits = ilog2(__roundup_pow_of_two(num_vls));
-
-       /* reject if too much is used */
-       if (qpn_bits + vl_bits > 7) {
-               qpn_bits = 0;
-               vl_bits = 0;
-               goto out;
-       }
-
-       entries = 1 << (qpn_bits + vl_bits);
-
-out:
-       if (mp)
-               *mp = qpn_bits;
-       if (np)
-               *np = vl_bits;
-       return entries;
-}
-
-/**
- * init_qos - init RX qos
- * @dd: device data
- * @rmt: RSM map table, may be NULL
- *
- * This routine initializes Rule 0 and the RSM map table to implement
- * quality of service (qos).
- *
- * If all of the limit tests succeed, qos is applied based on the array
- * interpretation of krcvqs where entry 0 is VL0.
- *
- * The number of vl bits (n) and the number of qpn bits (m) are computed to
- * feed both the RSM map table and the single rule.
- *
- * If there is no table, qos is not active, or the map table lacks room,
- * no rule is added; instead the QP map table is spread over the kernel
- * receive contexts and dd->qos_shift is set to 1.
- */
-static void init_qos(struct hfi1_devdata *dd, struct rsm_map_table *rmt)
-{
-       struct rsm_rule_data rrd;
-       unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m;
-       unsigned int rmt_entries;
-       u64 reg;
-
-       if (!rmt)
-               goto bail;
-       rmt_entries = qos_rmt_entries(dd, &m, &n);
-       if (rmt_entries == 0)
-               goto bail;
-       qpns_per_vl = 1 << m;
-
-       /* enough room in the map table? */
-       rmt_entries = 1 << (m + n);
-       if (rmt->used + rmt_entries >= NUM_MAP_ENTRIES)
-               goto bail;
-
-       /* add qos entries to the RSM map table */
-       for (i = 0, ctxt = FIRST_KERNEL_KCTXT; i < num_vls; i++) {
-               unsigned tctxt;
-
-               /*
-                * VL i owns contexts ctxt..ctxt + krcvqs[i] - 1; hand them
-                * out round-robin over the qpn values.  A VL with no
-                * queues (krcvqs[i] == 0) keeps the default entries.
-                */
-               for (qpn = 0, tctxt = ctxt;
-                    krcvqs[i] && qpn < qpns_per_vl; qpn++) {
-                       unsigned idx, regoff, regidx;
-
-                       /* generate the index the hardware will produce */
-                       idx = rmt->used + ((qpn << n) ^ i);
-                       /* eight byte-wide entries per 64-bit register */
-                       regoff = (idx % 8) * 8;
-                       regidx = idx / 8;
-                       /* replace default with context number */
-                       reg = rmt->map[regidx];
-                       reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK
-                               << regoff);
-                       reg |= (u64)(tctxt++) << regoff;
-                       rmt->map[regidx] = reg;
-                       if (tctxt == ctxt + krcvqs[i])
-                               tctxt = ctxt;
-               }
-               ctxt += krcvqs[i];
-       }
-
-       rrd.offset = rmt->used;
-       rrd.pkt_type = 2;
-       rrd.field1_off = LRH_BTH_MATCH_OFFSET;
-       rrd.field2_off = LRH_SC_MATCH_OFFSET;
-       rrd.index1_off = LRH_SC_SELECT_OFFSET;
-       rrd.index1_width = n;
-       rrd.index2_off = QPN_SELECT_OFFSET;
-       rrd.index2_width = m + n;
-       rrd.mask1 = LRH_BTH_MASK;
-       rrd.value1 = LRH_BTH_VALUE;
-       rrd.mask2 = LRH_SC_MASK;
-       rrd.value2 = LRH_SC_VALUE;
-
-       /* add rule 0 */
-       add_rsm_rule(dd, 0, &rrd);
-
-       /* mark RSM map entries as used */
-       rmt->used += rmt_entries;
-       /* map everything else to the mcast/err/vl15 context */
-       init_qpmap_table(dd, HFI1_CTRL_CTXT, HFI1_CTRL_CTXT);
-       dd->qos_shift = n + 1;
-       return;
-bail:
-       /* no qos: spread the QP map table over all kernel receive contexts */
-       dd->qos_shift = 1;
-       init_qpmap_table(dd, FIRST_KERNEL_KCTXT, dd->n_krcv_queues - 1);
-}
-
-/*
- * Set up RSM rule 1 and the matching map table entries so that Expected
- * FECN packets are intercepted and mapped back to their destination
- * user receive context.
- *
- * @dd: device data
- * @rmt: RSM map table; may be NULL when the table could not be allocated,
- *       in which case nothing is set up.
- *
- * Nothing is set up either when the map table has no room for
- * dd->num_user_contexts more entries.
- */
-static void init_user_fecn_handling(struct hfi1_devdata *dd,
-                                   struct rsm_map_table *rmt)
-{
-       struct rsm_rule_data rrd;
-       u64 reg;
-       int i, idx, regoff, regidx;
-       u8 offset;
-
-       /* alloc_rsm_map_table() may have returned NULL - no table, no rule */
-       if (!rmt)
-               return;
-
-       /* there needs to be enough room in the map table */
-       if (rmt->used + dd->num_user_contexts >= NUM_MAP_ENTRIES) {
-               dd_dev_err(dd, "User FECN handling disabled - too many user contexts allocated\n");
-               return;
-       }
-
-       /*
-        * RSM will extract the destination context as an index into the
-        * map table.  The destination contexts are a sequential block
-        * in the range first_user_ctxt...num_rcv_contexts-1 (inclusive).
-        * Map entries are accessed as offset + extracted value.  Adjust
-        * the added offset so this sequence can be placed anywhere in
-        * the table - as long as the entries themselves do not wrap.
-        * There are only enough bits in offset for the table size, so
-        * start with that to allow for a "negative" offset.
-        */
-       offset = (u8)(NUM_MAP_ENTRIES + (int)rmt->used -
-                                               (int)dd->first_user_ctxt);
-
-       for (i = dd->first_user_ctxt, idx = rmt->used;
-                               i < dd->num_rcv_contexts; i++, idx++) {
-               /* replace with identity mapping */
-               regoff = (idx % 8) * 8;
-               regidx = idx / 8;
-               reg = rmt->map[regidx];
-               reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK << regoff);
-               reg |= (u64)i << regoff;
-               rmt->map[regidx] = reg;
-       }
-
-       /*
-        * For RSM intercept of Expected FECN packets:
-        * o packet type 0 - expected
-        * o match on F (bit 95), using select/match 1, and
-        * o match on SH (bit 133), using select/match 2.
-        *
-        * Use index 1 to extract the 8-bit receive context from DestQP
-        * (start at bit 64).  Use that as the RSM map table index.
-        */
-       rrd.offset = offset;
-       rrd.pkt_type = 0;
-       rrd.field1_off = 95;
-       rrd.field2_off = 133;
-       rrd.index1_off = 64;
-       rrd.index1_width = 8;
-       rrd.index2_off = 0;
-       rrd.index2_width = 0;
-       rrd.mask1 = 1;
-       rrd.value1 = 1;
-       rrd.mask2 = 1;
-       rrd.value2 = 1;
-
-       /* add rule 1 */
-       add_rsm_rule(dd, 1, &rrd);
-
-       /* mark the user context entries as used */
-       rmt->used += dd->num_user_contexts;
-}
-
-/*
- * Set the initial RXE state: unmask all receive errors, then build the
- * RSM map table (QOS followed by user FECN handling), write it to the
- * chip and release the temporary table.
- */
-static void init_rxe(struct hfi1_devdata *dd)
-{
-       struct rsm_map_table *map_table;
-
-       /* enable all receive errors */
-       write_csr(dd, RCV_ERR_MASK, ~0ull);
-
-       map_table = alloc_rsm_map_table(dd);
-
-       /* set up QOS, including the QPN map table */
-       init_qos(dd, map_table);
-       init_user_fecn_handling(dd, map_table);
-
-       /* push the finished table to the chip, then drop the scratch copy */
-       complete_rsm_map_table(dd, map_table);
-       kfree(map_table);
-
-       /*
-        * make sure RcvCtrl.RcvWcb <= PCIe Device Control
-        * Register Max_Payload_Size (PCI_EXP_DEVCTL in Linux PCIe config
-        * space, PciCfgCap2.MaxPayloadSize in HFI).  There is only one
-        * invalid configuration: RcvCtrl.RcvWcb set to its max of 256 and
-        * Max_PayLoad_Size set to its minimum of 128.
-        *
-        * Presently, RcvCtrl.RcvWcb is not modified from its default of 0
-        * (64 bytes).  Max_Payload_Size is possibly modified upward in
-        * tune_pcie_caps() which is called after this routine.
-        */
-}
-
-/*
- * Set the initial error-enable CSRs outside RXE and TXE: the CCE, Misc,
- * DCC and DC8051 error masks/enables.
- */
-static void init_other(struct hfi1_devdata *dd)
-{
-       /* enable all CCE errors */
-       write_csr(dd, CCE_ERR_MASK, ~0ull);
-       /* enable *some* Misc errors */
-       write_csr(dd, MISC_ERR_MASK, DRIVER_MISC_MASK);
-       /* enable all DC errors, except LCB */
-       write_csr(dd, DCC_ERR_FLG_EN, ~0ull);
-       write_csr(dd, DC_DC8051_ERR_EN, ~0ull);
-}
-
-/*
- * Fill out the given AU table using the given CU.  A CU is defined in
- * terms of AUs.  The table is an encoding: given the index, how many AUs
- * does that represent?  Index 0 and 1 encode 0 and 1 AUs; indices 2..7
- * encode 2, 4, 8, 16, 32 and 64 times cu.
- *
- * NOTE: Assumes that the register layout is the same for the
- * local and remote tables.
- */
-static void assign_cm_au_table(struct hfi1_devdata *dd, u32 cu,
-                              u32 csr0to3, u32 csr4to7)
-{
-       u64 low, high;
-
-       /* table entries 0 through 3 */
-       low = 0ull << SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT;
-       low |= 1ull << SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT;
-       low |= 2ull * cu <<
-               SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT;
-       low |= 4ull * cu <<
-               SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT;
-       write_csr(dd, csr0to3, low);
-
-       /* table entries 4 through 7 */
-       high = 8ull * cu <<
-               SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT;
-       high |= 16ull * cu <<
-               SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT;
-       high |= 32ull * cu <<
-               SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT;
-       high |= 64ull * cu <<
-               SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT;
-       write_csr(dd, csr4to7, high);
-}
-
-/* Program the local AU table CSRs from the given vCU. */
-static void assign_local_cm_au_table(struct hfi1_devdata *dd, u8 vcu)
-{
-       u32 cu = vcu_to_cu(vcu);
-
-       assign_cm_au_table(dd, cu, SEND_CM_LOCAL_AU_TABLE0_TO3,
-                          SEND_CM_LOCAL_AU_TABLE4_TO7);
-}
-
-/* Program the remote AU table CSRs from the given vCU. */
-void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu)
-{
-       u32 cu = vcu_to_cu(vcu);
-
-       assign_cm_au_table(dd, cu, SEND_CM_REMOTE_AU_TABLE0_TO3,
-                          SEND_CM_REMOTE_AU_TABLE4_TO7);
-}
-
-/*
- * Set the initial TXE state: unmask the send-side error sources, program
- * the local CU to AU table and set the credit return timer.
- */
-static void init_txe(struct hfi1_devdata *dd)
-{
-       int ctxt, engine;
-
-       /* enable all PIO, SDMA, general, and Egress errors */
-       write_csr(dd, SEND_PIO_ERR_MASK, ~0ull);
-       write_csr(dd, SEND_DMA_ERR_MASK, ~0ull);
-       write_csr(dd, SEND_ERR_MASK, ~0ull);
-       write_csr(dd, SEND_EGRESS_ERR_MASK, ~0ull);
-
-       /* enable all per-context errors ... */
-       for (ctxt = 0; ctxt < dd->chip_send_contexts; ctxt++)
-               write_kctxt_csr(dd, ctxt, SEND_CTXT_ERR_MASK, ~0ull);
-       /* ... and all per-SDMA engine errors */
-       for (engine = 0; engine < dd->chip_sdma_engines; engine++)
-               write_kctxt_csr(dd, engine, SEND_DMA_ENG_ERR_MASK, ~0ull);
-
-       /* set the local CU to AU mapping */
-       assign_local_cm_au_table(dd, dd->vcu);
-
-       /* Don't set the timer on Simulator - causes it to choke. */
-       if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
-               return;
-
-       /* Set reasonable default for Credit Return Timer */
-       write_csr(dd, SEND_CM_TIMER_CTRL, HFI1_CREDIT_RETURN_RATE);
-}
-
-/*
- * Program the job key (J_KEY) for receive context ctxt and for the send
- * context attached to it, and enable the key checks.
- *
- * Return 0 on success, -EINVAL if ctxt is out of range or the receive
- * context or its send context does not exist.
- */
-int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt, u16 jkey)
-{
-       struct hfi1_ctxtdata *rcd;
-       unsigned sctxt;
-       int ret = 0;
-       u64 reg;
-
-       /* validate ctxt before it is used to index dd->rcd[] */
-       if (ctxt < dd->num_rcv_contexts) {
-               rcd = dd->rcd[ctxt];
-       } else {
-               ret = -EINVAL;
-               goto done;
-       }
-       if (!rcd || !rcd->sc) {
-               ret = -EINVAL;
-               goto done;
-       }
-       sctxt = rcd->sc->hw_context;
-       reg = SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK | /* mask is always 1's */
-               ((jkey & SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK) <<
-                SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT);
-       /* JOB_KEY_ALLOW_PERMISSIVE is not allowed by default */
-       if (HFI1_CAP_KGET_MASK(rcd->flags, ALLOW_PERM_JKEY))
-               reg |= SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK;
-       write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_JOB_KEY, reg);
-       /*
-        * Enable send-side J_KEY integrity check, unless this is A0 h/w
-        */
-       if (!is_ax(dd)) {
-               reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
-               reg |= SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
-               write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
-       }
-
-       /* Enable J_KEY check on receive context. */
-       reg = RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK |
-               ((jkey & RCV_KEY_CTRL_JOB_KEY_VALUE_MASK) <<
-                RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT);
-       write_kctxt_csr(dd, ctxt, RCV_KEY_CTRL, reg);
-done:
-       return ret;
-}
-
-/*
- * Clear the job key (J_KEY) for receive context ctxt and for the send
- * context attached to it, and disable the key checks.
- *
- * Return 0 on success, -EINVAL if ctxt is out of range or the receive
- * context or its send context does not exist.
- */
-int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt)
-{
-       struct hfi1_ctxtdata *rcd;
-       unsigned sctxt;
-       int ret = 0;
-       u64 reg;
-
-       /* validate ctxt before it is used to index dd->rcd[] */
-       if (ctxt < dd->num_rcv_contexts) {
-               rcd = dd->rcd[ctxt];
-       } else {
-               ret = -EINVAL;
-               goto done;
-       }
-       if (!rcd || !rcd->sc) {
-               ret = -EINVAL;
-               goto done;
-       }
-       sctxt = rcd->sc->hw_context;
-       write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_JOB_KEY, 0);
-       /*
-        * Disable send-side J_KEY integrity check, unless this is A0 h/w.
-        * This check would not have been enabled for A0 h/w, see
-        * set_ctxt_jkey().
-        */
-       if (!is_ax(dd)) {
-               reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
-               reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
-               write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
-       }
-       /* Turn off the J_KEY on the receive side */
-       write_kctxt_csr(dd, ctxt, RCV_KEY_CTRL, 0);
-done:
-       return ret;
-}
-
-/*
- * Program the partition key for the send context attached to receive
- * context ctxt and turn on the send-side partition key check.
- *
- * Return 0 on success, -EINVAL if ctxt is out of range or the receive
- * context or its send context does not exist.
- */
-int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey)
-{
-       struct hfi1_ctxtdata *rcd;
-       unsigned send_ctxt;
-       u64 val;
-
-       if (ctxt >= dd->num_rcv_contexts)
-               return -EINVAL;
-
-       rcd = dd->rcd[ctxt];
-       if (!rcd || !rcd->sc)
-               return -EINVAL;
-
-       send_ctxt = rcd->sc->hw_context;
-
-       /* load the key value */
-       val = ((u64)pkey & SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK) <<
-               SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT;
-       write_kctxt_csr(dd, send_ctxt, SEND_CTXT_CHECK_PARTITION_KEY, val);
-
-       /* enable the key check and stop disallowing KDETH packets */
-       val = read_kctxt_csr(dd, send_ctxt, SEND_CTXT_CHECK_ENABLE);
-       val |= SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
-       val &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK;
-       write_kctxt_csr(dd, send_ctxt, SEND_CTXT_CHECK_ENABLE, val);
-
-       return 0;
-}
-
-/*
- * Turn off the send-side partition key check for the send context
- * attached to receive context ctxt and clear its partition key.
- *
- * Return 0 on success, -EINVAL if ctxt is out of range or the receive
- * context or its send context does not exist.
- */
-int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt)
-{
-       struct hfi1_ctxtdata *rcd;
-       unsigned send_ctxt;
-       u64 val;
-
-       if (ctxt >= dd->num_rcv_contexts)
-               return -EINVAL;
-
-       rcd = dd->rcd[ctxt];
-       if (!rcd || !rcd->sc)
-               return -EINVAL;
-
-       send_ctxt = rcd->sc->hw_context;
-
-       /* disable the check first, then wipe the key */
-       val = read_kctxt_csr(dd, send_ctxt, SEND_CTXT_CHECK_ENABLE);
-       val &= ~SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
-       write_kctxt_csr(dd, send_ctxt, SEND_CTXT_CHECK_ENABLE, val);
-       write_kctxt_csr(dd, send_ctxt, SEND_CTXT_CHECK_PARTITION_KEY, 0);
-
-       return 0;
-}
-
-/*
- * Start the clean up of the chip.  Our clean up happens in multiple
- * stages and this is just the first: leave ASPM handling, free the
- * counters and the receive error state, tear down interrupts and finish
- * with the chip resources.
- */
-void hfi1_start_cleanup(struct hfi1_devdata *dd)
-{
-       aspm_exit(dd);
-       free_cntrs(dd);
-       free_rcverr(dd);
-       clean_up_interrupts(dd);
-       finish_chip_resources(dd);
-}
-
-/*
- * The device's base GUID with the bit at GUID_HFI_INDEX_SHIFT cleared.
- * init_asic_data() compares this value between devices to find a peer.
- */
-#define HFI_BASE_GUID(dev) \
-       ((dev)->base_guid & ~(1ULL << GUID_HFI_INDEX_SHIFT))
-
-/*
- * Information can be shared between the two HFIs on the same ASIC
- * in the same OS.  This function finds the peer device and sets
- * up a shared structure.
- *
- * The shared structure is allocated before hfi1_devs_lock is taken: a
- * GFP_KERNEL allocation may sleep, which is not allowed while holding a
- * spinlock with interrupts disabled.  If a peer is found its structure
- * is used and the unused pre-allocation is freed after the lock is
- * dropped.
- *
- * Return 0 on success, -ENOMEM if the structure could not be allocated.
- */
-static int init_asic_data(struct hfi1_devdata *dd)
-{
-       unsigned long flags;
-       struct hfi1_devdata *tmp, *peer = NULL;
-       void *prealloc;
-
-       /* pre-allocate the shared structure in case we are the first device */
-       prealloc = kzalloc(sizeof(*dd->asic_data), GFP_KERNEL);
-       if (!prealloc)
-               return -ENOMEM;
-
-       spin_lock_irqsave(&hfi1_devs_lock, flags);
-       /* Find our peer device */
-       list_for_each_entry(tmp, &hfi1_dev_list, list) {
-               if ((HFI_BASE_GUID(dd) == HFI_BASE_GUID(tmp)) &&
-                   dd->unit != tmp->unit) {
-                       peer = tmp;
-                       break;
-               }
-       }
-
-       if (peer) {
-               /* share the structure the peer already set up */
-               dd->asic_data = peer->asic_data;
-       } else {
-               /* first device on this ASIC: take over the pre-allocation */
-               dd->asic_data = prealloc;
-               prealloc = NULL;
-               mutex_init(&dd->asic_data->asic_resource_mutex);
-       }
-       dd->asic_data->dds[dd->hfi1_id] = dd; /* self back-pointer */
-       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
-
-       /* drop the spare if the peer's structure was used; NULL is a no-op */
-       kfree(prealloc);
-       return 0;
-}
-
-/*
- * Set dd->boardname from the "description" EFI variable.  If that
- * variable cannot be read, fall back to a copy of a generic name.
- *
- * Return 0 on success, -ENOMEM if space could not be allocated.
- */
-static int obtain_boardname(struct hfi1_devdata *dd)
-{
-       /* generic board description */
-       const char fallback[] =
-               "Intel Omni-Path Host Fabric Interface Adapter 100 Series";
-       unsigned long len;
-
-       if (!read_hfi1_efi_var(dd, "description", &len,
-                              (void **)&dd->boardname))
-               return 0;
-
-       dd_dev_info(dd, "Board description not found\n");
-
-       /* use generic description */
-       dd->boardname = kstrdup(fallback, GFP_KERNEL);
-       return dd->boardname ? 0 : -ENOMEM;
-}
-
-/*
- * Check the interrupt registers to make sure that they are mapped correctly.
- * It is intended to help the user identify any mismapping by the VMM when
- * the driver is running in a VM. This function should only be called before
- * interrupts are set up properly.
- *
- * The check masks CceIntMask[0], then verifies that writing all ones to
- * CCE_INT_CLEAR leaves CCE_INT_STATUS at zero and that writing all ones to
- * CCE_INT_FORCE makes CCE_INT_STATUS read back as all ones.  The saved
- * mask is restored on every path.
- *
- * Return 0 on success, -EINVAL on failure.
- */
-static int check_int_registers(struct hfi1_devdata *dd)
-{
-       u64 reg;
-       u64 all_bits = ~(u64)0;
-       u64 mask;
-
-       /* Clear CceIntMask[0] to avoid raising any interrupts */
-       mask = read_csr(dd, CCE_INT_MASK);
-       write_csr(dd, CCE_INT_MASK, 0ull);
-       reg = read_csr(dd, CCE_INT_MASK);
-       if (reg)
-               goto err_exit;
-
-       /* Clear all interrupt status bits */
-       write_csr(dd, CCE_INT_CLEAR, all_bits);
-       reg = read_csr(dd, CCE_INT_STATUS);
-       if (reg)
-               goto err_exit;
-
-       /* Set all interrupt status bits */
-       write_csr(dd, CCE_INT_FORCE, all_bits);
-       reg = read_csr(dd, CCE_INT_STATUS);
-       if (reg != all_bits)
-               goto err_exit;
-
-       /* Clear the forced status bits, then restore the interrupt mask */
-       write_csr(dd, CCE_INT_CLEAR, all_bits);
-       write_csr(dd, CCE_INT_MASK, mask);
-
-       return 0;
-err_exit:
-       /*
-        * NOTE(review): when the CCE_INT_FORCE check fails, the status bits
-        * are not cleared before the mask is restored - confirm this is
-        * intended.
-        */
-       write_csr(dd, CCE_INT_MASK, mask);
-       dd_dev_err(dd, "Interrupt registers not properly mapped by VMM\n");
-       return -EINVAL;
-}
-
-/**
- * Allocate and initialize the device structure for the hfi.
- * @pdev: the pci_dev for hfi1_ib device
- * @ent: pci_device_id struct for this dev
- *
- * Also allocates, initializes, and returns the devdata struct for this
- * device instance
- *
- * This is global, and is called directly at init to set up the
- * chip-specific function pointers for later use.
- *
- * Return the new devdata on success.  On failure, whatever was set up is
- * unwound through the bail_* labels and an ERR_PTR() is returned.
- */
-struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
-                                 const struct pci_device_id *ent)
-{
-       struct hfi1_devdata *dd;
-       struct hfi1_pportdata *ppd;
-       u64 reg;
-       int i, ret;
-       static const char * const inames[] = { /* implementation names */
-               "RTL silicon",
-               "RTL VCS simulation",
-               "RTL FPGA emulation",
-               "Functional simulator"
-       };
-       struct pci_dev *parent = pdev->bus->self;
-
-       dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS *
-                               sizeof(struct hfi1_pportdata));
-       /* on failure dd already holds the ERR_PTR to hand back */
-       if (IS_ERR(dd))
-               goto bail;
-       ppd = dd->pport;
-       for (i = 0; i < dd->num_pports; i++, ppd++) {
-               int vl;
-               /* init common fields */
-               hfi1_init_pportdata(pdev, ppd, dd, 0, 1);
-               /* DC supports 4 link widths */
-               ppd->link_width_supported =
-                       OPA_LINK_WIDTH_1X | OPA_LINK_WIDTH_2X |
-                       OPA_LINK_WIDTH_3X | OPA_LINK_WIDTH_4X;
-               ppd->link_width_downgrade_supported =
-                       ppd->link_width_supported;
-               /* start out enabling only 4X */
-               ppd->link_width_enabled = OPA_LINK_WIDTH_4X;
-               ppd->link_width_downgrade_enabled =
-                                       ppd->link_width_downgrade_supported;
-               /* link width active is 0 when link is down */
-               /* link width downgrade active is 0 when link is down */
-
-               /* an out-of-range num_vls is replaced by the maximum */
-               if (num_vls < HFI1_MIN_VLS_SUPPORTED ||
-                   num_vls > HFI1_MAX_VLS_SUPPORTED) {
-                       hfi1_early_err(&pdev->dev,
-                                      "Invalid num_vls %u, using %u VLs\n",
-                                   num_vls, HFI1_MAX_VLS_SUPPORTED);
-                       num_vls = HFI1_MAX_VLS_SUPPORTED;
-               }
-               ppd->vls_supported = num_vls;
-               ppd->vls_operational = ppd->vls_supported;
-               ppd->actual_vls_operational = ppd->vls_supported;
-               /* Set the default MTU. */
-               for (vl = 0; vl < num_vls; vl++)
-                       dd->vld[vl].mtu = hfi1_max_mtu;
-               dd->vld[15].mtu = MAX_MAD_PACKET;
-               /*
-                * Set the initial values to reasonable default, will be set
-                * for real when link is up.
-                */
-               ppd->lstate = IB_PORT_DOWN;
-               ppd->overrun_threshold = 0x4;
-               ppd->phy_error_threshold = 0xf;
-               ppd->port_crc_mode_enabled = link_crc_mask;
-               /* initialize supported LTP CRC mode */
-               ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8;
-               /* initialize enabled LTP CRC mode */
-               ppd->port_ltp_crc_mode |= cap_to_port_ltp(link_crc_mask) << 4;
-               /* start in offline */
-               ppd->host_link_state = HLS_DN_OFFLINE;
-               init_vl_arb_caches(ppd);
-               ppd->last_pstate = 0xff; /* invalid value */
-       }
-
-       dd->link_default = HLS_DN_POLL;
-
-       /*
-        * Do remaining PCIe setup and save PCIe values in dd.
-        * Any error printing is already done by the init code.
-        * On return, we have the chip mapped.
-        */
-       ret = hfi1_pcie_ddinit(dd, pdev, ent);
-       if (ret < 0)
-               goto bail_free;
-
-       /* verify that reads actually work, save revision for reset check */
-       dd->revision = read_csr(dd, CCE_REVISION);
-       if (dd->revision == ~(u64)0) {
-               dd_dev_err(dd, "cannot read chip CSRs\n");
-               ret = -EINVAL;
-               goto bail_cleanup;
-       }
-       dd->majrev = (dd->revision >> CCE_REVISION_CHIP_REV_MAJOR_SHIFT)
-                       & CCE_REVISION_CHIP_REV_MAJOR_MASK;
-       dd->minrev = (dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT)
-                       & CCE_REVISION_CHIP_REV_MINOR_MASK;
-
-       /*
-        * Check interrupt registers mapping if the driver has no access to
-        * the upstream component. In this case, it is likely that the driver
-        * is running in a VM.
-        */
-       if (!parent) {
-               ret = check_int_registers(dd);
-               if (ret)
-                       goto bail_cleanup;
-       }
-
-       /*
-        * obtain the hardware ID - NOT related to unit, which is a
-        * software enumeration
-        */
-       reg = read_csr(dd, CCE_REVISION2);
-       dd->hfi1_id = (reg >> CCE_REVISION2_HFI_ID_SHIFT)
-                                       & CCE_REVISION2_HFI_ID_MASK;
-       /* the variable size will remove unwanted bits */
-       dd->icode = reg >> CCE_REVISION2_IMPL_CODE_SHIFT;
-       dd->irev = reg >> CCE_REVISION2_IMPL_REVISION_SHIFT;
-       dd_dev_info(dd, "Implementation: %s, revision 0x%x\n",
-                   dd->icode < ARRAY_SIZE(inames) ?
-                   inames[dd->icode] : "unknown", (int)dd->irev);
-
-       /* speeds the hardware can support */
-       dd->pport->link_speed_supported = OPA_LINK_SPEED_25G;
-       /* speeds allowed to run at */
-       dd->pport->link_speed_enabled = dd->pport->link_speed_supported;
-       /* give a reasonable active value, will be set on link up */
-       dd->pport->link_speed_active = OPA_LINK_SPEED_25G;
-
-       /* read the chip's resource counts and memory sizes */
-       dd->chip_rcv_contexts = read_csr(dd, RCV_CONTEXTS);
-       dd->chip_send_contexts = read_csr(dd, SEND_CONTEXTS);
-       dd->chip_sdma_engines = read_csr(dd, SEND_DMA_ENGINES);
-       dd->chip_pio_mem_size = read_csr(dd, SEND_PIO_MEM_SIZE);
-       dd->chip_sdma_mem_size = read_csr(dd, SEND_DMA_MEM_SIZE);
-       /* fix up link widths for emulation _p */
-       ppd = dd->pport;
-       if (dd->icode == ICODE_FPGA_EMULATION && is_emulator_p(dd)) {
-               ppd->link_width_supported =
-                       ppd->link_width_enabled =
-                       ppd->link_width_downgrade_supported =
-                       ppd->link_width_downgrade_enabled =
-                               OPA_LINK_WIDTH_1X;
-       }
-       /* ensure num_vls isn't larger than number of sdma engines */
-       if (HFI1_CAP_IS_KSET(SDMA) && num_vls > dd->chip_sdma_engines) {
-               dd_dev_err(dd, "num_vls %u too large, using %u VLs\n",
-                          num_vls, dd->chip_sdma_engines);
-               num_vls = dd->chip_sdma_engines;
-               ppd->vls_supported = dd->chip_sdma_engines;
-               ppd->vls_operational = ppd->vls_supported;
-       }
-
-       /*
-        * Convert the ns parameter to the 64 * cclocks used in the CSR.
-        * Limit the max if larger than the field holds.  If timeout is
-        * non-zero, then the calculated field will be at least 1.
-        *
-        * Must be after icode is set up - the cclock rate depends
-        * on knowing the hardware being used.
-        */
-       dd->rcv_intr_timeout_csr = ns_to_cclock(dd, rcv_intr_timeout) / 64;
-       if (dd->rcv_intr_timeout_csr >
-                       RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK)
-               dd->rcv_intr_timeout_csr =
-                       RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK;
-       else if (dd->rcv_intr_timeout_csr == 0 && rcv_intr_timeout)
-               dd->rcv_intr_timeout_csr = 1;
-
-       /* needs to be done before we look for the peer device */
-       read_guid(dd);
-
-       /* set up shared ASIC data with peer device */
-       ret = init_asic_data(dd);
-       if (ret)
-               goto bail_cleanup;
-
-       /* obtain chip sizes, reset chip CSRs */
-       init_chip(dd);
-
-       /* read in the PCIe link speed information */
-       ret = pcie_speeds(dd);
-       if (ret)
-               goto bail_cleanup;
-
-       /* Needs to be called before hfi1_firmware_init */
-       get_platform_config(dd);
-
-       /* read in firmware */
-       ret = hfi1_firmware_init(dd);
-       if (ret)
-               goto bail_cleanup;
-
-       /*
-        * In general, the PCIe Gen3 transition must occur after the
-        * chip has been idled (so it won't initiate any PCIe transactions
-        * e.g. an interrupt) and before the driver changes any registers
-        * (the transition will reset the registers).
-        *
-        * In particular, place this call after:
-        * - init_chip()     - the chip will not initiate any PCIe transactions
-        * - pcie_speeds()   - reads the current link speed
-        * - hfi1_firmware_init() - the needed firmware is ready to be
-        *                          downloaded
-        */
-       ret = do_pcie_gen3_transition(dd);
-       if (ret)
-               goto bail_cleanup;
-
-       /* start setting dd values and adjusting CSRs */
-       init_early_variables(dd);
-
-       parse_platform_config(dd);
-
-       ret = obtain_boardname(dd);
-       if (ret)
-               goto bail_cleanup;
-
-       snprintf(dd->boardversion, BOARD_VERS_MAX,
-                "ChipABI %u.%u, ChipRev %u.%u, SW Compat %llu\n",
-                HFI1_CHIP_VERS_MAJ, HFI1_CHIP_VERS_MIN,
-                (u32)dd->majrev,
-                (u32)dd->minrev,
-                (dd->revision >> CCE_REVISION_SW_SHIFT)
-                   & CCE_REVISION_SW_MASK);
-
-       /*
-        * The real cpu mask is part of the affinity struct but has to be
-        * initialized earlier than the rest of the affinity struct because it
-        * is needed to calculate the number of user contexts in
-        * set_up_context_variables(). However, hfi1_dev_affinity_init(),
-        * which initializes the rest of the affinity struct members,
-        * depends on set_up_context_variables() for the number of kernel
-        * contexts, so it cannot be called before set_up_context_variables().
-        */
-       ret = init_real_cpu_mask(dd);
-       if (ret)
-               goto bail_cleanup;
-
-       ret = set_up_context_variables(dd);
-       if (ret)
-               goto bail_cleanup;
-
-       /* set initial RXE CSRs */
-       init_rxe(dd);
-       /* set initial TXE CSRs */
-       init_txe(dd);
-       /* set initial non-RXE, non-TXE CSRs */
-       init_other(dd);
-       /* set up KDETH QP prefix in both RX and TX CSRs */
-       init_kdeth_qp(dd);
-
-       hfi1_dev_affinity_init(dd);
-
-       /* send contexts must be set up before receive contexts */
-       ret = init_send_contexts(dd);
-       if (ret)
-               goto bail_cleanup;
-
-       ret = hfi1_create_ctxts(dd);
-       if (ret)
-               goto bail_cleanup;
-
-       dd->rcvhdrsize = DEFAULT_RCVHDRSIZE;
-       /*
-        * rcd[0] is guaranteed to be valid by this point. Also, all
-        * context are using the same value, as per the module parameter.
-        *
-        * sizeof(u64) / sizeof(u32) is evaluated first, so the offset is
-        * the entry size minus the number of u32 words in a u64.
-        */
-       dd->rhf_offset = dd->rcd[0]->rcvhdrqentsize - sizeof(u64) / sizeof(u32);
-
-       ret = init_pervl_scs(dd);
-       if (ret)
-               goto bail_cleanup;
-
-       /* sdma init */
-       for (i = 0; i < dd->num_pports; ++i) {
-               ret = sdma_init(dd, i);
-               if (ret)
-                       goto bail_cleanup;
-       }
-
-       /* use contexts created by hfi1_create_ctxts */
-       ret = set_up_interrupts(dd);
-       if (ret)
-               goto bail_cleanup;
-
-       /* set up LCB access - must be after set_up_interrupts() */
-       init_lcb_access(dd);
-
-       snprintf(dd->serial, SERIAL_MAX, "0x%08llx\n",
-                dd->base_guid & 0xFFFFFF);
-
-       /* the top three bytes of the base GUID are the OUI */
-       dd->oui1 = dd->base_guid >> 56 & 0xFF;
-       dd->oui2 = dd->base_guid >> 48 & 0xFF;
-       dd->oui3 = dd->base_guid >> 40 & 0xFF;
-
-       ret = load_firmware(dd); /* asymmetric with dispose_firmware() */
-       if (ret)
-               goto bail_clear_intr;
-       check_fabric_firmware_versions(dd);
-
-       thermal_init(dd);
-
-       ret = init_cntrs(dd);
-       if (ret)
-               goto bail_clear_intr;
-
-       ret = init_rcverr(dd);
-       if (ret)
-               goto bail_free_cntrs;
-
-       ret = eprom_init(dd);
-       if (ret)
-               goto bail_free_rcverr;
-
-       /* success: skip the error unwinding below */
-       goto bail;
-
-       /* error unwinding, in reverse order of setup */
-bail_free_rcverr:
-       free_rcverr(dd);
-bail_free_cntrs:
-       free_cntrs(dd);
-bail_clear_intr:
-       clean_up_interrupts(dd);
-bail_cleanup:
-       hfi1_pcie_ddcleanup(dd);
-bail_free:
-       hfi1_free_devdata(dd);
-       dd = ERR_PTR(ret);
-bail:
-       return dd;
-}
-
-static u16 delay_cycles(struct hfi1_pportdata *ppd, u32 desired_egress_rate,
-                       u32 dw_len)
-{
-       u32 delta_cycles;
-       u32 current_egress_rate = ppd->current_egress_rate;
-       /* rates here are in units of 10^6 bits/sec */
-
-       if (desired_egress_rate == -1)
-               return 0; /* shouldn't happen */
-
-       if (desired_egress_rate >= current_egress_rate)
-               return 0; /* we can't help go faster, only slower */
-
-       delta_cycles = egress_cycles(dw_len * 4, desired_egress_rate) -
-                       egress_cycles(dw_len * 4, current_egress_rate);
-
-       return (u16)delta_cycles;
-}
-
-/**
- * create_pbc - build a pbc for transmission
- * @flags: special case flags or-ed in built pbc
- * @srate: static rate
- * @vl: vl
- * @dwlen: dword length (header words + data words + pbc words)
- *
- * Create a PBC with the given flags, rate, VL, and length.
- *
- * NOTE: The PBC created will not insert any HCRC - all callers but one are
- * for verbs, which does not use this PSM feature.  The lone other caller
- * is for the diagnostic interface which calls this if the user does not
- * supply their own PBC.
- */
-u64 create_pbc(struct hfi1_pportdata *ppd, u64 flags, int srate_mbs, u32 vl,
-              u32 dw_len)
-{
-       u64 pbc, delay = 0;
-
-       if (unlikely(srate_mbs))
-               delay = delay_cycles(ppd, srate_mbs, dw_len);
-
-       pbc = flags
-               | (delay << PBC_STATIC_RATE_CONTROL_COUNT_SHIFT)
-               | ((u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT)
-               | (vl & PBC_VL_MASK) << PBC_VL_SHIFT
-               | (dw_len & PBC_LENGTH_DWS_MASK)
-                       << PBC_LENGTH_DWS_SHIFT;
-
-       return pbc;
-}
-
-#define SBUS_THERMAL    0x4f
-#define SBUS_THERM_MONITOR_MODE 0x1
-
-#define THERM_FAILURE(dev, ret, reason) \
-       dd_dev_err((dd),                                                \
-                  "Thermal sensor initialization failed: %s (%d)\n",   \
-                  (reason), (ret))
-
-/*
- * Initialize the Avago Thermal sensor.
- *
- * After initialization, enable polling of thermal sensor through
- * SBus interface. In order for this to work, the SBus Master
- * firmware has to be loaded due to the fact that the HW polling
- * logic uses SBus interrupts, which are not supported with
- * default firmware. Otherwise, no data will be returned through
- * the ASIC_STS_THERM CSR.
- */
-static int thermal_init(struct hfi1_devdata *dd)
-{
-       int ret = 0;
-
-       if (dd->icode != ICODE_RTL_SILICON ||
-           check_chip_resource(dd, CR_THERM_INIT, NULL))
-               return ret;
-
-       ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT);
-       if (ret) {
-               THERM_FAILURE(dd, ret, "Acquire SBus");
-               return ret;
-       }
-
-       dd_dev_info(dd, "Initializing thermal sensor\n");
-       /* Disable polling of thermal readings */
-       write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x0);
-       msleep(100);
-       /* Thermal Sensor Initialization */
-       /*    Step 1: Reset the Thermal SBus Receiver */
-       ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
-                               RESET_SBUS_RECEIVER, 0);
-       if (ret) {
-               THERM_FAILURE(dd, ret, "Bus Reset");
-               goto done;
-       }
-       /*    Step 2: Set Reset bit in Thermal block */
-       ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
-                               WRITE_SBUS_RECEIVER, 0x1);
-       if (ret) {
-               THERM_FAILURE(dd, ret, "Therm Block Reset");
-               goto done;
-       }
-       /*    Step 3: Write clock divider value (100MHz -> 2MHz) */
-       ret = sbus_request_slow(dd, SBUS_THERMAL, 0x1,
-                               WRITE_SBUS_RECEIVER, 0x32);
-       if (ret) {
-               THERM_FAILURE(dd, ret, "Write Clock Div");
-               goto done;
-       }
-       /*    Step 4: Select temperature mode */
-       ret = sbus_request_slow(dd, SBUS_THERMAL, 0x3,
-                               WRITE_SBUS_RECEIVER,
-                               SBUS_THERM_MONITOR_MODE);
-       if (ret) {
-               THERM_FAILURE(dd, ret, "Write Mode Sel");
-               goto done;
-       }
-       /*    Step 5: De-assert block reset and start conversion */
-       ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
-                               WRITE_SBUS_RECEIVER, 0x2);
-       if (ret) {
-               THERM_FAILURE(dd, ret, "Write Reset Deassert");
-               goto done;
-       }
-       /*    Step 5.1: Wait for first conversion (21.5ms per spec) */
-       msleep(22);
-
-       /* Enable polling of thermal readings */
-       write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1);
-
-       /* Set initialized flag */
-       ret = acquire_chip_resource(dd, CR_THERM_INIT, 0);
-       if (ret)
-               THERM_FAILURE(dd, ret, "Unable to set thermal init flag");
-
-done:
-       release_chip_resource(dd, CR_SBUS);
-       return ret;
-}
-
-static void handle_temp_err(struct hfi1_devdata *dd)
-{
-       struct hfi1_pportdata *ppd = &dd->pport[0];
-       /*
-        * Thermal Critical Interrupt
-        * Put the device into forced freeze mode, take link down to
-        * offline, and put DC into reset.
-        */
-       dd_dev_emerg(dd,
-                    "Critical temperature reached! Forcing device into freeze mode!\n");
-       dd->flags |= HFI1_FORCED_FREEZE;
-       start_freeze_handling(ppd, FREEZE_SELF | FREEZE_ABORT);
-       /*
-        * Shut DC down as much and as quickly as possible.
-        *
-        * Step 1: Take the link down to OFFLINE. This will cause the
-        *         8051 to put the Serdes in reset. However, we don't want to
-        *         go through the entire link state machine since we want to
-        *         shutdown ASAP. Furthermore, this is not a graceful shutdown
-        *         but rather an attempt to save the chip.
-        *         Code below is almost the same as quiet_serdes() but avoids
-        *         all the extra work and the sleeps.
-        */
-       ppd->driver_link_ready = 0;
-       ppd->link_enabled = 0;
-       set_physical_link_state(dd, (OPA_LINKDOWN_REASON_SMA_DISABLED << 8) |
-                               PLS_OFFLINE);
-       /*
-        * Step 2: Shutdown LCB and 8051
-        *         After shutdown, do not restore DC_CFG_RESET value.
-        */
-       dc_shutdown(dd);
-}
diff --git a/drivers/staging/rdma/hfi1/chip.h b/drivers/staging/rdma/hfi1/chip.h
deleted file mode 100644 (file)
index 1948706..0000000
+++ /dev/null
@@ -1,1368 +0,0 @@
-#ifndef _CHIP_H
-#define _CHIP_H
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-/*
- * This file contains all of the defines that is specific to the HFI chip
- */
-
-/* sizes */
-#define CCE_NUM_MSIX_VECTORS 256
-#define CCE_NUM_INT_CSRS 12
-#define CCE_NUM_INT_MAP_CSRS 96
-#define NUM_INTERRUPT_SOURCES 768
-#define RXE_NUM_CONTEXTS 160
-#define RXE_PER_CONTEXT_SIZE 0x1000    /* 4k */
-#define RXE_NUM_TID_FLOWS 32
-#define RXE_NUM_DATA_VL 8
-#define TXE_NUM_CONTEXTS 160
-#define TXE_NUM_SDMA_ENGINES 16
-#define NUM_CONTEXTS_PER_SET 8
-#define VL_ARB_HIGH_PRIO_TABLE_SIZE 16
-#define VL_ARB_LOW_PRIO_TABLE_SIZE 16
-#define VL_ARB_TABLE_SIZE 16
-#define TXE_NUM_32_BIT_COUNTER 7
-#define TXE_NUM_64_BIT_COUNTER 30
-#define TXE_NUM_DATA_VL 8
-#define TXE_PIO_SIZE (32 * 0x100000)   /* 32 MB */
-#define PIO_BLOCK_SIZE 64                      /* bytes */
-#define SDMA_BLOCK_SIZE 64                     /* bytes */
-#define RCV_BUF_BLOCK_SIZE 64               /* bytes */
-#define PIO_CMASK 0x7ff        /* counter mask for free and fill counters */
-#define MAX_EAGER_ENTRIES    2048      /* max receive eager entries */
-#define MAX_TID_PAIR_ENTRIES 1024      /* max receive expected pairs */
-/*
- * Virtual? Allocation Unit, defined as AU = 8*2^vAU, 64 bytes, AU is fixed
- * at 64 bytes for all generation one devices
- */
-#define CM_VAU 3
-/* HFI link credit count, AKA receive buffer depth (RBUF_DEPTH) */
-#define CM_GLOBAL_CREDITS 0x940
-/* Number of PKey entries in the HW */
-#define MAX_PKEY_VALUES 16
-
-#include "chip_registers.h"
-
-#define RXE_PER_CONTEXT_USER   (RXE + RXE_PER_CONTEXT_OFFSET)
-#define TXE_PIO_SEND (TXE + TXE_PIO_SEND_OFFSET)
-
-/* PBC flags */
-#define PBC_INTR               BIT_ULL(31)
-#define PBC_DC_INFO_SHIFT      (30)
-#define PBC_DC_INFO            BIT_ULL(PBC_DC_INFO_SHIFT)
-#define PBC_TEST_EBP           BIT_ULL(29)
-#define PBC_PACKET_BYPASS      BIT_ULL(28)
-#define PBC_CREDIT_RETURN      BIT_ULL(25)
-#define PBC_INSERT_BYPASS_ICRC BIT_ULL(24)
-#define PBC_TEST_BAD_ICRC      BIT_ULL(23)
-#define PBC_FECN               BIT_ULL(22)
-
-/* PbcInsertHcrc field settings */
-#define PBC_IHCRC_LKDETH 0x0   /* insert @ local KDETH offset */
-#define PBC_IHCRC_GKDETH 0x1   /* insert @ global KDETH offset */
-#define PBC_IHCRC_NONE   0x2   /* no HCRC inserted */
-
-/* PBC fields */
-#define PBC_STATIC_RATE_CONTROL_COUNT_SHIFT 32
-#define PBC_STATIC_RATE_CONTROL_COUNT_MASK 0xffffull
-#define PBC_STATIC_RATE_CONTROL_COUNT_SMASK \
-       (PBC_STATIC_RATE_CONTROL_COUNT_MASK << \
-       PBC_STATIC_RATE_CONTROL_COUNT_SHIFT)
-
-#define PBC_INSERT_HCRC_SHIFT 26
-#define PBC_INSERT_HCRC_MASK 0x3ull
-#define PBC_INSERT_HCRC_SMASK \
-       (PBC_INSERT_HCRC_MASK << PBC_INSERT_HCRC_SHIFT)
-
-#define PBC_VL_SHIFT 12
-#define PBC_VL_MASK 0xfull
-#define PBC_VL_SMASK (PBC_VL_MASK << PBC_VL_SHIFT)
-
-#define PBC_LENGTH_DWS_SHIFT 0
-#define PBC_LENGTH_DWS_MASK 0xfffull
-#define PBC_LENGTH_DWS_SMASK \
-       (PBC_LENGTH_DWS_MASK << PBC_LENGTH_DWS_SHIFT)
-
-/* Credit Return Fields */
-#define CR_COUNTER_SHIFT 0
-#define CR_COUNTER_MASK 0x7ffull
-#define CR_COUNTER_SMASK (CR_COUNTER_MASK << CR_COUNTER_SHIFT)
-
-#define CR_STATUS_SHIFT 11
-#define CR_STATUS_MASK 0x1ull
-#define CR_STATUS_SMASK (CR_STATUS_MASK << CR_STATUS_SHIFT)
-
-#define CR_CREDIT_RETURN_DUE_TO_PBC_SHIFT 12
-#define CR_CREDIT_RETURN_DUE_TO_PBC_MASK 0x1ull
-#define CR_CREDIT_RETURN_DUE_TO_PBC_SMASK \
-       (CR_CREDIT_RETURN_DUE_TO_PBC_MASK << \
-       CR_CREDIT_RETURN_DUE_TO_PBC_SHIFT)
-
-#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SHIFT 13
-#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_MASK 0x1ull
-#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SMASK \
-       (CR_CREDIT_RETURN_DUE_TO_THRESHOLD_MASK << \
-       CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SHIFT)
-
-#define CR_CREDIT_RETURN_DUE_TO_ERR_SHIFT 14
-#define CR_CREDIT_RETURN_DUE_TO_ERR_MASK 0x1ull
-#define CR_CREDIT_RETURN_DUE_TO_ERR_SMASK \
-       (CR_CREDIT_RETURN_DUE_TO_ERR_MASK << \
-       CR_CREDIT_RETURN_DUE_TO_ERR_SHIFT)
-
-#define CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT 15
-#define CR_CREDIT_RETURN_DUE_TO_FORCE_MASK 0x1ull
-#define CR_CREDIT_RETURN_DUE_TO_FORCE_SMASK \
-       (CR_CREDIT_RETURN_DUE_TO_FORCE_MASK << \
-       CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT)
-
-/* interrupt source numbers */
-#define IS_GENERAL_ERR_START     0
-#define IS_SDMAENG_ERR_START    16
-#define IS_SENDCTXT_ERR_START   32
-#define IS_SDMA_START          192 /* includes SDmaProgress,SDmaIdle */
-#define IS_VARIOUS_START               240
-#define IS_DC_START                    248
-#define IS_RCVAVAIL_START              256
-#define IS_RCVURGENT_START             416
-#define IS_SENDCREDIT_START            576
-#define IS_RESERVED_START              736
-#define IS_MAX_SOURCES         768
-
-/* derived interrupt source values */
-#define IS_GENERAL_ERR_END             IS_SDMAENG_ERR_START
-#define IS_SDMAENG_ERR_END             IS_SENDCTXT_ERR_START
-#define IS_SENDCTXT_ERR_END            IS_SDMA_START
-#define IS_SDMA_END                    IS_VARIOUS_START
-#define IS_VARIOUS_END         IS_DC_START
-#define IS_DC_END                      IS_RCVAVAIL_START
-#define IS_RCVAVAIL_END                IS_RCVURGENT_START
-#define IS_RCVURGENT_END               IS_SENDCREDIT_START
-#define IS_SENDCREDIT_END              IS_RESERVED_START
-#define IS_RESERVED_END                IS_MAX_SOURCES
-
-/* absolute interrupt numbers for QSFP1Int and QSFP2Int */
-#define QSFP1_INT              242
-#define QSFP2_INT              243
-
-/* DCC_CFG_PORT_CONFIG logical link states */
-#define LSTATE_DOWN    0x1
-#define LSTATE_INIT    0x2
-#define LSTATE_ARMED   0x3
-#define LSTATE_ACTIVE  0x4
-
-/* DC8051_STS_CUR_STATE port values (physical link states) */
-#define PLS_DISABLED                      0x30
-#define PLS_OFFLINE                               0x90
-#define PLS_OFFLINE_QUIET                         0x90
-#define PLS_OFFLINE_PLANNED_DOWN_INFORM           0x91
-#define PLS_OFFLINE_READY_TO_QUIET_LT     0x92
-#define PLS_OFFLINE_REPORT_FAILURE                0x93
-#define PLS_OFFLINE_READY_TO_QUIET_BCC    0x94
-#define PLS_POLLING                               0x20
-#define PLS_POLLING_QUIET                         0x20
-#define PLS_POLLING_ACTIVE                        0x21
-#define PLS_CONFIGPHY                     0x40
-#define PLS_CONFIGPHY_DEBOUCE             0x40
-#define PLS_CONFIGPHY_ESTCOMM             0x41
-#define PLS_CONFIGPHY_ESTCOMM_TXRX_HUNT           0x42
-#define PLS_CONFIGPHY_ESTCOMM_LOCAL_COMPLETE   0x43
-#define PLS_CONFIGPHY_OPTEQ                       0x44
-#define PLS_CONFIGPHY_OPTEQ_OPTIMIZING    0x44
-#define PLS_CONFIGPHY_OPTEQ_LOCAL_COMPLETE        0x45
-#define PLS_CONFIGPHY_VERIFYCAP                   0x46
-#define PLS_CONFIGPHY_VERIFYCAP_EXCHANGE          0x46
-#define PLS_CONFIGPHY_VERIFYCAP_LOCAL_COMPLETE 0x47
-#define PLS_CONFIGLT                      0x48
-#define PLS_CONFIGLT_CONFIGURE            0x48
-#define PLS_CONFIGLT_LINK_TRANSFER_ACTIVE         0x49
-#define PLS_LINKUP                                0x50
-#define PLS_PHYTEST                               0xB0
-#define PLS_INTERNAL_SERDES_LOOPBACK      0xe1
-#define PLS_QUICK_LINKUP                          0xe2
-
-/* DC_DC8051_CFG_HOST_CMD_0.REQ_TYPE - 8051 host commands */
-#define HCMD_LOAD_CONFIG_DATA  0x01
-#define HCMD_READ_CONFIG_DATA  0x02
-#define HCMD_CHANGE_PHY_STATE  0x03
-#define HCMD_SEND_LCB_IDLE_MSG 0x04
-#define HCMD_MISC                 0x05
-#define HCMD_READ_LCB_IDLE_MSG 0x06
-#define HCMD_READ_LCB_CSR      0x07
-#define HCMD_WRITE_LCB_CSR     0x08
-#define HCMD_INTERFACE_TEST       0xff
-
-/* DC_DC8051_CFG_HOST_CMD_1.RETURN_CODE - 8051 host command return */
-#define HCMD_SUCCESS 2
-
-/* DC_DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR - error flags */
-#define SPICO_ROM_FAILED               BIT(0)
-#define UNKNOWN_FRAME                  BIT(1)
-#define TARGET_BER_NOT_MET             BIT(2)
-#define FAILED_SERDES_INTERNAL_LOOPBACK        BIT(3)
-#define FAILED_SERDES_INIT             BIT(4)
-#define FAILED_LNI_POLLING             BIT(5)
-#define FAILED_LNI_DEBOUNCE            BIT(6)
-#define FAILED_LNI_ESTBCOMM            BIT(7)
-#define FAILED_LNI_OPTEQ               BIT(8)
-#define FAILED_LNI_VERIFY_CAP1         BIT(9)
-#define FAILED_LNI_VERIFY_CAP2         BIT(10)
-#define FAILED_LNI_CONFIGLT            BIT(11)
-#define HOST_HANDSHAKE_TIMEOUT         BIT(12)
-
-#define FAILED_LNI (FAILED_LNI_POLLING | FAILED_LNI_DEBOUNCE \
-                       | FAILED_LNI_ESTBCOMM | FAILED_LNI_OPTEQ \
-                       | FAILED_LNI_VERIFY_CAP1 \
-                       | FAILED_LNI_VERIFY_CAP2 \
-                       | FAILED_LNI_CONFIGLT | HOST_HANDSHAKE_TIMEOUT)
-
-/* DC_DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG - host message flags */
-#define HOST_REQ_DONE          BIT(0)
-#define BC_PWR_MGM_MSG         BIT(1)
-#define BC_SMA_MSG             BIT(2)
-#define BC_BCC_UNKNOWN_MSG     BIT(3)
-#define BC_IDLE_UNKNOWN_MSG    BIT(4)
-#define EXT_DEVICE_CFG_REQ     BIT(5)
-#define VERIFY_CAP_FRAME       BIT(6)
-#define LINKUP_ACHIEVED                BIT(7)
-#define LINK_GOING_DOWN                BIT(8)
-#define LINK_WIDTH_DOWNGRADED  BIT(9)
-
-/* DC_DC8051_CFG_EXT_DEV_1.REQ_TYPE - 8051 host requests */
-#define HREQ_LOAD_CONFIG       0x01
-#define HREQ_SAVE_CONFIG       0x02
-#define HREQ_READ_CONFIG       0x03
-#define HREQ_SET_TX_EQ_ABS     0x04
-#define HREQ_SET_TX_EQ_REL     0x05
-#define HREQ_ENABLE            0x06
-#define HREQ_CONFIG_DONE       0xfe
-#define HREQ_INTERFACE_TEST    0xff
-
-/* DC_DC8051_CFG_EXT_DEV_0.RETURN_CODE - 8051 host request return codes */
-#define HREQ_INVALID           0x01
-#define HREQ_SUCCESS           0x02
-#define HREQ_NOT_SUPPORTED             0x03
-#define HREQ_FEATURE_NOT_SUPPORTED     0x04 /* request specific feature */
-#define HREQ_REQUEST_REJECTED  0xfe
-#define HREQ_EXECUTION_ONGOING 0xff
-
-/* MISC host command functions */
-#define HCMD_MISC_REQUEST_LCB_ACCESS 0x1
-#define HCMD_MISC_GRANT_LCB_ACCESS   0x2
-
-/* idle flit message types */
-#define IDLE_PHYSICAL_LINK_MGMT 0x1
-#define IDLE_CRU                   0x2
-#define IDLE_SMA                   0x3
-#define IDLE_POWER_MGMT            0x4
-
-/* idle flit message send fields (both send and read) */
-#define IDLE_PAYLOAD_MASK 0xffffffffffull /* 40 bits */
-#define IDLE_PAYLOAD_SHIFT 8
-#define IDLE_MSG_TYPE_MASK 0xf
-#define IDLE_MSG_TYPE_SHIFT 0
-
-/* idle flit message read fields */
-#define READ_IDLE_MSG_TYPE_MASK 0xf
-#define READ_IDLE_MSG_TYPE_SHIFT 0
-
-/* SMA idle flit payload commands */
-#define SMA_IDLE_ARM   1
-#define SMA_IDLE_ACTIVE 2
-
-/* DC_DC8051_CFG_MODE.GENERAL bits */
-#define DISABLE_SELF_GUID_CHECK 0x2
-
-/*
- * Eager buffer minimum and maximum sizes supported by the hardware.
- * All power-of-two sizes in between are supported as well.
- * MAX_EAGER_BUFFER_TOTAL is the maximum size of memory
- * allocatable for Eager buffer to a single context. All others
- * are limits for the RcvArray entries.
- */
-#define MIN_EAGER_BUFFER       (4 * 1024)
-#define MAX_EAGER_BUFFER       (256 * 1024)
-#define MAX_EAGER_BUFFER_TOTAL (64 * (1 << 20)) /* max per ctxt 64MB */
-#define MAX_EXPECTED_BUFFER    (2048 * 1024)
-
-/*
- * Receive expected base and count and eager base and count increment -
- * the CSR fields hold multiples of this value.
- */
-#define RCV_SHIFT 3
-#define RCV_INCREMENT BIT(RCV_SHIFT)
-
-/*
- * Receive header queue entry increment - the CSR holds multiples of
- * this value.
- */
-#define HDRQ_SIZE_SHIFT 5
-#define HDRQ_INCREMENT BIT(HDRQ_SIZE_SHIFT)
-
-/*
- * Freeze handling flags
- */
-#define FREEZE_ABORT     0x01  /* do not do recovery */
-#define FREEZE_SELF         0x02       /* initiate the freeze */
-#define FREEZE_LINK_DOWN 0x04  /* link is down */
-
-/*
- * Chip implementation codes.
- */
-#define ICODE_RTL_SILICON              0x00
-#define ICODE_RTL_VCS_SIMULATION       0x01
-#define ICODE_FPGA_EMULATION   0x02
-#define ICODE_FUNCTIONAL_SIMULATOR     0x03
-
-/*
- * 8051 data memory size.
- */
-#define DC8051_DATA_MEM_SIZE 0x1000
-
-/*
- * 8051 firmware registers
- */
-#define NUM_GENERAL_FIELDS 0x17
-#define NUM_LANE_FIELDS    0x8
-
-/* 8051 general register Field IDs */
-#define LINK_OPTIMIZATION_SETTINGS   0x00
-#define LINK_TUNING_PARAMETERS      0x02
-#define DC_HOST_COMM_SETTINGS       0x03
-#define TX_SETTINGS                 0x06
-#define VERIFY_CAP_LOCAL_PHY        0x07
-#define VERIFY_CAP_LOCAL_FABRIC             0x08
-#define VERIFY_CAP_LOCAL_LINK_WIDTH  0x09
-#define LOCAL_DEVICE_ID                     0x0a
-#define LOCAL_LNI_INFO              0x0c
-#define REMOTE_LNI_INFO              0x0d
-#define MISC_STATUS                 0x0e
-#define VERIFY_CAP_REMOTE_PHY       0x0f
-#define VERIFY_CAP_REMOTE_FABRIC     0x10
-#define VERIFY_CAP_REMOTE_LINK_WIDTH 0x11
-#define LAST_LOCAL_STATE_COMPLETE    0x12
-#define LAST_REMOTE_STATE_COMPLETE   0x13
-#define LINK_QUALITY_INFO            0x14
-#define REMOTE_DEVICE_ID            0x15
-#define LINK_DOWN_REASON            0x16
-
-/* 8051 lane specific register field IDs */
-#define TX_EQ_SETTINGS         0x00
-#define CHANNEL_LOSS_SETTINGS  0x05
-
-/* Lane ID for general configuration registers */
-#define GENERAL_CONFIG 4
-
-/* LOAD_DATA 8051 command shifts and fields */
-#define LOAD_DATA_FIELD_ID_SHIFT 40
-#define LOAD_DATA_FIELD_ID_MASK 0xfull
-#define LOAD_DATA_LANE_ID_SHIFT 32
-#define LOAD_DATA_LANE_ID_MASK 0xfull
-#define LOAD_DATA_DATA_SHIFT   0x0
-#define LOAD_DATA_DATA_MASK   0xffffffffull
-
-/* READ_DATA 8051 command shifts and fields */
-#define READ_DATA_FIELD_ID_SHIFT 40
-#define READ_DATA_FIELD_ID_MASK 0xffull
-#define READ_DATA_LANE_ID_SHIFT 32
-#define READ_DATA_LANE_ID_MASK 0xffull
-#define READ_DATA_DATA_SHIFT   0x0
-#define READ_DATA_DATA_MASK   0xffffffffull
-
-/* TX settings fields */
-#define ENABLE_LANE_TX_SHIFT           0
-#define ENABLE_LANE_TX_MASK            0xff
-#define TX_POLARITY_INVERSION_SHIFT    8
-#define TX_POLARITY_INVERSION_MASK     0xff
-#define RX_POLARITY_INVERSION_SHIFT    16
-#define RX_POLARITY_INVERSION_MASK     0xff
-#define MAX_RATE_SHIFT                 24
-#define MAX_RATE_MASK                  0xff
-
-/* verify capability PHY fields */
-#define CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT 0x4
-#define CONTINIOUS_REMOTE_UPDATE_SUPPORT_MASK  0x1
-#define POWER_MANAGEMENT_SHIFT                 0x0
-#define POWER_MANAGEMENT_MASK                  0xf
-
-/* 8051 lane register Field IDs */
-#define SPICO_FW_VERSION 0x7   /* SPICO firmware version */
-
-/* SPICO firmware version fields */
-#define SPICO_ROM_VERSION_SHIFT 0
-#define SPICO_ROM_VERSION_MASK 0xffff
-#define SPICO_ROM_PROD_ID_SHIFT 16
-#define SPICO_ROM_PROD_ID_MASK 0xffff
-
-/* verify capability fabric fields */
-#define VAU_SHIFT      0
-#define VAU_MASK       0x0007
-#define Z_SHIFT                3
-#define Z_MASK         0x0001
-#define VCU_SHIFT      4
-#define VCU_MASK       0x0007
-#define VL15BUF_SHIFT  8
-#define VL15BUF_MASK   0x0fff
-#define CRC_SIZES_SHIFT 20
-#define CRC_SIZES_MASK 0x7
-
-/* verify capability local link width fields */
-#define LINK_WIDTH_SHIFT 0             /* also for remote link width */
-#define LINK_WIDTH_MASK 0xffff         /* also for remote link width */
-#define LOCAL_FLAG_BITS_SHIFT 16
-#define LOCAL_FLAG_BITS_MASK 0xff
-#define MISC_CONFIG_BITS_SHIFT 24
-#define MISC_CONFIG_BITS_MASK 0xff
-
-/* verify capability remote link width fields */
-#define REMOTE_TX_RATE_SHIFT 16
-#define REMOTE_TX_RATE_MASK 0xff
-
-/* LOCAL_DEVICE_ID fields */
-#define LOCAL_DEVICE_REV_SHIFT 0
-#define LOCAL_DEVICE_REV_MASK 0xff
-#define LOCAL_DEVICE_ID_SHIFT 8
-#define LOCAL_DEVICE_ID_MASK 0xffff
-
-/* REMOTE_DEVICE_ID fields */
-#define REMOTE_DEVICE_REV_SHIFT 0
-#define REMOTE_DEVICE_REV_MASK 0xff
-#define REMOTE_DEVICE_ID_SHIFT 8
-#define REMOTE_DEVICE_ID_MASK 0xffff
-
-/* local LNI link width fields */
-#define ENABLE_LANE_RX_SHIFT 16
-#define ENABLE_LANE_RX_MASK  0xff
-
-/* mask, shift for reading 'mgmt_enabled' value from REMOTE_LNI_INFO field */
-#define MGMT_ALLOWED_SHIFT 23
-#define MGMT_ALLOWED_MASK 0x1
-
-/* mask, shift for 'link_quality' within LINK_QUALITY_INFO field */
-#define LINK_QUALITY_SHIFT 24
-#define LINK_QUALITY_MASK  0x7
-
-/*
- * mask, shift for reading 'planned_down_remote_reason_code'
- * from LINK_QUALITY_INFO field
- */
-#define DOWN_REMOTE_REASON_SHIFT 16
-#define DOWN_REMOTE_REASON_MASK  0xff
-
-/* verify capability PHY power management bits */
-#define PWRM_BER_CONTROL       0x1
-#define PWRM_BANDWIDTH_CONTROL 0x2
-
-/* 8051 link down reasons */
-#define LDR_LINK_TRANSFER_ACTIVE_LOW   0xa
-#define LDR_RECEIVED_LINKDOWN_IDLE_MSG 0xb
-#define LDR_RECEIVED_HOST_OFFLINE_REQ  0xc
-
-/* verify capability fabric CRC size bits */
-enum {
-       CAP_CRC_14B = (1 << 0), /* 14b CRC */
-       CAP_CRC_48B = (1 << 1), /* 48b CRC */
-       CAP_CRC_12B_16B_PER_LANE = (1 << 2) /* 12b-16b per lane CRC */
-};
-
-#define SUPPORTED_CRCS (CAP_CRC_14B | CAP_CRC_48B)
-
-/* misc status version fields */
-#define STS_FM_VERSION_A_SHIFT 16
-#define STS_FM_VERSION_A_MASK  0xff
-#define STS_FM_VERSION_B_SHIFT 24
-#define STS_FM_VERSION_B_MASK  0xff
-
-/* LCB_CFG_CRC_MODE TX_VAL and RX_VAL CRC mode values */
-#define LCB_CRC_16B                    0x0     /* 16b CRC */
-#define LCB_CRC_14B                    0x1     /* 14b CRC */
-#define LCB_CRC_48B                    0x2     /* 48b CRC */
-#define LCB_CRC_12B_16B_PER_LANE       0x3     /* 12b-16b per lane CRC */
-
-/*
- * the following enum is (almost) a copy/paste of the definition
- * in the OPA spec, section 20.2.2.6.8 (PortInfo)
- */
-enum {
-       PORT_LTP_CRC_MODE_NONE = 0,
-       PORT_LTP_CRC_MODE_14 = 1, /* 14-bit LTP CRC mode (optional) */
-       PORT_LTP_CRC_MODE_16 = 2, /* 16-bit LTP CRC mode */
-       PORT_LTP_CRC_MODE_48 = 4,
-               /* 48-bit overlapping LTP CRC mode (optional) */
-       PORT_LTP_CRC_MODE_PER_LANE = 8
-               /* 12 to 16 bit per lane LTP CRC mode (optional) */
-};
-
-/* timeouts */
-#define LINK_RESTART_DELAY 1000                /* link restart delay, in ms */
-#define TIMEOUT_8051_START 5000         /* 8051 start timeout, in ms */
-#define DC8051_COMMAND_TIMEOUT 20000   /* DC8051 command timeout, in ms */
-#define FREEZE_STATUS_TIMEOUT 20       /* wait for freeze indicators, in ms */
-#define VL_STATUS_CLEAR_TIMEOUT 5000   /* per-VL status clear, in ms */
-#define CCE_STATUS_TIMEOUT 10          /* time to clear CCE Status, in ms */
-
-/* cclock tick time, in picoseconds per tick: 1/speed * 10^12  */
-#define ASIC_CCLOCK_PS  1242   /* 805 MHz */
-#define FPGA_CCLOCK_PS 30300   /*  33 MHz */
-
-/*
- * Mask of enabled MISC errors.  Do not enable the two RSA engine errors -
- * see firmware.c:run_rsa() for details.
- */
-#define DRIVER_MISC_MASK \
-       (~(MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK \
-               | MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK))
-
-/* valid values for the loopback module parameter */
-#define LOOPBACK_NONE  0       /* no loopback - default */
-#define LOOPBACK_SERDES 1
-#define LOOPBACK_LCB   2
-#define LOOPBACK_CABLE 3       /* external cable */
-
-/* read and write hardware registers */
-u64 read_csr(const struct hfi1_devdata *dd, u32 offset);
-void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value);
-
-/*
- * The *_kctxt_* flavor of the CSR read/write functions are for
- * per-context or per-SDMA CSRs that are not mappable to user-space.
- * Their spacing is not a PAGE_SIZE multiple.
- */
-static inline u64 read_kctxt_csr(const struct hfi1_devdata *dd, int ctxt,
-                                u32 offset0)
-{
-       /* kernel per-context CSRs are separated by 0x100 */
-       return read_csr(dd, offset0 + (0x100 * ctxt));
-}
-
-static inline void write_kctxt_csr(struct hfi1_devdata *dd, int ctxt,
-                                  u32 offset0, u64 value)
-{
-       /* kernel per-context CSRs are separated by 0x100 */
-       write_csr(dd, offset0 + (0x100 * ctxt), value);
-}
-
-int read_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 *data);
-int write_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 data);
-
-void __iomem *get_csr_addr(
-       struct hfi1_devdata *dd,
-       u32 offset);
-
-static inline void __iomem *get_kctxt_csr_addr(
-       struct hfi1_devdata *dd,
-       int ctxt,
-       u32 offset0)
-{
-       return get_csr_addr(dd, offset0 + (0x100 * ctxt));
-}
-
-/*
- * The *_uctxt_* flavor of the CSR read/write functions are for
- * per-context CSRs that are mappable to user space. All these CSRs
- * are spaced by a PAGE_SIZE multiple in order to be mappable to
- * different processes without exposing other contexts' CSRs
- */
-static inline u64 read_uctxt_csr(const struct hfi1_devdata *dd, int ctxt,
-                                u32 offset0)
-{
-       /* user per-context CSRs are separated by 0x1000 */
-       return read_csr(dd, offset0 + (0x1000 * ctxt));
-}
-
-static inline void write_uctxt_csr(struct hfi1_devdata *dd, int ctxt,
-                                  u32 offset0, u64 value)
-{
-       /* user per-context CSRs are separated by 0x1000 */
-       write_csr(dd, offset0 + (0x1000 * ctxt), value);
-}
-
-u64 create_pbc(struct hfi1_pportdata *ppd, u64, int, u32, u32);
-
-/* firmware.c */
-#define SBUS_MASTER_BROADCAST 0xfd
-#define NUM_PCIE_SERDES 16     /* number of PCIe serdes on the SBus */
-extern const u8 pcie_serdes_broadcast[];
-extern const u8 pcie_pcs_addrs[2][NUM_PCIE_SERDES];
-extern uint platform_config_load;
-
-/* SBus commands */
-#define RESET_SBUS_RECEIVER 0x20
-#define WRITE_SBUS_RECEIVER 0x21
-void sbus_request(struct hfi1_devdata *dd,
-                 u8 receiver_addr, u8 data_addr, u8 command, u32 data_in);
-int sbus_request_slow(struct hfi1_devdata *dd,
-                     u8 receiver_addr, u8 data_addr, u8 command, u32 data_in);
-void set_sbus_fast_mode(struct hfi1_devdata *dd);
-void clear_sbus_fast_mode(struct hfi1_devdata *dd);
-int hfi1_firmware_init(struct hfi1_devdata *dd);
-int load_pcie_firmware(struct hfi1_devdata *dd);
-int load_firmware(struct hfi1_devdata *dd);
-void dispose_firmware(void);
-int acquire_hw_mutex(struct hfi1_devdata *dd);
-void release_hw_mutex(struct hfi1_devdata *dd);
-
-/*
- * Bitmask of dynamic access for ASIC block chip resources.  Each HFI has its
- * own range of bits for the resource so it can clear its own bits on
- * starting and exiting.  If either HFI has the resource bit set, the
- * resource is in use.  The separate bit ranges are:
- *     HFI0 bits  7:0
- *     HFI1 bits 15:8
- */
-#define CR_SBUS  0x01  /* SBUS, THERM, and PCIE registers */
-#define CR_EPROM 0x02  /* EEP, GPIO registers */
-#define CR_I2C1  0x04  /* QSFP1_OE register */
-#define CR_I2C2  0x08  /* QSFP2_OE register */
-#define CR_DYN_SHIFT 8 /* dynamic flag shift */
-#define CR_DYN_MASK  ((1ull << CR_DYN_SHIFT) - 1)
-
-/*
- * Bitmask of static ASIC states these are outside of the dynamic ASIC
- * block chip resources above.  These are to be set once and never cleared.
- * Must be holding the SBus dynamic flag when setting.
- */
-#define CR_THERM_INIT  0x010000
-
-int acquire_chip_resource(struct hfi1_devdata *dd, u32 resource, u32 mswait);
-void release_chip_resource(struct hfi1_devdata *dd, u32 resource);
-bool check_chip_resource(struct hfi1_devdata *dd, u32 resource,
-                        const char *func);
-void init_chip_resources(struct hfi1_devdata *dd);
-void finish_chip_resources(struct hfi1_devdata *dd);
-
-/* ms wait time for access to an SBus resoure */
-#define SBUS_TIMEOUT 4000 /* long enough for a FW download and SBR */
-
-/* ms wait time for a qsfp (i2c) chain to become available */
-#define QSFP_WAIT 20000 /* long enough for FW update to the F4 uc */
-
-void fabric_serdes_reset(struct hfi1_devdata *dd);
-int read_8051_data(struct hfi1_devdata *dd, u32 addr, u32 len, u64 *result);
-
-/* chip.c */
-void read_misc_status(struct hfi1_devdata *dd, u8 *ver_a, u8 *ver_b);
-void read_guid(struct hfi1_devdata *dd);
-int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout);
-void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason,
-                         u8 neigh_reason, u8 rem_reason);
-int set_link_state(struct hfi1_pportdata *, u32 state);
-int port_ltp_to_cap(int port_ltp);
-void handle_verify_cap(struct work_struct *work);
-void handle_freeze(struct work_struct *work);
-void handle_link_up(struct work_struct *work);
-void handle_link_down(struct work_struct *work);
-void handle_link_downgrade(struct work_struct *work);
-void handle_link_bounce(struct work_struct *work);
-void handle_sma_message(struct work_struct *work);
-void reset_qsfp(struct hfi1_pportdata *ppd);
-void qsfp_event(struct work_struct *work);
-void start_freeze_handling(struct hfi1_pportdata *ppd, int flags);
-int send_idle_sma(struct hfi1_devdata *dd, u64 message);
-int load_8051_config(struct hfi1_devdata *, u8, u8, u32);
-int read_8051_config(struct hfi1_devdata *, u8, u8, u32 *);
-int start_link(struct hfi1_pportdata *ppd);
-int bringup_serdes(struct hfi1_pportdata *ppd);
-void set_intr_state(struct hfi1_devdata *dd, u32 enable);
-void apply_link_downgrade_policy(struct hfi1_pportdata *ppd,
-                                int refresh_widths);
-void update_usrhead(struct hfi1_ctxtdata *, u32, u32, u32, u32, u32);
-int stop_drain_data_vls(struct hfi1_devdata *dd);
-int open_fill_data_vls(struct hfi1_devdata *dd);
-u32 ns_to_cclock(struct hfi1_devdata *dd, u32 ns);
-u32 cclock_to_ns(struct hfi1_devdata *dd, u32 cclock);
-void get_linkup_link_widths(struct hfi1_pportdata *ppd);
-void read_ltp_rtt(struct hfi1_devdata *dd);
-void clear_linkup_counters(struct hfi1_devdata *dd);
-u32 hdrqempty(struct hfi1_ctxtdata *rcd);
-int is_ax(struct hfi1_devdata *dd);
-int is_bx(struct hfi1_devdata *dd);
-u32 read_physical_state(struct hfi1_devdata *dd);
-u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate);
-u32 get_logical_state(struct hfi1_pportdata *ppd);
-const char *opa_lstate_name(u32 lstate);
-const char *opa_pstate_name(u32 pstate);
-u32 driver_physical_state(struct hfi1_pportdata *ppd);
-u32 driver_logical_state(struct hfi1_pportdata *ppd);
-
-int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok);
-int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok);
-#define LCB_START DC_LCB_CSRS
-#define LCB_END   DC_8051_CSRS /* next block is 8051 */
-static inline int is_lcb_offset(u32 offset)
-{
-       return (offset >= LCB_START && offset < LCB_END);
-}
-
-extern uint num_vls;
-
-extern uint disable_integrity;
-u64 read_dev_cntr(struct hfi1_devdata *dd, int index, int vl);
-u64 write_dev_cntr(struct hfi1_devdata *dd, int index, int vl, u64 data);
-u64 read_port_cntr(struct hfi1_pportdata *ppd, int index, int vl);
-u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data);
-u32 read_logical_state(struct hfi1_devdata *dd);
-void force_recv_intr(struct hfi1_ctxtdata *rcd);
-
-/* Per VL indexes */
-enum {
-       C_VL_0 = 0,
-       C_VL_1,
-       C_VL_2,
-       C_VL_3,
-       C_VL_4,
-       C_VL_5,
-       C_VL_6,
-       C_VL_7,
-       C_VL_15,
-       C_VL_COUNT
-};
-
-static inline int vl_from_idx(int idx)
-{
-       return (idx == C_VL_15 ? 15 : idx);
-}
-
-static inline int idx_from_vl(int vl)
-{
-       return (vl == 15 ? C_VL_15 : vl);
-}
-
-/* Per device counter indexes */
-enum {
-       C_RCV_OVF = 0,
-       C_RX_TID_FULL,
-       C_RX_TID_INVALID,
-       C_RX_TID_FLGMS,
-       C_RX_CTX_EGRS,
-       C_RCV_TID_FLSMS,
-       C_CCE_PCI_CR_ST,
-       C_CCE_PCI_TR_ST,
-       C_CCE_PIO_WR_ST,
-       C_CCE_ERR_INT,
-       C_CCE_SDMA_INT,
-       C_CCE_MISC_INT,
-       C_CCE_RCV_AV_INT,
-       C_CCE_RCV_URG_INT,
-       C_CCE_SEND_CR_INT,
-       C_DC_UNC_ERR,
-       C_DC_RCV_ERR,
-       C_DC_FM_CFG_ERR,
-       C_DC_RMT_PHY_ERR,
-       C_DC_DROPPED_PKT,
-       C_DC_MC_XMIT_PKTS,
-       C_DC_MC_RCV_PKTS,
-       C_DC_XMIT_CERR,
-       C_DC_RCV_CERR,
-       C_DC_RCV_FCC,
-       C_DC_XMIT_FCC,
-       C_DC_XMIT_FLITS,
-       C_DC_RCV_FLITS,
-       C_DC_XMIT_PKTS,
-       C_DC_RCV_PKTS,
-       C_DC_RX_FLIT_VL,
-       C_DC_RX_PKT_VL,
-       C_DC_RCV_FCN,
-       C_DC_RCV_FCN_VL,
-       C_DC_RCV_BCN,
-       C_DC_RCV_BCN_VL,
-       C_DC_RCV_BBL,
-       C_DC_RCV_BBL_VL,
-       C_DC_MARK_FECN,
-       C_DC_MARK_FECN_VL,
-       C_DC_TOTAL_CRC,
-       C_DC_CRC_LN0,
-       C_DC_CRC_LN1,
-       C_DC_CRC_LN2,
-       C_DC_CRC_LN3,
-       C_DC_CRC_MULT_LN,
-       C_DC_TX_REPLAY,
-       C_DC_RX_REPLAY,
-       C_DC_SEQ_CRC_CNT,
-       C_DC_ESC0_ONLY_CNT,
-       C_DC_ESC0_PLUS1_CNT,
-       C_DC_ESC0_PLUS2_CNT,
-       C_DC_REINIT_FROM_PEER_CNT,
-       C_DC_SBE_CNT,
-       C_DC_MISC_FLG_CNT,
-       C_DC_PRF_GOOD_LTP_CNT,
-       C_DC_PRF_ACCEPTED_LTP_CNT,
-       C_DC_PRF_RX_FLIT_CNT,
-       C_DC_PRF_TX_FLIT_CNT,
-       C_DC_PRF_CLK_CNTR,
-       C_DC_PG_DBG_FLIT_CRDTS_CNT,
-       C_DC_PG_STS_PAUSE_COMPLETE_CNT,
-       C_DC_PG_STS_TX_SBE_CNT,
-       C_DC_PG_STS_TX_MBE_CNT,
-       C_SW_CPU_INTR,
-       C_SW_CPU_RCV_LIM,
-       C_SW_VTX_WAIT,
-       C_SW_PIO_WAIT,
-       C_SW_PIO_DRAIN,
-       C_SW_KMEM_WAIT,
-       C_SW_SEND_SCHED,
-       C_SDMA_DESC_FETCHED_CNT,
-       C_SDMA_INT_CNT,
-       C_SDMA_ERR_CNT,
-       C_SDMA_IDLE_INT_CNT,
-       C_SDMA_PROGRESS_INT_CNT,
-/* MISC_ERR_STATUS */
-       C_MISC_PLL_LOCK_FAIL_ERR,
-       C_MISC_MBIST_FAIL_ERR,
-       C_MISC_INVALID_EEP_CMD_ERR,
-       C_MISC_EFUSE_DONE_PARITY_ERR,
-       C_MISC_EFUSE_WRITE_ERR,
-       C_MISC_EFUSE_READ_BAD_ADDR_ERR,
-       C_MISC_EFUSE_CSR_PARITY_ERR,
-       C_MISC_FW_AUTH_FAILED_ERR,
-       C_MISC_KEY_MISMATCH_ERR,
-       C_MISC_SBUS_WRITE_FAILED_ERR,
-       C_MISC_CSR_WRITE_BAD_ADDR_ERR,
-       C_MISC_CSR_READ_BAD_ADDR_ERR,
-       C_MISC_CSR_PARITY_ERR,
-/* CceErrStatus */
-       /*
-       * A special counter that is the aggregate count
-       * of all the cce_err_status errors.  The remainder
-       * are actual bits in the CceErrStatus register.
-       */
-       C_CCE_ERR_STATUS_AGGREGATED_CNT,
-       C_CCE_MSIX_CSR_PARITY_ERR,
-       C_CCE_INT_MAP_UNC_ERR,
-       C_CCE_INT_MAP_COR_ERR,
-       C_CCE_MSIX_TABLE_UNC_ERR,
-       C_CCE_MSIX_TABLE_COR_ERR,
-       C_CCE_RXDMA_CONV_FIFO_PARITY_ERR,
-       C_CCE_RCPL_ASYNC_FIFO_PARITY_ERR,
-       C_CCE_SEG_WRITE_BAD_ADDR_ERR,
-       C_CCE_SEG_READ_BAD_ADDR_ERR,
-       C_LA_TRIGGERED,
-       C_CCE_TRGT_CPL_TIMEOUT_ERR,
-       C_PCIC_RECEIVE_PARITY_ERR,
-       C_PCIC_TRANSMIT_BACK_PARITY_ERR,
-       C_PCIC_TRANSMIT_FRONT_PARITY_ERR,
-       C_PCIC_CPL_DAT_Q_UNC_ERR,
-       C_PCIC_CPL_HD_Q_UNC_ERR,
-       C_PCIC_POST_DAT_Q_UNC_ERR,
-       C_PCIC_POST_HD_Q_UNC_ERR,
-       C_PCIC_RETRY_SOT_MEM_UNC_ERR,
-       C_PCIC_RETRY_MEM_UNC_ERR,
-       C_PCIC_N_POST_DAT_Q_PARITY_ERR,
-       C_PCIC_N_POST_H_Q_PARITY_ERR,
-       C_PCIC_CPL_DAT_Q_COR_ERR,
-       C_PCIC_CPL_HD_Q_COR_ERR,
-       C_PCIC_POST_DAT_Q_COR_ERR,
-       C_PCIC_POST_HD_Q_COR_ERR,
-       C_PCIC_RETRY_SOT_MEM_COR_ERR,
-       C_PCIC_RETRY_MEM_COR_ERR,
-       C_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERR,
-       C_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERR,
-       C_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR,
-       C_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR,
-       C_CCE_CLI2_ASYNC_FIFO_PARITY_ERR,
-       C_CCE_CSR_CFG_BUS_PARITY_ERR,
-       C_CCE_CLI0_ASYNC_FIFO_PARTIY_ERR,
-       C_CCE_RSPD_DATA_PARITY_ERR,
-       C_CCE_TRGT_ACCESS_ERR,
-       C_CCE_TRGT_ASYNC_FIFO_PARITY_ERR,
-       C_CCE_CSR_WRITE_BAD_ADDR_ERR,
-       C_CCE_CSR_READ_BAD_ADDR_ERR,
-       C_CCE_CSR_PARITY_ERR,
-/* RcvErrStatus */
-       C_RX_CSR_PARITY_ERR,
-       C_RX_CSR_WRITE_BAD_ADDR_ERR,
-       C_RX_CSR_READ_BAD_ADDR_ERR,
-       C_RX_DMA_CSR_UNC_ERR,
-       C_RX_DMA_DQ_FSM_ENCODING_ERR,
-       C_RX_DMA_EQ_FSM_ENCODING_ERR,
-       C_RX_DMA_CSR_PARITY_ERR,
-       C_RX_RBUF_DATA_COR_ERR,
-       C_RX_RBUF_DATA_UNC_ERR,
-       C_RX_DMA_DATA_FIFO_RD_COR_ERR,
-       C_RX_DMA_DATA_FIFO_RD_UNC_ERR,
-       C_RX_DMA_HDR_FIFO_RD_COR_ERR,
-       C_RX_DMA_HDR_FIFO_RD_UNC_ERR,
-       C_RX_RBUF_DESC_PART2_COR_ERR,
-       C_RX_RBUF_DESC_PART2_UNC_ERR,
-       C_RX_RBUF_DESC_PART1_COR_ERR,
-       C_RX_RBUF_DESC_PART1_UNC_ERR,
-       C_RX_HQ_INTR_FSM_ERR,
-       C_RX_HQ_INTR_CSR_PARITY_ERR,
-       C_RX_LOOKUP_CSR_PARITY_ERR,
-       C_RX_LOOKUP_RCV_ARRAY_COR_ERR,
-       C_RX_LOOKUP_RCV_ARRAY_UNC_ERR,
-       C_RX_LOOKUP_DES_PART2_PARITY_ERR,
-       C_RX_LOOKUP_DES_PART1_UNC_COR_ERR,
-       C_RX_LOOKUP_DES_PART1_UNC_ERR,
-       C_RX_RBUF_NEXT_FREE_BUF_COR_ERR,
-       C_RX_RBUF_NEXT_FREE_BUF_UNC_ERR,
-       C_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR,
-       C_RX_RBUF_FL_INITDONE_PARITY_ERR,
-       C_RX_RBUF_FL_WRITE_ADDR_PARITY_ERR,
-       C_RX_RBUF_FL_RD_ADDR_PARITY_ERR,
-       C_RX_RBUF_EMPTY_ERR,
-       C_RX_RBUF_FULL_ERR,
-       C_RX_RBUF_BAD_LOOKUP_ERR,
-       C_RX_RBUF_CTX_ID_PARITY_ERR,
-       C_RX_RBUF_CSR_QEOPDW_PARITY_ERR,
-       C_RX_RBUF_CSR_Q_NUM_OF_PKT_PARITY_ERR,
-       C_RX_RBUF_CSR_Q_T1_PTR_PARITY_ERR,
-       C_RX_RBUF_CSR_Q_HD_PTR_PARITY_ERR,
-       C_RX_RBUF_CSR_Q_VLD_BIT_PARITY_ERR,
-       C_RX_RBUF_CSR_Q_NEXT_BUF_PARITY_ERR,
-       C_RX_RBUF_CSR_Q_ENT_CNT_PARITY_ERR,
-       C_RX_RBUF_CSR_Q_HEAD_BUF_NUM_PARITY_ERR,
-       C_RX_RBUF_BLOCK_LIST_READ_COR_ERR,
-       C_RX_RBUF_BLOCK_LIST_READ_UNC_ERR,
-       C_RX_RBUF_LOOKUP_DES_COR_ERR,
-       C_RX_RBUF_LOOKUP_DES_UNC_ERR,
-       C_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR,
-       C_RX_RBUF_LOOKUP_DES_REG_UNC_ERR,
-       C_RX_RBUF_FREE_LIST_COR_ERR,
-       C_RX_RBUF_FREE_LIST_UNC_ERR,
-       C_RX_RCV_FSM_ENCODING_ERR,
-       C_RX_DMA_FLAG_COR_ERR,
-       C_RX_DMA_FLAG_UNC_ERR,
-       C_RX_DC_SOP_EOP_PARITY_ERR,
-       C_RX_RCV_CSR_PARITY_ERR,
-       C_RX_RCV_QP_MAP_TABLE_COR_ERR,
-       C_RX_RCV_QP_MAP_TABLE_UNC_ERR,
-       C_RX_RCV_DATA_COR_ERR,
-       C_RX_RCV_DATA_UNC_ERR,
-       C_RX_RCV_HDR_COR_ERR,
-       C_RX_RCV_HDR_UNC_ERR,
-       C_RX_DC_INTF_PARITY_ERR,
-       C_RX_DMA_CSR_COR_ERR,
-/* SendPioErrStatus */
-       C_PIO_PEC_SOP_HEAD_PARITY_ERR,
-       C_PIO_PCC_SOP_HEAD_PARITY_ERR,
-       C_PIO_LAST_RETURNED_CNT_PARITY_ERR,
-       C_PIO_CURRENT_FREE_CNT_PARITY_ERR,
-       C_PIO_RSVD_31_ERR,
-       C_PIO_RSVD_30_ERR,
-       C_PIO_PPMC_SOP_LEN_ERR,
-       C_PIO_PPMC_BQC_MEM_PARITY_ERR,
-       C_PIO_VL_FIFO_PARITY_ERR,
-       C_PIO_VLF_SOP_PARITY_ERR,
-       C_PIO_VLF_V1_LEN_PARITY_ERR,
-       C_PIO_BLOCK_QW_COUNT_PARITY_ERR,
-       C_PIO_WRITE_QW_VALID_PARITY_ERR,
-       C_PIO_STATE_MACHINE_ERR,
-       C_PIO_WRITE_DATA_PARITY_ERR,
-       C_PIO_HOST_ADDR_MEM_COR_ERR,
-       C_PIO_HOST_ADDR_MEM_UNC_ERR,
-       C_PIO_PKT_EVICT_SM_OR_ARM_SM_ERR,
-       C_PIO_INIT_SM_IN_ERR,
-       C_PIO_PPMC_PBL_FIFO_ERR,
-       C_PIO_CREDIT_RET_FIFO_PARITY_ERR,
-       C_PIO_V1_LEN_MEM_BANK1_COR_ERR,
-       C_PIO_V1_LEN_MEM_BANK0_COR_ERR,
-       C_PIO_V1_LEN_MEM_BANK1_UNC_ERR,
-       C_PIO_V1_LEN_MEM_BANK0_UNC_ERR,
-       C_PIO_SM_PKT_RESET_PARITY_ERR,
-       C_PIO_PKT_EVICT_FIFO_PARITY_ERR,
-       C_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR,
-       C_PIO_SBRDCTL_CRREL_PARITY_ERR,
-       C_PIO_PEC_FIFO_PARITY_ERR,
-       C_PIO_PCC_FIFO_PARITY_ERR,
-       C_PIO_SB_MEM_FIFO1_ERR,
-       C_PIO_SB_MEM_FIFO0_ERR,
-       C_PIO_CSR_PARITY_ERR,
-       C_PIO_WRITE_ADDR_PARITY_ERR,
-       C_PIO_WRITE_BAD_CTXT_ERR,
-/* SendDmaErrStatus */
-       C_SDMA_PCIE_REQ_TRACKING_COR_ERR,
-       C_SDMA_PCIE_REQ_TRACKING_UNC_ERR,
-       C_SDMA_CSR_PARITY_ERR,
-       C_SDMA_RPY_TAG_ERR,
-/* SendEgressErrStatus */
-       C_TX_READ_PIO_MEMORY_CSR_UNC_ERR,
-       C_TX_READ_SDMA_MEMORY_CSR_UNC_ERR,
-       C_TX_EGRESS_FIFO_COR_ERR,
-       C_TX_READ_PIO_MEMORY_COR_ERR,
-       C_TX_READ_SDMA_MEMORY_COR_ERR,
-       C_TX_SB_HDR_COR_ERR,
-       C_TX_CREDIT_OVERRUN_ERR,
-       C_TX_LAUNCH_FIFO8_COR_ERR,
-       C_TX_LAUNCH_FIFO7_COR_ERR,
-       C_TX_LAUNCH_FIFO6_COR_ERR,
-       C_TX_LAUNCH_FIFO5_COR_ERR,
-       C_TX_LAUNCH_FIFO4_COR_ERR,
-       C_TX_LAUNCH_FIFO3_COR_ERR,
-       C_TX_LAUNCH_FIFO2_COR_ERR,
-       C_TX_LAUNCH_FIFO1_COR_ERR,
-       C_TX_LAUNCH_FIFO0_COR_ERR,
-       C_TX_CREDIT_RETURN_VL_ERR,
-       C_TX_HCRC_INSERTION_ERR,
-       C_TX_EGRESS_FIFI_UNC_ERR,
-       C_TX_READ_PIO_MEMORY_UNC_ERR,
-       C_TX_READ_SDMA_MEMORY_UNC_ERR,
-       C_TX_SB_HDR_UNC_ERR,
-       C_TX_CREDIT_RETURN_PARITY_ERR,
-       C_TX_LAUNCH_FIFO8_UNC_OR_PARITY_ERR,
-       C_TX_LAUNCH_FIFO7_UNC_OR_PARITY_ERR,
-       C_TX_LAUNCH_FIFO6_UNC_OR_PARITY_ERR,
-       C_TX_LAUNCH_FIFO5_UNC_OR_PARITY_ERR,
-       C_TX_LAUNCH_FIFO4_UNC_OR_PARITY_ERR,
-       C_TX_LAUNCH_FIFO3_UNC_OR_PARITY_ERR,
-       C_TX_LAUNCH_FIFO2_UNC_OR_PARITY_ERR,
-       C_TX_LAUNCH_FIFO1_UNC_OR_PARITY_ERR,
-       C_TX_LAUNCH_FIFO0_UNC_OR_PARITY_ERR,
-       C_TX_SDMA15_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA14_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA13_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA12_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA11_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA10_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA9_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA8_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA7_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA6_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA5_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA4_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA3_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA2_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA1_DISALLOWED_PACKET_ERR,
-       C_TX_SDMA0_DISALLOWED_PACKET_ERR,
-       C_TX_CONFIG_PARITY_ERR,
-       C_TX_SBRD_CTL_CSR_PARITY_ERR,
-       C_TX_LAUNCH_CSR_PARITY_ERR,
-       C_TX_ILLEGAL_CL_ERR,
-       C_TX_SBRD_CTL_STATE_MACHINE_PARITY_ERR,
-       C_TX_RESERVED_10,
-       C_TX_RESERVED_9,
-       C_TX_SDMA_LAUNCH_INTF_PARITY_ERR,
-       C_TX_PIO_LAUNCH_INTF_PARITY_ERR,
-       C_TX_RESERVED_6,
-       C_TX_INCORRECT_LINK_STATE_ERR,
-       C_TX_LINK_DOWN_ERR,
-       C_TX_EGRESS_FIFO_UNDERRUN_OR_PARITY_ERR,
-       C_TX_RESERVED_2,
-       C_TX_PKT_INTEGRITY_MEM_UNC_ERR,
-       C_TX_PKT_INTEGRITY_MEM_COR_ERR,
-/* SendErrStatus */
-       C_SEND_CSR_WRITE_BAD_ADDR_ERR,
-       C_SEND_CSR_READ_BAD_ADD_ERR,
-       C_SEND_CSR_PARITY_ERR,
-/* SendCtxtErrStatus */
-       C_PIO_WRITE_OUT_OF_BOUNDS_ERR,
-       C_PIO_WRITE_OVERFLOW_ERR,
-       C_PIO_WRITE_CROSSES_BOUNDARY_ERR,
-       C_PIO_DISALLOWED_PACKET_ERR,
-       C_PIO_INCONSISTENT_SOP_ERR,
-/*SendDmaEngErrStatus */
-       C_SDMA_HEADER_REQUEST_FIFO_COR_ERR,
-       C_SDMA_HEADER_STORAGE_COR_ERR,
-       C_SDMA_PACKET_TRACKING_COR_ERR,
-       C_SDMA_ASSEMBLY_COR_ERR,
-       C_SDMA_DESC_TABLE_COR_ERR,
-       C_SDMA_HEADER_REQUEST_FIFO_UNC_ERR,
-       C_SDMA_HEADER_STORAGE_UNC_ERR,
-       C_SDMA_PACKET_TRACKING_UNC_ERR,
-       C_SDMA_ASSEMBLY_UNC_ERR,
-       C_SDMA_DESC_TABLE_UNC_ERR,
-       C_SDMA_TIMEOUT_ERR,
-       C_SDMA_HEADER_LENGTH_ERR,
-       C_SDMA_HEADER_ADDRESS_ERR,
-       C_SDMA_HEADER_SELECT_ERR,
-       C_SMDA_RESERVED_9,
-       C_SDMA_PACKET_DESC_OVERFLOW_ERR,
-       C_SDMA_LENGTH_MISMATCH_ERR,
-       C_SDMA_HALT_ERR,
-       C_SDMA_MEM_READ_ERR,
-       C_SDMA_FIRST_DESC_ERR,
-       C_SDMA_TAIL_OUT_OF_BOUNDS_ERR,
-       C_SDMA_TOO_LONG_ERR,
-       C_SDMA_GEN_MISMATCH_ERR,
-       C_SDMA_WRONG_DW_ERR,
-       DEV_CNTR_LAST  /* Must be kept last */
-};
-
-/* Per port counter indexes */
-enum {
-       C_TX_UNSUP_VL = 0,
-       C_TX_INVAL_LEN,
-       C_TX_MM_LEN_ERR,
-       C_TX_UNDERRUN,
-       C_TX_FLOW_STALL,
-       C_TX_DROPPED,
-       C_TX_HDR_ERR,
-       C_TX_PKT,
-       C_TX_WORDS,
-       C_TX_WAIT,
-       C_TX_FLIT_VL,
-       C_TX_PKT_VL,
-       C_TX_WAIT_VL,
-       C_RX_PKT,
-       C_RX_WORDS,
-       C_SW_LINK_DOWN,
-       C_SW_LINK_UP,
-       C_SW_UNKNOWN_FRAME,
-       C_SW_XMIT_DSCD,
-       C_SW_XMIT_DSCD_VL,
-       C_SW_XMIT_CSTR_ERR,
-       C_SW_RCV_CSTR_ERR,
-       C_SW_IBP_LOOP_PKTS,
-       C_SW_IBP_RC_RESENDS,
-       C_SW_IBP_RNR_NAKS,
-       C_SW_IBP_OTHER_NAKS,
-       C_SW_IBP_RC_TIMEOUTS,
-       C_SW_IBP_PKT_DROPS,
-       C_SW_IBP_DMA_WAIT,
-       C_SW_IBP_RC_SEQNAK,
-       C_SW_IBP_RC_DUPREQ,
-       C_SW_IBP_RDMA_SEQ,
-       C_SW_IBP_UNALIGNED,
-       C_SW_IBP_SEQ_NAK,
-       C_SW_CPU_RC_ACKS,
-       C_SW_CPU_RC_QACKS,
-       C_SW_CPU_RC_DELAYED_COMP,
-       C_RCV_HDR_OVF_0,
-       C_RCV_HDR_OVF_1,
-       C_RCV_HDR_OVF_2,
-       C_RCV_HDR_OVF_3,
-       C_RCV_HDR_OVF_4,
-       C_RCV_HDR_OVF_5,
-       C_RCV_HDR_OVF_6,
-       C_RCV_HDR_OVF_7,
-       C_RCV_HDR_OVF_8,
-       C_RCV_HDR_OVF_9,
-       C_RCV_HDR_OVF_10,
-       C_RCV_HDR_OVF_11,
-       C_RCV_HDR_OVF_12,
-       C_RCV_HDR_OVF_13,
-       C_RCV_HDR_OVF_14,
-       C_RCV_HDR_OVF_15,
-       C_RCV_HDR_OVF_16,
-       C_RCV_HDR_OVF_17,
-       C_RCV_HDR_OVF_18,
-       C_RCV_HDR_OVF_19,
-       C_RCV_HDR_OVF_20,
-       C_RCV_HDR_OVF_21,
-       C_RCV_HDR_OVF_22,
-       C_RCV_HDR_OVF_23,
-       C_RCV_HDR_OVF_24,
-       C_RCV_HDR_OVF_25,
-       C_RCV_HDR_OVF_26,
-       C_RCV_HDR_OVF_27,
-       C_RCV_HDR_OVF_28,
-       C_RCV_HDR_OVF_29,
-       C_RCV_HDR_OVF_30,
-       C_RCV_HDR_OVF_31,
-       C_RCV_HDR_OVF_32,
-       C_RCV_HDR_OVF_33,
-       C_RCV_HDR_OVF_34,
-       C_RCV_HDR_OVF_35,
-       C_RCV_HDR_OVF_36,
-       C_RCV_HDR_OVF_37,
-       C_RCV_HDR_OVF_38,
-       C_RCV_HDR_OVF_39,
-       C_RCV_HDR_OVF_40,
-       C_RCV_HDR_OVF_41,
-       C_RCV_HDR_OVF_42,
-       C_RCV_HDR_OVF_43,
-       C_RCV_HDR_OVF_44,
-       C_RCV_HDR_OVF_45,
-       C_RCV_HDR_OVF_46,
-       C_RCV_HDR_OVF_47,
-       C_RCV_HDR_OVF_48,
-       C_RCV_HDR_OVF_49,
-       C_RCV_HDR_OVF_50,
-       C_RCV_HDR_OVF_51,
-       C_RCV_HDR_OVF_52,
-       C_RCV_HDR_OVF_53,
-       C_RCV_HDR_OVF_54,
-       C_RCV_HDR_OVF_55,
-       C_RCV_HDR_OVF_56,
-       C_RCV_HDR_OVF_57,
-       C_RCV_HDR_OVF_58,
-       C_RCV_HDR_OVF_59,
-       C_RCV_HDR_OVF_60,
-       C_RCV_HDR_OVF_61,
-       C_RCV_HDR_OVF_62,
-       C_RCV_HDR_OVF_63,
-       C_RCV_HDR_OVF_64,
-       C_RCV_HDR_OVF_65,
-       C_RCV_HDR_OVF_66,
-       C_RCV_HDR_OVF_67,
-       C_RCV_HDR_OVF_68,
-       C_RCV_HDR_OVF_69,
-       C_RCV_HDR_OVF_70,
-       C_RCV_HDR_OVF_71,
-       C_RCV_HDR_OVF_72,
-       C_RCV_HDR_OVF_73,
-       C_RCV_HDR_OVF_74,
-       C_RCV_HDR_OVF_75,
-       C_RCV_HDR_OVF_76,
-       C_RCV_HDR_OVF_77,
-       C_RCV_HDR_OVF_78,
-       C_RCV_HDR_OVF_79,
-       C_RCV_HDR_OVF_80,
-       C_RCV_HDR_OVF_81,
-       C_RCV_HDR_OVF_82,
-       C_RCV_HDR_OVF_83,
-       C_RCV_HDR_OVF_84,
-       C_RCV_HDR_OVF_85,
-       C_RCV_HDR_OVF_86,
-       C_RCV_HDR_OVF_87,
-       C_RCV_HDR_OVF_88,
-       C_RCV_HDR_OVF_89,
-       C_RCV_HDR_OVF_90,
-       C_RCV_HDR_OVF_91,
-       C_RCV_HDR_OVF_92,
-       C_RCV_HDR_OVF_93,
-       C_RCV_HDR_OVF_94,
-       C_RCV_HDR_OVF_95,
-       C_RCV_HDR_OVF_96,
-       C_RCV_HDR_OVF_97,
-       C_RCV_HDR_OVF_98,
-       C_RCV_HDR_OVF_99,
-       C_RCV_HDR_OVF_100,
-       C_RCV_HDR_OVF_101,
-       C_RCV_HDR_OVF_102,
-       C_RCV_HDR_OVF_103,
-       C_RCV_HDR_OVF_104,
-       C_RCV_HDR_OVF_105,
-       C_RCV_HDR_OVF_106,
-       C_RCV_HDR_OVF_107,
-       C_RCV_HDR_OVF_108,
-       C_RCV_HDR_OVF_109,
-       C_RCV_HDR_OVF_110,
-       C_RCV_HDR_OVF_111,
-       C_RCV_HDR_OVF_112,
-       C_RCV_HDR_OVF_113,
-       C_RCV_HDR_OVF_114,
-       C_RCV_HDR_OVF_115,
-       C_RCV_HDR_OVF_116,
-       C_RCV_HDR_OVF_117,
-       C_RCV_HDR_OVF_118,
-       C_RCV_HDR_OVF_119,
-       C_RCV_HDR_OVF_120,
-       C_RCV_HDR_OVF_121,
-       C_RCV_HDR_OVF_122,
-       C_RCV_HDR_OVF_123,
-       C_RCV_HDR_OVF_124,
-       C_RCV_HDR_OVF_125,
-       C_RCV_HDR_OVF_126,
-       C_RCV_HDR_OVF_127,
-       C_RCV_HDR_OVF_128,
-       C_RCV_HDR_OVF_129,
-       C_RCV_HDR_OVF_130,
-       C_RCV_HDR_OVF_131,
-       C_RCV_HDR_OVF_132,
-       C_RCV_HDR_OVF_133,
-       C_RCV_HDR_OVF_134,
-       C_RCV_HDR_OVF_135,
-       C_RCV_HDR_OVF_136,
-       C_RCV_HDR_OVF_137,
-       C_RCV_HDR_OVF_138,
-       C_RCV_HDR_OVF_139,
-       C_RCV_HDR_OVF_140,
-       C_RCV_HDR_OVF_141,
-       C_RCV_HDR_OVF_142,
-       C_RCV_HDR_OVF_143,
-       C_RCV_HDR_OVF_144,
-       C_RCV_HDR_OVF_145,
-       C_RCV_HDR_OVF_146,
-       C_RCV_HDR_OVF_147,
-       C_RCV_HDR_OVF_148,
-       C_RCV_HDR_OVF_149,
-       C_RCV_HDR_OVF_150,
-       C_RCV_HDR_OVF_151,
-       C_RCV_HDR_OVF_152,
-       C_RCV_HDR_OVF_153,
-       C_RCV_HDR_OVF_154,
-       C_RCV_HDR_OVF_155,
-       C_RCV_HDR_OVF_156,
-       C_RCV_HDR_OVF_157,
-       C_RCV_HDR_OVF_158,
-       C_RCV_HDR_OVF_159,
-       PORT_CNTR_LAST /* Must be kept last */
-};
-
-u64 get_all_cpu_total(u64 __percpu *cntr);
-void hfi1_start_cleanup(struct hfi1_devdata *dd);
-void hfi1_clear_tids(struct hfi1_ctxtdata *rcd);
-struct hfi1_message_header *hfi1_get_msgheader(
-                               struct hfi1_devdata *dd, __le32 *rhf_addr);
-int hfi1_get_base_kinfo(struct hfi1_ctxtdata *rcd,
-                       struct hfi1_ctxt_info *kinfo);
-u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir,
-                 u32 mask);
-int hfi1_init_ctxt(struct send_context *sc);
-void hfi1_put_tid(struct hfi1_devdata *dd, u32 index,
-                 u32 type, unsigned long pa, u16 order);
-void hfi1_quiet_serdes(struct hfi1_pportdata *ppd);
-void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt);
-u32 hfi1_read_cntrs(struct hfi1_devdata *dd, char **namep, u64 **cntrp);
-u32 hfi1_read_portcntrs(struct hfi1_pportdata *ppd, char **namep, u64 **cntrp);
-u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd);
-int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which);
-int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val);
-int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt, u16 jkey);
-int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt);
-int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey);
-int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt);
-void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality);
-
-/*
- * Interrupt source table.
- *
- * Each entry is an interrupt source "type".  It is ordered by increasing
- * number.
- */
-struct is_table {
-       int start;       /* interrupt source type start */
-       int end;         /* interrupt source type end */
-       /* routine that returns the name of the interrupt source */
-       char *(*is_name)(char *name, size_t size, unsigned int source);
-       /* routine to call when receiving an interrupt */
-       void (*is_int)(struct hfi1_devdata *dd, unsigned int source);
-};
-
-#endif /* _CHIP_H */
diff --git a/drivers/staging/rdma/hfi1/chip_registers.h b/drivers/staging/rdma/hfi1/chip_registers.h
deleted file mode 100644 (file)
index 8744de6..0000000
+++ /dev/null
@@ -1,1307 +0,0 @@
-#ifndef DEF_CHIP_REG
-#define DEF_CHIP_REG
-
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#define CORE           0x000000000000
-#define CCE                    (CORE + 0x000000000000)
-#define ASIC           (CORE + 0x000000400000)
-#define MISC           (CORE + 0x000000500000)
-#define DC_TOP_CSRS            (CORE + 0x000000600000)
-#define CHIP_DEBUG             (CORE + 0x000000700000)
-#define RXE                    (CORE + 0x000001000000)
-#define TXE                    (CORE + 0x000001800000)
-#define DCC_CSRS               (DC_TOP_CSRS + 0x000000000000)
-#define DC_LCB_CSRS            (DC_TOP_CSRS + 0x000000001000)
-#define DC_8051_CSRS           (DC_TOP_CSRS + 0x000000002000)
-#define PCIE           0
-
-#define ASIC_NUM_SCRATCH 4
-#define CCE_ERR_INT_CNT 0
-#define CCE_MISC_INT_CNT 2
-#define CCE_NUM_32_BIT_COUNTERS 3
-#define CCE_NUM_32_BIT_INT_COUNTERS 6
-#define CCE_NUM_INT_CSRS 12
-#define CCE_NUM_INT_MAP_CSRS 96
-#define CCE_NUM_MSIX_PBAS 4
-#define CCE_NUM_MSIX_VECTORS 256
-#define CCE_NUM_SCRATCH 4
-#define CCE_PCIE_POSTED_CRDT_STALL_CNT 2
-#define CCE_PCIE_TRGT_STALL_CNT 0
-#define CCE_PIO_WR_STALL_CNT 1
-#define CCE_RCV_AVAIL_INT_CNT 3
-#define CCE_RCV_URGENT_INT_CNT 4
-#define CCE_SDMA_INT_CNT 1
-#define CCE_SEND_CREDIT_INT_CNT 5
-#define DCC_CFG_LED_CNTRL (DCC_CSRS + 0x000000000040)
-#define DCC_CFG_LED_CNTRL_LED_CNTRL_SMASK 0x10ull
-#define DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SHIFT 0
-#define DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SMASK 0xFull
-#define DCC_CFG_PORT_CONFIG (DCC_CSRS + 0x000000000008)
-#define DCC_CFG_PORT_CONFIG1 (DCC_CSRS + 0x000000000010)
-#define DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK 0xFFFFull
-#define DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT 16
-#define DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK 0xFFFF0000ull
-#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK 0xFFFFull
-#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT 0
-#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK 0xFFFFull
-#define DCC_CFG_PORT_CONFIG_LINK_STATE_MASK 0x7ull
-#define DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT 48
-#define DCC_CFG_PORT_CONFIG_LINK_STATE_SMASK 0x7000000000000ull
-#define DCC_CFG_PORT_CONFIG_MTU_CAP_MASK 0x7ull
-#define DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT 32
-#define DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK 0x700000000ull
-#define DCC_CFG_RESET (DCC_CSRS + 0x000000000000)
-#define DCC_CFG_RESET_RESET_LCB_SHIFT 0
-#define DCC_CFG_RESET_RESET_RX_FPE_SHIFT 2
-#define DCC_CFG_SC_VL_TABLE_15_0 (DCC_CSRS + 0x000000000028)
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY0_SHIFT 0
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY10_SHIFT 40
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY11_SHIFT 44
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY12_SHIFT 48
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY13_SHIFT 52
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY14_SHIFT 56
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY15_SHIFT 60
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY1_SHIFT 4
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY2_SHIFT 8
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY3_SHIFT 12
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY4_SHIFT 16
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY5_SHIFT 20
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY6_SHIFT 24
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY7_SHIFT 28
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY8_SHIFT 32
-#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY9_SHIFT 36
-#define DCC_CFG_SC_VL_TABLE_31_16 (DCC_CSRS + 0x000000000030)
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY16_SHIFT 0
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY17_SHIFT 4
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY18_SHIFT 8
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY19_SHIFT 12
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY20_SHIFT 16
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY21_SHIFT 20
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY22_SHIFT 24
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY23_SHIFT 28
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY24_SHIFT 32
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY25_SHIFT 36
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY26_SHIFT 40
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY27_SHIFT 44
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY28_SHIFT 48
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY29_SHIFT 52
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY30_SHIFT 56
-#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY31_SHIFT 60
-#define DCC_ERR_DROPPED_PKT_CNT (DCC_CSRS + 0x000000000120)
-#define DCC_ERR_FLG (DCC_CSRS + 0x000000000050)
-#define DCC_ERR_FLG_BAD_CRDT_ACK_ERR_SMASK 0x4000ull
-#define DCC_ERR_FLG_BAD_CTRL_DIST_ERR_SMASK 0x200000ull
-#define DCC_ERR_FLG_BAD_CTRL_FLIT_ERR_SMASK 0x10000ull
-#define DCC_ERR_FLG_BAD_DLID_TARGET_ERR_SMASK 0x200ull
-#define DCC_ERR_FLG_BAD_HEAD_DIST_ERR_SMASK 0x800000ull
-#define DCC_ERR_FLG_BAD_L2_ERR_SMASK 0x2ull
-#define DCC_ERR_FLG_BAD_LVER_ERR_SMASK 0x400ull
-#define DCC_ERR_FLG_BAD_MID_TAIL_ERR_SMASK 0x8ull
-#define DCC_ERR_FLG_BAD_PKT_LENGTH_ERR_SMASK 0x4000000ull
-#define DCC_ERR_FLG_BAD_PREEMPTION_ERR_SMASK 0x10ull
-#define DCC_ERR_FLG_BAD_SC_ERR_SMASK 0x4ull
-#define DCC_ERR_FLG_BAD_TAIL_DIST_ERR_SMASK 0x400000ull
-#define DCC_ERR_FLG_BAD_VL_MARKER_ERR_SMASK 0x80ull
-#define DCC_ERR_FLG_CLR (DCC_CSRS + 0x000000000060)
-#define DCC_ERR_FLG_CSR_ACCESS_BLOCKED_HOST_SMASK 0x8000000000ull
-#define DCC_ERR_FLG_CSR_ACCESS_BLOCKED_UC_SMASK 0x10000000000ull
-#define DCC_ERR_FLG_CSR_INVAL_ADDR_SMASK 0x400000000000ull
-#define DCC_ERR_FLG_CSR_PARITY_ERR_SMASK 0x200000000000ull
-#define DCC_ERR_FLG_DLID_ZERO_ERR_SMASK 0x40000000ull
-#define DCC_ERR_FLG_EN (DCC_CSRS + 0x000000000058)
-#define DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK 0x8000000000ull
-#define DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK 0x10000000000ull
-#define DCC_ERR_FLG_EVENT_CNTR_PARITY_ERR_SMASK 0x20000ull
-#define DCC_ERR_FLG_EVENT_CNTR_ROLLOVER_ERR_SMASK 0x40000ull
-#define DCC_ERR_FLG_FMCONFIG_ERR_SMASK 0x40000000000000ull
-#define DCC_ERR_FLG_FPE_TX_FIFO_OVFLW_ERR_SMASK 0x2000000000ull
-#define DCC_ERR_FLG_FPE_TX_FIFO_UNFLW_ERR_SMASK 0x4000000000ull
-#define DCC_ERR_FLG_LATE_EBP_ERR_SMASK 0x1000000000ull
-#define DCC_ERR_FLG_LATE_LONG_ERR_SMASK 0x800000000ull
-#define DCC_ERR_FLG_LATE_SHORT_ERR_SMASK 0x400000000ull
-#define DCC_ERR_FLG_LENGTH_MTU_ERR_SMASK 0x80000000ull
-#define DCC_ERR_FLG_LINK_ERR_SMASK 0x80000ull
-#define DCC_ERR_FLG_MISC_CNTR_ROLLOVER_ERR_SMASK 0x100000ull
-#define DCC_ERR_FLG_NONVL15_STATE_ERR_SMASK 0x1000000ull
-#define DCC_ERR_FLG_PERM_NVL15_ERR_SMASK 0x10000000ull
-#define DCC_ERR_FLG_PREEMPTION_ERR_SMASK 0x20ull
-#define DCC_ERR_FLG_PREEMPTIONVL15_ERR_SMASK 0x40ull
-#define DCC_ERR_FLG_RCVPORT_ERR_SMASK 0x80000000000000ull
-#define DCC_ERR_FLG_RX_BYTE_SHFT_PARITY_ERR_SMASK 0x1000000000000ull
-#define DCC_ERR_FLG_RX_CTRL_PARITY_MBE_ERR_SMASK 0x100000000000ull
-#define DCC_ERR_FLG_RX_EARLY_DROP_ERR_SMASK 0x200000000ull
-#define DCC_ERR_FLG_SLID_ZERO_ERR_SMASK 0x20000000ull
-#define DCC_ERR_FLG_TX_BYTE_SHFT_PARITY_ERR_SMASK 0x800000000000ull
-#define DCC_ERR_FLG_TX_CTRL_PARITY_ERR_SMASK 0x20000000000ull
-#define DCC_ERR_FLG_TX_CTRL_PARITY_MBE_ERR_SMASK 0x40000000000ull
-#define DCC_ERR_FLG_TX_SC_PARITY_ERR_SMASK 0x80000000000ull
-#define DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK 0x2000ull
-#define DCC_ERR_FLG_UNSUP_PKT_TYPE_SMASK 0x8000ull
-#define DCC_ERR_FLG_UNSUP_VL_ERR_SMASK 0x8000000ull
-#define DCC_ERR_FLG_VL15_MULTI_ERR_SMASK 0x2000000ull
-#define DCC_ERR_FMCONFIG_ERR_CNT (DCC_CSRS + 0x000000000110)
-#define DCC_ERR_INFO_FMCONFIG (DCC_CSRS + 0x000000000090)
-#define DCC_ERR_INFO_PORTRCV (DCC_CSRS + 0x000000000078)
-#define DCC_ERR_INFO_PORTRCV_HDR0 (DCC_CSRS + 0x000000000080)
-#define DCC_ERR_INFO_PORTRCV_HDR1 (DCC_CSRS + 0x000000000088)
-#define DCC_ERR_INFO_UNCORRECTABLE (DCC_CSRS + 0x000000000098)
-#define DCC_ERR_PORTRCV_ERR_CNT (DCC_CSRS + 0x000000000108)
-#define DCC_ERR_RCVREMOTE_PHY_ERR_CNT (DCC_CSRS + 0x000000000118)
-#define DCC_ERR_UNCORRECTABLE_CNT (DCC_CSRS + 0x000000000100)
-#define DCC_PRF_PORT_MARK_FECN_CNT (DCC_CSRS + 0x000000000330)
-#define DCC_PRF_PORT_RCV_BECN_CNT (DCC_CSRS + 0x000000000290)
-#define DCC_PRF_PORT_RCV_BUBBLE_CNT (DCC_CSRS + 0x0000000002E0)
-#define DCC_PRF_PORT_RCV_CORRECTABLE_CNT (DCC_CSRS + 0x000000000140)
-#define DCC_PRF_PORT_RCV_DATA_CNT (DCC_CSRS + 0x000000000198)
-#define DCC_PRF_PORT_RCV_FECN_CNT (DCC_CSRS + 0x000000000240)
-#define DCC_PRF_PORT_RCV_MULTICAST_PKT_CNT (DCC_CSRS + 0x000000000130)
-#define DCC_PRF_PORT_RCV_PKTS_CNT (DCC_CSRS + 0x0000000001A8)
-#define DCC_PRF_PORT_VL_MARK_FECN_CNT (DCC_CSRS + 0x000000000338)
-#define DCC_PRF_PORT_VL_RCV_BECN_CNT (DCC_CSRS + 0x000000000298)
-#define DCC_PRF_PORT_VL_RCV_BUBBLE_CNT (DCC_CSRS + 0x0000000002E8)
-#define DCC_PRF_PORT_VL_RCV_DATA_CNT (DCC_CSRS + 0x0000000001B0)
-#define DCC_PRF_PORT_VL_RCV_FECN_CNT (DCC_CSRS + 0x000000000248)
-#define DCC_PRF_PORT_VL_RCV_PKTS_CNT (DCC_CSRS + 0x0000000001F8)
-#define DCC_PRF_PORT_XMIT_CORRECTABLE_CNT (DCC_CSRS + 0x000000000138)
-#define DCC_PRF_PORT_XMIT_DATA_CNT (DCC_CSRS + 0x000000000190)
-#define DCC_PRF_PORT_XMIT_MULTICAST_CNT (DCC_CSRS + 0x000000000128)
-#define DCC_PRF_PORT_XMIT_PKTS_CNT (DCC_CSRS + 0x0000000001A0)
-#define DCC_PRF_RX_FLOW_CRTL_CNT (DCC_CSRS + 0x000000000180)
-#define DCC_PRF_TX_FLOW_CRTL_CNT (DCC_CSRS + 0x000000000188)
-#define DC_DC8051_CFG_CSR_ACCESS_SEL (DC_8051_CSRS + 0x000000000110)
-#define DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK 0x2ull
-#define DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK 0x1ull
-#define DC_DC8051_CFG_EXT_DEV_0 (DC_8051_CSRS + 0x000000000118)
-#define DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK 0x1ull
-#define DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT 8
-#define DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT 16
-#define DC_DC8051_CFG_EXT_DEV_1 (DC_8051_CSRS + 0x000000000120)
-#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_MASK 0xFFFFull
-#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT 16
-#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SMASK 0xFFFF0000ull
-#define DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK 0x1ull
-#define DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_MASK 0xFFull
-#define DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_SHIFT 8
-#define DC_DC8051_CFG_HOST_CMD_0 (DC_8051_CSRS + 0x000000000028)
-#define DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_MASK 0xFFFFFFFFFFFFull
-#define DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_SHIFT 16
-#define DC_DC8051_CFG_HOST_CMD_0_REQ_NEW_SMASK 0x1ull
-#define DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_MASK 0xFFull
-#define DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_SHIFT 8
-#define DC_DC8051_CFG_HOST_CMD_1 (DC_8051_CSRS + 0x000000000030)
-#define DC_DC8051_CFG_HOST_CMD_1_COMPLETED_SMASK 0x1ull
-#define DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_MASK 0xFFull
-#define DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_SHIFT 8
-#define DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_MASK 0xFFFFFFFFFFFFull
-#define DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_SHIFT 16
-#define DC_DC8051_CFG_LOCAL_GUID (DC_8051_CSRS + 0x000000000038)
-#define DC_DC8051_CFG_MODE (DC_8051_CSRS + 0x000000000070)
-#define DC_DC8051_CFG_RAM_ACCESS_CTRL (DC_8051_CSRS + 0x000000000008)
-#define DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK 0x7FFFull
-#define DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT 0
-#define DC_DC8051_CFG_RAM_ACCESS_CTRL_WRITE_ENA_SMASK 0x1000000ull
-#define DC_DC8051_CFG_RAM_ACCESS_CTRL_READ_ENA_SMASK 0x10000ull
-#define DC_DC8051_CFG_RAM_ACCESS_SETUP (DC_8051_CSRS + 0x000000000000)
-#define DC_DC8051_CFG_RAM_ACCESS_SETUP_AUTO_INCR_ADDR_SMASK 0x100ull
-#define DC_DC8051_CFG_RAM_ACCESS_SETUP_RAM_SEL_SMASK 0x1ull
-#define DC_DC8051_CFG_RAM_ACCESS_STATUS (DC_8051_CSRS + 0x000000000018)
-#define DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK 0x10000ull
-#define DC_DC8051_CFG_RAM_ACCESS_WR_DATA (DC_8051_CSRS + 0x000000000010)
-#define DC_DC8051_CFG_RAM_ACCESS_RD_DATA (DC_8051_CSRS + 0x000000000020)
-#define DC_DC8051_CFG_RST (DC_8051_CSRS + 0x000000000068)
-#define DC_DC8051_CFG_RST_CRAM_SMASK 0x2ull
-#define DC_DC8051_CFG_RST_DRAM_SMASK 0x4ull
-#define DC_DC8051_CFG_RST_IRAM_SMASK 0x8ull
-#define DC_DC8051_CFG_RST_M8051W_SMASK 0x1ull
-#define DC_DC8051_CFG_RST_SFR_SMASK 0x10ull
-#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051 (DC_8051_CSRS + 0x0000000000D8)
-#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_MASK 0xFFFFFFFFull
-#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_SHIFT 16
-#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_MASK 0xFFFFull
-#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_SHIFT 0
-#define DC_DC8051_ERR_CLR (DC_8051_CSRS + 0x0000000000E8)
-#define DC_DC8051_ERR_EN (DC_8051_CSRS + 0x0000000000F0)
-#define DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK 0x2ull
-#define DC_DC8051_ERR_FLG (DC_8051_CSRS + 0x0000000000E0)
-#define DC_DC8051_ERR_FLG_CRAM_MBE_SMASK 0x4ull
-#define DC_DC8051_ERR_FLG_CRAM_SBE_SMASK 0x8ull
-#define DC_DC8051_ERR_FLG_DRAM_MBE_SMASK 0x10ull
-#define DC_DC8051_ERR_FLG_DRAM_SBE_SMASK 0x20ull
-#define DC_DC8051_ERR_FLG_INVALID_CSR_ADDR_SMASK 0x400ull
-#define DC_DC8051_ERR_FLG_IRAM_MBE_SMASK 0x40ull
-#define DC_DC8051_ERR_FLG_IRAM_SBE_SMASK 0x80ull
-#define DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK 0x2ull
-#define DC_DC8051_ERR_FLG_SET_BY_8051_SMASK 0x1ull
-#define DC_DC8051_ERR_FLG_UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES_SMASK 0x100ull
-#define DC_DC8051_STS_CUR_STATE (DC_8051_CSRS + 0x000000000060)
-#define DC_DC8051_STS_CUR_STATE_FIRMWARE_MASK 0xFFull
-#define DC_DC8051_STS_CUR_STATE_FIRMWARE_SHIFT 16
-#define DC_DC8051_STS_CUR_STATE_PORT_MASK 0xFFull
-#define DC_DC8051_STS_CUR_STATE_PORT_SHIFT 0
-#define DC_DC8051_STS_LOCAL_FM_SECURITY (DC_8051_CSRS + 0x000000000050)
-#define DC_DC8051_STS_LOCAL_FM_SECURITY_DISABLED_MASK 0x1ull
-#define DC_DC8051_STS_REMOTE_FM_SECURITY (DC_8051_CSRS + 0x000000000058)
-#define DC_DC8051_STS_REMOTE_GUID (DC_8051_CSRS + 0x000000000040)
-#define DC_DC8051_STS_REMOTE_NODE_TYPE (DC_8051_CSRS + 0x000000000048)
-#define DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK 0x3ull
-#define DC_DC8051_STS_REMOTE_PORT_NO (DC_8051_CSRS + 0x000000000130)
-#define DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK 0xFFull
-#define DC_LCB_CFG_ALLOW_LINK_UP (DC_LCB_CSRS + 0x000000000128)
-#define DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT 0
-#define DC_LCB_CFG_CRC_MODE (DC_LCB_CSRS + 0x000000000058)
-#define DC_LCB_CFG_CRC_MODE_TX_VAL_SHIFT 0
-#define DC_LCB_CFG_IGNORE_LOST_RCLK (DC_LCB_CSRS + 0x000000000020)
-#define DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK 0x1ull
-#define DC_LCB_CFG_LANE_WIDTH (DC_LCB_CSRS + 0x000000000100)
-#define DC_LCB_CFG_LINK_KILL_EN (DC_LCB_CSRS + 0x000000000120)
-#define DC_LCB_CFG_LINK_KILL_EN_FLIT_INPUT_BUF_MBE_SMASK 0x100000ull
-#define DC_LCB_CFG_LINK_KILL_EN_REPLAY_BUF_MBE_SMASK 0x400000ull
-#define DC_LCB_CFG_LN_DCLK (DC_LCB_CSRS + 0x000000000060)
-#define DC_LCB_CFG_LOOPBACK (DC_LCB_CSRS + 0x0000000000F8)
-#define DC_LCB_CFG_LOOPBACK_VAL_SHIFT 0
-#define DC_LCB_CFG_RUN (DC_LCB_CSRS + 0x000000000000)
-#define DC_LCB_CFG_RUN_EN_SHIFT 0
-#define DC_LCB_CFG_RX_FIFOS_RADR (DC_LCB_CSRS + 0x000000000018)
-#define DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT 8
-#define DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT 4
-#define DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT 0
-#define DC_LCB_CFG_TX_FIFOS_RADR (DC_LCB_CSRS + 0x000000000010)
-#define DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT 0
-#define DC_LCB_CFG_TX_FIFOS_RESET (DC_LCB_CSRS + 0x000000000008)
-#define DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT 0
-#define DC_LCB_CFG_REINIT_AS_SLAVE (DC_LCB_CSRS + 0x000000000030)
-#define DC_LCB_CFG_CNT_FOR_SKIP_STALL (DC_LCB_CSRS + 0x000000000040)
-#define DC_LCB_CFG_CLK_CNTR (DC_LCB_CSRS + 0x000000000110)
-#define DC_LCB_ERR_CLR (DC_LCB_CSRS + 0x000000000308)
-#define DC_LCB_ERR_EN (DC_LCB_CSRS + 0x000000000310)
-#define DC_LCB_ERR_FLG (DC_LCB_CSRS + 0x000000000300)
-#define DC_LCB_ERR_FLG_REDUNDANT_FLIT_PARITY_ERR_SMASK 0x20000000ull
-#define DC_LCB_ERR_FLG_NEG_EDGE_LINK_TRANSFER_ACTIVE_SMASK 0x10000000ull
-#define DC_LCB_ERR_FLG_HOLD_REINIT_SMASK 0x8000000ull
-#define DC_LCB_ERR_FLG_RST_FOR_INCOMPLT_RND_TRIP_SMASK 0x4000000ull
-#define DC_LCB_ERR_FLG_RST_FOR_LINK_TIMEOUT_SMASK 0x2000000ull
-#define DC_LCB_ERR_FLG_CREDIT_RETURN_FLIT_MBE_SMASK 0x1000000ull
-#define DC_LCB_ERR_FLG_REPLAY_BUF_SBE_SMASK 0x800000ull
-#define DC_LCB_ERR_FLG_REPLAY_BUF_MBE_SMASK 0x400000ull
-#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_SBE_SMASK 0x200000ull
-#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_MBE_SMASK 0x100000ull
-#define DC_LCB_ERR_FLG_VL_ACK_INPUT_WRONG_CRC_MODE_SMASK 0x80000ull
-#define DC_LCB_ERR_FLG_VL_ACK_INPUT_PARITY_ERR_SMASK 0x40000ull
-#define DC_LCB_ERR_FLG_VL_ACK_INPUT_BUF_OFLW_SMASK 0x20000ull
-#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_OFLW_SMASK 0x10000ull
-#define DC_LCB_ERR_FLG_ILLEGAL_FLIT_ENCODING_SMASK 0x8000ull
-#define DC_LCB_ERR_FLG_ILLEGAL_NULL_LTP_SMASK 0x4000ull
-#define DC_LCB_ERR_FLG_UNEXPECTED_ROUND_TRIP_MARKER_SMASK 0x2000ull
-#define DC_LCB_ERR_FLG_UNEXPECTED_REPLAY_MARKER_SMASK 0x1000ull
-#define DC_LCB_ERR_FLG_RCLK_STOPPED_SMASK 0x800ull
-#define DC_LCB_ERR_FLG_CRC_ERR_CNT_HIT_LIMIT_SMASK 0x400ull
-#define DC_LCB_ERR_FLG_REINIT_FOR_LN_DEGRADE_SMASK 0x200ull
-#define DC_LCB_ERR_FLG_REINIT_FROM_PEER_SMASK 0x100ull
-#define DC_LCB_ERR_FLG_SEQ_CRC_ERR_SMASK 0x80ull
-#define DC_LCB_ERR_FLG_RX_LESS_THAN_FOUR_LNS_SMASK 0x40ull
-#define DC_LCB_ERR_FLG_TX_LESS_THAN_FOUR_LNS_SMASK 0x20ull
-#define DC_LCB_ERR_FLG_LOST_REINIT_STALL_OR_TOS_SMASK 0x10ull
-#define DC_LCB_ERR_FLG_ALL_LNS_FAILED_REINIT_TEST_SMASK 0x8ull
-#define DC_LCB_ERR_FLG_RST_FOR_FAILED_DESKEW_SMASK 0x4ull
-#define DC_LCB_ERR_FLG_INVALID_CSR_ADDR_SMASK 0x2ull
-#define DC_LCB_ERR_FLG_CSR_PARITY_ERR_SMASK 0x1ull
-#define DC_LCB_ERR_INFO_CRC_ERR_LN0 (DC_LCB_CSRS + 0x000000000328)
-#define DC_LCB_ERR_INFO_CRC_ERR_LN1 (DC_LCB_CSRS + 0x000000000330)
-#define DC_LCB_ERR_INFO_CRC_ERR_LN2 (DC_LCB_CSRS + 0x000000000338)
-#define DC_LCB_ERR_INFO_CRC_ERR_LN3 (DC_LCB_CSRS + 0x000000000340)
-#define DC_LCB_ERR_INFO_CRC_ERR_MULTI_LN (DC_LCB_CSRS + 0x000000000348)
-#define DC_LCB_ERR_INFO_ESCAPE_0_ONLY_CNT (DC_LCB_CSRS + 0x000000000368)
-#define DC_LCB_ERR_INFO_ESCAPE_0_PLUS1_CNT (DC_LCB_CSRS + 0x000000000370)
-#define DC_LCB_ERR_INFO_ESCAPE_0_PLUS2_CNT (DC_LCB_CSRS + 0x000000000378)
-#define DC_LCB_ERR_INFO_MISC_FLG_CNT (DC_LCB_CSRS + 0x000000000390)
-#define DC_LCB_ERR_INFO_REINIT_FROM_PEER_CNT (DC_LCB_CSRS + 0x000000000380)
-#define DC_LCB_ERR_INFO_RX_REPLAY_CNT (DC_LCB_CSRS + 0x000000000358)
-#define DC_LCB_ERR_INFO_SBE_CNT (DC_LCB_CSRS + 0x000000000388)
-#define DC_LCB_ERR_INFO_SEQ_CRC_CNT (DC_LCB_CSRS + 0x000000000360)
-#define DC_LCB_ERR_INFO_TOTAL_CRC_ERR (DC_LCB_CSRS + 0x000000000320)
-#define DC_LCB_ERR_INFO_TX_REPLAY_CNT (DC_LCB_CSRS + 0x000000000350)
-#define DC_LCB_PG_DBG_FLIT_CRDTS_CNT (DC_LCB_CSRS + 0x000000000580)
-#define DC_LCB_PG_STS_PAUSE_COMPLETE_CNT (DC_LCB_CSRS + 0x0000000005F8)
-#define DC_LCB_PG_STS_TX_MBE_CNT (DC_LCB_CSRS + 0x000000000608)
-#define DC_LCB_PG_STS_TX_SBE_CNT (DC_LCB_CSRS + 0x000000000600)
-#define DC_LCB_PRF_ACCEPTED_LTP_CNT (DC_LCB_CSRS + 0x000000000408)
-#define DC_LCB_PRF_CLK_CNTR (DC_LCB_CSRS + 0x000000000420)
-#define DC_LCB_PRF_GOOD_LTP_CNT (DC_LCB_CSRS + 0x000000000400)
-#define DC_LCB_PRF_RX_FLIT_CNT (DC_LCB_CSRS + 0x000000000410)
-#define DC_LCB_PRF_TX_FLIT_CNT (DC_LCB_CSRS + 0x000000000418)
-#define DC_LCB_STS_LINK_TRANSFER_ACTIVE (DC_LCB_CSRS + 0x000000000468)
-#define DC_LCB_STS_ROUND_TRIP_LTP_CNT (DC_LCB_CSRS + 0x0000000004B0)
-#define RCV_BUF_OVFL_CNT 10
-#define RCV_CONTEXT_EGR_STALL 22
-#define RCV_DATA_PKT_CNT 0
-#define RCV_DWORD_CNT 1
-#define RCV_TID_FLOW_GEN_MISMATCH_CNT 20
-#define RCV_TID_FLOW_SEQ_MISMATCH_CNT 23
-#define RCV_TID_FULL_ERR_CNT 18
-#define RCV_TID_VALID_ERR_CNT 19
-#define RXE_NUM_32_BIT_COUNTERS 24
-#define RXE_NUM_64_BIT_COUNTERS 2
-#define RXE_NUM_RSM_INSTANCES 4
-#define RXE_NUM_TID_FLOWS 32
-#define RXE_PER_CONTEXT_OFFSET 0x0300000
-#define SEND_DATA_PKT_CNT 0
-#define SEND_DATA_PKT_VL0_CNT 12
-#define SEND_DATA_VL0_CNT 3
-#define SEND_DROPPED_PKT_CNT 5
-#define SEND_DWORD_CNT 1
-#define SEND_FLOW_STALL_CNT 4
-#define SEND_HEADERS_ERR_CNT 6
-#define SEND_LEN_ERR_CNT 1
-#define SEND_MAX_MIN_LEN_ERR_CNT 2
-#define SEND_UNDERRUN_CNT 3
-#define SEND_UNSUP_VL_ERR_CNT 0
-#define SEND_WAIT_CNT 2
-#define SEND_WAIT_VL0_CNT 21
-#define TXE_PIO_SEND_OFFSET 0x0800000
-#define ASIC_CFG_DRV_STR (ASIC + 0x000000000048)
-#define ASIC_CFG_MUTEX (ASIC + 0x000000000040)
-#define ASIC_CFG_SBUS_EXECUTE (ASIC + 0x000000000008)
-#define ASIC_CFG_SBUS_EXECUTE_EXECUTE_SMASK 0x1ull
-#define ASIC_CFG_SBUS_EXECUTE_FAST_MODE_SMASK 0x2ull
-#define ASIC_CFG_SBUS_REQUEST (ASIC + 0x000000000000)
-#define ASIC_CFG_SBUS_REQUEST_COMMAND_SHIFT 16
-#define ASIC_CFG_SBUS_REQUEST_DATA_ADDR_SHIFT 8
-#define ASIC_CFG_SBUS_REQUEST_DATA_IN_SHIFT 32
-#define ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT 0
-#define ASIC_CFG_SCRATCH (ASIC + 0x000000000020)
-#define ASIC_CFG_THERM_POLL_EN (ASIC + 0x000000000050)
-#define ASIC_EEP_ADDR_CMD (ASIC + 0x000000000308)
-#define ASIC_EEP_ADDR_CMD_EP_ADDR_MASK 0xFFFFFFull
-#define ASIC_EEP_CTL_STAT (ASIC + 0x000000000300)
-#define ASIC_EEP_CTL_STAT_EP_RESET_SMASK 0x4ull
-#define ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT 8
-#define ASIC_EEP_CTL_STAT_RESETCSR 0x0000000083818000ull
-#define ASIC_EEP_DATA (ASIC + 0x000000000310)
-#define ASIC_GPIO_CLEAR (ASIC + 0x000000000230)
-#define ASIC_GPIO_FORCE (ASIC + 0x000000000238)
-#define ASIC_GPIO_IN (ASIC + 0x000000000200)
-#define ASIC_GPIO_INVERT (ASIC + 0x000000000210)
-#define ASIC_GPIO_MASK (ASIC + 0x000000000220)
-#define ASIC_GPIO_OE (ASIC + 0x000000000208)
-#define ASIC_GPIO_OUT (ASIC + 0x000000000218)
-#define ASIC_PCIE_SD_HOST_CMD (ASIC + 0x000000000100)
-#define ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT 0
-#define ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK 0x400ull
-#define ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT 2
-#define ASIC_PCIE_SD_HOST_CMD_TIMER_MASK 0xFFFFFull
-#define ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT 12
-#define ASIC_PCIE_SD_HOST_STATUS (ASIC + 0x000000000108)
-#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_MASK 0x7ull
-#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_SHIFT 2
-#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_MASK 0x3ull
-#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_SHIFT 0
-#define ASIC_PCIE_SD_INTRPT_DATA_CODE (ASIC + 0x000000000110)
-#define ASIC_PCIE_SD_INTRPT_ENABLE (ASIC + 0x000000000118)
-#define ASIC_PCIE_SD_INTRPT_LIST (ASIC + 0x000000000180)
-#define ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT 16
-#define ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT 0
-#define ASIC_PCIE_SD_INTRPT_STATUS (ASIC + 0x000000000128)
-#define ASIC_QSFP1_CLEAR (ASIC + 0x000000000270)
-#define ASIC_QSFP1_FORCE (ASIC + 0x000000000278)
-#define ASIC_QSFP1_IN (ASIC + 0x000000000240)
-#define ASIC_QSFP1_INVERT (ASIC + 0x000000000250)
-#define ASIC_QSFP1_MASK (ASIC + 0x000000000260)
-#define ASIC_QSFP1_OE (ASIC + 0x000000000248)
-#define ASIC_QSFP1_OUT (ASIC + 0x000000000258)
-#define ASIC_QSFP1_STATUS (ASIC + 0x000000000268)
-#define ASIC_QSFP2_CLEAR (ASIC + 0x0000000002B0)
-#define ASIC_QSFP2_FORCE (ASIC + 0x0000000002B8)
-#define ASIC_QSFP2_IN (ASIC + 0x000000000280)
-#define ASIC_QSFP2_INVERT (ASIC + 0x000000000290)
-#define ASIC_QSFP2_MASK (ASIC + 0x0000000002A0)
-#define ASIC_QSFP2_OE (ASIC + 0x000000000288)
-#define ASIC_QSFP2_OUT (ASIC + 0x000000000298)
-#define ASIC_QSFP2_STATUS (ASIC + 0x0000000002A8)
-#define ASIC_STS_SBUS_COUNTERS (ASIC + 0x000000000018)
-#define ASIC_STS_SBUS_COUNTERS_EXECUTE_CNT_MASK 0xFFFFull
-#define ASIC_STS_SBUS_COUNTERS_EXECUTE_CNT_SHIFT 0
-#define ASIC_STS_SBUS_COUNTERS_RCV_DATA_VALID_CNT_MASK 0xFFFFull
-#define ASIC_STS_SBUS_COUNTERS_RCV_DATA_VALID_CNT_SHIFT 16
-#define ASIC_STS_SBUS_RESULT (ASIC + 0x000000000010)
-#define ASIC_STS_SBUS_RESULT_DONE_SMASK 0x1ull
-#define ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK 0x2ull
-#define ASIC_STS_THERM (ASIC + 0x000000000058)
-#define ASIC_STS_THERM_CRIT_TEMP_MASK 0x7FFull
-#define ASIC_STS_THERM_CRIT_TEMP_SHIFT 18
-#define ASIC_STS_THERM_CURR_TEMP_MASK 0x7FFull
-#define ASIC_STS_THERM_CURR_TEMP_SHIFT 2
-#define ASIC_STS_THERM_HI_TEMP_MASK 0x7FFull
-#define ASIC_STS_THERM_HI_TEMP_SHIFT 50
-#define ASIC_STS_THERM_LO_TEMP_MASK 0x7FFull
-#define ASIC_STS_THERM_LO_TEMP_SHIFT 34
-#define ASIC_STS_THERM_LOW_SHIFT 13
-#define CCE_COUNTER_ARRAY32 (CCE + 0x000000000060)
-#define CCE_CTRL (CCE + 0x000000000010)
-#define CCE_CTRL_RXE_RESUME_SMASK 0x800ull
-#define CCE_CTRL_SPC_FREEZE_SMASK 0x100ull
-#define CCE_CTRL_SPC_UNFREEZE_SMASK 0x200ull
-#define CCE_CTRL_TXE_RESUME_SMASK 0x2000ull
-#define CCE_DC_CTRL (CCE + 0x0000000000B8)
-#define CCE_DC_CTRL_DC_RESET_SMASK 0x1ull
-#define CCE_DC_CTRL_RESETCSR 0x0000000000000001ull
-#define CCE_ERR_CLEAR (CCE + 0x000000000050)
-#define CCE_ERR_MASK (CCE + 0x000000000048)
-#define CCE_ERR_STATUS (CCE + 0x000000000040)
-#define CCE_ERR_STATUS_CCE_CLI0_ASYNC_FIFO_PARITY_ERR_SMASK 0x40ull
-#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERROR_SMASK 0x1000ull
-#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR_SMASK \
-               0x200ull
-#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERROR_SMASK \
-               0x800ull
-#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR_SMASK \
-               0x400ull
-#define CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK 0x100ull
-#define CCE_ERR_STATUS_CCE_CSR_CFG_BUS_PARITY_ERR_SMASK 0x80ull
-#define CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK 0x1ull
-#define CCE_ERR_STATUS_CCE_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull
-#define CCE_ERR_STATUS_CCE_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull
-#define CCE_ERR_STATUS_CCE_INT_MAP_COR_ERR_SMASK 0x4000000000ull
-#define CCE_ERR_STATUS_CCE_INT_MAP_UNC_ERR_SMASK 0x8000000000ull
-#define CCE_ERR_STATUS_CCE_MSIX_CSR_PARITY_ERR_SMASK 0x10000000000ull
-#define CCE_ERR_STATUS_CCE_MSIX_TABLE_COR_ERR_SMASK 0x1000000000ull
-#define CCE_ERR_STATUS_CCE_MSIX_TABLE_UNC_ERR_SMASK 0x2000000000ull
-#define CCE_ERR_STATUS_CCE_RCPL_ASYNC_FIFO_PARITY_ERR_SMASK 0x400000000ull
-#define CCE_ERR_STATUS_CCE_RSPD_DATA_PARITY_ERR_SMASK 0x20ull
-#define CCE_ERR_STATUS_CCE_RXDMA_CONV_FIFO_PARITY_ERR_SMASK 0x800000000ull
-#define CCE_ERR_STATUS_CCE_SEG_READ_BAD_ADDR_ERR_SMASK 0x100000000ull
-#define CCE_ERR_STATUS_CCE_SEG_WRITE_BAD_ADDR_ERR_SMASK 0x200000000ull
-#define CCE_ERR_STATUS_CCE_TRGT_ACCESS_ERR_SMASK 0x10ull
-#define CCE_ERR_STATUS_CCE_TRGT_ASYNC_FIFO_PARITY_ERR_SMASK 0x8ull
-#define CCE_ERR_STATUS_CCE_TRGT_CPL_TIMEOUT_ERR_SMASK 0x40000000ull
-#define CCE_ERR_STATUS_LA_TRIGGERED_SMASK 0x80000000ull
-#define CCE_ERR_STATUS_PCIC_CPL_DAT_QCOR_ERR_SMASK 0x40000ull
-#define CCE_ERR_STATUS_PCIC_CPL_DAT_QUNC_ERR_SMASK 0x4000000ull
-#define CCE_ERR_STATUS_PCIC_CPL_HD_QCOR_ERR_SMASK 0x20000ull
-#define CCE_ERR_STATUS_PCIC_CPL_HD_QUNC_ERR_SMASK 0x2000000ull
-#define CCE_ERR_STATUS_PCIC_NPOST_DAT_QPARITY_ERR_SMASK 0x100000ull
-#define CCE_ERR_STATUS_PCIC_NPOST_HQ_PARITY_ERR_SMASK 0x80000ull
-#define CCE_ERR_STATUS_PCIC_POST_DAT_QCOR_ERR_SMASK 0x10000ull
-#define CCE_ERR_STATUS_PCIC_POST_DAT_QUNC_ERR_SMASK 0x1000000ull
-#define CCE_ERR_STATUS_PCIC_POST_HD_QCOR_ERR_SMASK 0x8000ull
-#define CCE_ERR_STATUS_PCIC_POST_HD_QUNC_ERR_SMASK 0x800000ull
-#define CCE_ERR_STATUS_PCIC_RECEIVE_PARITY_ERR_SMASK 0x20000000ull
-#define CCE_ERR_STATUS_PCIC_RETRY_MEM_COR_ERR_SMASK 0x2000ull
-#define CCE_ERR_STATUS_PCIC_RETRY_MEM_UNC_ERR_SMASK 0x200000ull
-#define CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_COR_ERR_SMASK 0x4000ull
-#define CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_UNC_ERR_SMASK 0x400000ull
-#define CCE_ERR_STATUS_PCIC_TRANSMIT_BACK_PARITY_ERR_SMASK 0x10000000ull
-#define CCE_ERR_STATUS_PCIC_TRANSMIT_FRONT_PARITY_ERR_SMASK 0x8000000ull
-#define CCE_INT_CLEAR (CCE + 0x000000110A00)
-#define CCE_INT_COUNTER_ARRAY32 (CCE + 0x000000110D00)
-#define CCE_INT_FORCE (CCE + 0x000000110B00)
-#define CCE_INT_MAP (CCE + 0x000000110500)
-#define CCE_INT_MASK (CCE + 0x000000110900)
-#define CCE_INT_STATUS (CCE + 0x000000110800)
-#define CCE_MSIX_INT_GRANTED (CCE + 0x000000110200)
-#define CCE_MSIX_TABLE_LOWER (CCE + 0x000000100000)
-#define CCE_MSIX_TABLE_UPPER (CCE + 0x000000100008)
-#define CCE_MSIX_TABLE_UPPER_RESETCSR 0x0000000100000000ull
-#define CCE_MSIX_VEC_CLR_WITHOUT_INT (CCE + 0x000000110400)
-#define CCE_PCIE_CTRL (CCE + 0x0000000000C0)
-#define CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_MASK 0x3ull
-#define CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_SHIFT 0
-#define CCE_PCIE_CTRL_PCIE_LANE_DELAY_MASK 0xFull
-#define CCE_PCIE_CTRL_PCIE_LANE_DELAY_SHIFT 2
-#define CCE_PCIE_CTRL_XMT_MARGIN_OVERWRITE_ENABLE_SHIFT 8
-#define CCE_PCIE_CTRL_XMT_MARGIN_SHIFT 9
-#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_MASK 0x1ull
-#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_SHIFT 12
-#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_MASK 0x7ull
-#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_SHIFT 13
-#define CCE_REVISION (CCE + 0x000000000000)
-#define CCE_REVISION2 (CCE + 0x000000000008)
-#define CCE_REVISION2_HFI_ID_MASK 0x1ull
-#define CCE_REVISION2_HFI_ID_SHIFT 0
-#define CCE_REVISION2_IMPL_CODE_SHIFT 8
-#define CCE_REVISION2_IMPL_REVISION_SHIFT 16
-#define CCE_REVISION_BOARD_ID_LOWER_NIBBLE_MASK 0xFull
-#define CCE_REVISION_BOARD_ID_LOWER_NIBBLE_SHIFT 32
-#define CCE_REVISION_CHIP_REV_MAJOR_MASK 0xFFull
-#define CCE_REVISION_CHIP_REV_MAJOR_SHIFT 8
-#define CCE_REVISION_CHIP_REV_MINOR_MASK 0xFFull
-#define CCE_REVISION_CHIP_REV_MINOR_SHIFT 0
-#define CCE_REVISION_SW_MASK 0xFFull
-#define CCE_REVISION_SW_SHIFT 24
-#define CCE_SCRATCH (CCE + 0x000000000020)
-#define CCE_STATUS (CCE + 0x000000000018)
-#define CCE_STATUS_RXE_FROZE_SMASK 0x2ull
-#define CCE_STATUS_RXE_PAUSED_SMASK 0x20ull
-#define CCE_STATUS_SDMA_FROZE_SMASK 0x1ull
-#define CCE_STATUS_SDMA_PAUSED_SMASK 0x10ull
-#define CCE_STATUS_TXE_FROZE_SMASK 0x4ull
-#define CCE_STATUS_TXE_PAUSED_SMASK 0x40ull
-#define CCE_STATUS_TXE_PIO_FROZE_SMASK 0x8ull
-#define CCE_STATUS_TXE_PIO_PAUSED_SMASK 0x80ull
-#define MISC_CFG_FW_CTRL (MISC + 0x000000001000)
-#define MISC_CFG_FW_CTRL_FW_8051_LOADED_SMASK 0x2ull
-#define MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT 2
-#define MISC_CFG_FW_CTRL_RSA_STATUS_SMASK 0xCull
-#define MISC_CFG_RSA_CMD (MISC + 0x000000000A08)
-#define MISC_CFG_RSA_MODULUS (MISC + 0x000000000400)
-#define MISC_CFG_RSA_MU (MISC + 0x000000000A10)
-#define MISC_CFG_RSA_R2 (MISC + 0x000000000000)
-#define MISC_CFG_RSA_SIGNATURE (MISC + 0x000000000200)
-#define MISC_CFG_SHA_PRELOAD (MISC + 0x000000000A00)
-#define MISC_ERR_CLEAR (MISC + 0x000000002010)
-#define MISC_ERR_MASK (MISC + 0x000000002008)
-#define MISC_ERR_STATUS (MISC + 0x000000002000)
-#define MISC_ERR_STATUS_MISC_PLL_LOCK_FAIL_ERR_SMASK 0x1000ull
-#define MISC_ERR_STATUS_MISC_MBIST_FAIL_ERR_SMASK 0x800ull
-#define MISC_ERR_STATUS_MISC_INVALID_EEP_CMD_ERR_SMASK 0x400ull
-#define MISC_ERR_STATUS_MISC_EFUSE_DONE_PARITY_ERR_SMASK 0x200ull
-#define MISC_ERR_STATUS_MISC_EFUSE_WRITE_ERR_SMASK 0x100ull
-#define MISC_ERR_STATUS_MISC_EFUSE_READ_BAD_ADDR_ERR_SMASK 0x80ull
-#define MISC_ERR_STATUS_MISC_EFUSE_CSR_PARITY_ERR_SMASK 0x40ull
-#define MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK 0x20ull
-#define MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK 0x10ull
-#define MISC_ERR_STATUS_MISC_SBUS_WRITE_FAILED_ERR_SMASK 0x8ull
-#define MISC_ERR_STATUS_MISC_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull
-#define MISC_ERR_STATUS_MISC_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull
-#define MISC_ERR_STATUS_MISC_CSR_PARITY_ERR_SMASK 0x1ull
-#define PCI_CFG_MSIX0 (PCIE + 0x0000000000B0)
-#define PCI_CFG_REG1 (PCIE + 0x000000000004)
-#define PCI_CFG_REG11 (PCIE + 0x00000000002C)
-#define PCIE_CFG_SPCIE1 (PCIE + 0x00000000014C)
-#define PCIE_CFG_SPCIE2 (PCIE + 0x000000000150)
-#define PCIE_CFG_TPH2 (PCIE + 0x000000000180)
-#define RCV_ARRAY (RXE + 0x000000200000)
-#define RCV_ARRAY_CNT (RXE + 0x000000000018)
-#define RCV_ARRAY_RT_ADDR_MASK 0xFFFFFFFFFull
-#define RCV_ARRAY_RT_ADDR_SHIFT 0
-#define RCV_ARRAY_RT_BUF_SIZE_SHIFT 36
-#define RCV_ARRAY_RT_WRITE_ENABLE_SMASK 0x8000000000000000ull
-#define RCV_AVAIL_TIME_OUT (RXE + 0x000000100050)
-#define RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK 0xFFull
-#define RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT 0
-#define RCV_BTH_QP (RXE + 0x000000000028)
-#define RCV_BTH_QP_KDETH_QP_MASK 0xFFull
-#define RCV_BTH_QP_KDETH_QP_SHIFT 16
-#define RCV_BYPASS (RXE + 0x000000000038)
-#define RCV_CONTEXTS (RXE + 0x000000000010)
-#define RCV_COUNTER_ARRAY32 (RXE + 0x000000000400)
-#define RCV_COUNTER_ARRAY64 (RXE + 0x000000000500)
-#define RCV_CTRL (RXE + 0x000000000000)
-#define RCV_CTRL_RCV_BYPASS_ENABLE_SMASK 0x10ull
-#define RCV_CTRL_RCV_EXTENDED_PSN_ENABLE_SMASK 0x40ull
-#define RCV_CTRL_RCV_PARTITION_KEY_ENABLE_SMASK 0x4ull
-#define RCV_CTRL_RCV_PORT_ENABLE_SMASK 0x1ull
-#define RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK 0x2ull
-#define RCV_CTRL_RCV_RSM_ENABLE_SMASK 0x20ull
-#define RCV_CTRL_RX_RBUF_INIT_SMASK 0x200ull
-#define RCV_CTXT_CTRL (RXE + 0x000000100000)
-#define RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK 0x4ull
-#define RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK 0x8ull
-#define RCV_CTXT_CTRL_EGR_BUF_SIZE_MASK 0x7ull
-#define RCV_CTXT_CTRL_EGR_BUF_SIZE_SHIFT 8
-#define RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK 0x700ull
-#define RCV_CTXT_CTRL_ENABLE_SMASK 0x1ull
-#define RCV_CTXT_CTRL_INTR_AVAIL_SMASK 0x20ull
-#define RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK 0x2ull
-#define RCV_CTXT_CTRL_TAIL_UPD_SMASK 0x40ull
-#define RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK 0x10ull
-#define RCV_CTXT_STATUS (RXE + 0x000000100008)
-#define RCV_EGR_CTRL (RXE + 0x000000100010)
-#define RCV_EGR_CTRL_EGR_BASE_INDEX_MASK 0x1FFFull
-#define RCV_EGR_CTRL_EGR_BASE_INDEX_SHIFT 0
-#define RCV_EGR_CTRL_EGR_CNT_MASK 0x1FFull
-#define RCV_EGR_CTRL_EGR_CNT_SHIFT 32
-#define RCV_EGR_INDEX_HEAD (RXE + 0x000000300018)
-#define RCV_EGR_INDEX_HEAD_HEAD_MASK 0x7FFull
-#define RCV_EGR_INDEX_HEAD_HEAD_SHIFT 0
-#define RCV_ERR_CLEAR (RXE + 0x000000000070)
-#define RCV_ERR_INFO (RXE + 0x000000000050)
-#define RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SC_SMASK 0x1Full
-#define RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK 0x20ull
-#define RCV_ERR_MASK (RXE + 0x000000000068)
-#define RCV_ERR_STATUS (RXE + 0x000000000060)
-#define RCV_ERR_STATUS_RX_CSR_PARITY_ERR_SMASK 0x8000000000000000ull
-#define RCV_ERR_STATUS_RX_CSR_READ_BAD_ADDR_ERR_SMASK 0x2000000000000000ull
-#define RCV_ERR_STATUS_RX_CSR_WRITE_BAD_ADDR_ERR_SMASK \
-               0x4000000000000000ull
-#define RCV_ERR_STATUS_RX_DC_INTF_PARITY_ERR_SMASK 0x2ull
-#define RCV_ERR_STATUS_RX_DC_SOP_EOP_PARITY_ERR_SMASK 0x200ull
-#define RCV_ERR_STATUS_RX_DMA_CSR_COR_ERR_SMASK 0x1ull
-#define RCV_ERR_STATUS_RX_DMA_CSR_PARITY_ERR_SMASK 0x200000000000000ull
-#define RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK 0x1000000000000000ull
-#define RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_COR_ERR_SMASK \
-               0x40000000000000ull
-#define RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK \
-               0x20000000000000ull
-#define RCV_ERR_STATUS_RX_DMA_DQ_FSM_ENCODING_ERR_SMASK \
-               0x800000000000000ull
-#define RCV_ERR_STATUS_RX_DMA_EQ_FSM_ENCODING_ERR_SMASK \
-               0x400000000000000ull
-#define RCV_ERR_STATUS_RX_DMA_FLAG_COR_ERR_SMASK 0x800ull
-#define RCV_ERR_STATUS_RX_DMA_FLAG_UNC_ERR_SMASK 0x400ull
-#define RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_COR_ERR_SMASK 0x10000000000000ull
-#define RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK 0x8000000000000ull
-#define RCV_ERR_STATUS_RX_HQ_INTR_CSR_PARITY_ERR_SMASK 0x200000000000ull
-#define RCV_ERR_STATUS_RX_HQ_INTR_FSM_ERR_SMASK 0x400000000000ull
-#define RCV_ERR_STATUS_RX_LOOKUP_CSR_PARITY_ERR_SMASK 0x100000000000ull
-#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_COR_ERR_SMASK \
-               0x10000000000ull
-#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_ERR_SMASK 0x8000000000ull
-#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART2_PARITY_ERR_SMASK \
-               0x20000000000ull
-#define RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_COR_ERR_SMASK 0x80000000000ull
-#define RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_UNC_ERR_SMASK 0x40000000000ull
-#define RCV_ERR_STATUS_RX_RBUF_BAD_LOOKUP_ERR_SMASK 0x40000000ull
-#define RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_COR_ERR_SMASK 0x100000ull
-#define RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_UNC_ERR_SMASK 0x80000ull
-#define RCV_ERR_STATUS_RX_RBUF_CSR_QENT_CNT_PARITY_ERR_SMASK 0x400000ull
-#define RCV_ERR_STATUS_RX_RBUF_CSR_QEOPDW_PARITY_ERR_SMASK 0x10000000ull
-#define RCV_ERR_STATUS_RX_RBUF_CSR_QHD_PTR_PARITY_ERR_SMASK 0x2000000ull
-#define RCV_ERR_STATUS_RX_RBUF_CSR_QHEAD_BUF_NUM_PARITY_ERR_SMASK \
-               0x200000ull
-#define RCV_ERR_STATUS_RX_RBUF_CSR_QNEXT_BUF_PARITY_ERR_SMASK 0x800000ull
-#define RCV_ERR_STATUS_RX_RBUF_CSR_QNUM_OF_PKT_PARITY_ERR_SMASK \
-               0x8000000ull
-#define RCV_ERR_STATUS_RX_RBUF_CSR_QTL_PTR_PARITY_ERR_SMASK 0x4000000ull
-#define RCV_ERR_STATUS_RX_RBUF_CSR_QVLD_BIT_PARITY_ERR_SMASK 0x1000000ull
-#define RCV_ERR_STATUS_RX_RBUF_CTX_ID_PARITY_ERR_SMASK 0x20000000ull
-#define RCV_ERR_STATUS_RX_RBUF_DATA_COR_ERR_SMASK 0x100000000000000ull
-#define RCV_ERR_STATUS_RX_RBUF_DATA_UNC_ERR_SMASK 0x80000000000000ull
-#define RCV_ERR_STATUS_RX_RBUF_DESC_PART1_COR_ERR_SMASK 0x1000000000000ull
-#define RCV_ERR_STATUS_RX_RBUF_DESC_PART1_UNC_ERR_SMASK 0x800000000000ull
-#define RCV_ERR_STATUS_RX_RBUF_DESC_PART2_COR_ERR_SMASK 0x4000000000000ull
-#define RCV_ERR_STATUS_RX_RBUF_DESC_PART2_UNC_ERR_SMASK 0x2000000000000ull
-#define RCV_ERR_STATUS_RX_RBUF_EMPTY_ERR_SMASK 0x100000000ull
-#define RCV_ERR_STATUS_RX_RBUF_FL_INITDONE_PARITY_ERR_SMASK 0x800000000ull
-#define RCV_ERR_STATUS_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR_SMASK \
-               0x1000000000ull
-#define RCV_ERR_STATUS_RX_RBUF_FL_RD_ADDR_PARITY_ERR_SMASK 0x200000000ull
-#define RCV_ERR_STATUS_RX_RBUF_FL_WR_ADDR_PARITY_ERR_SMASK 0x400000000ull
-#define RCV_ERR_STATUS_RX_RBUF_FREE_LIST_COR_ERR_SMASK 0x4000ull
-#define RCV_ERR_STATUS_RX_RBUF_FREE_LIST_UNC_ERR_SMASK 0x2000ull
-#define RCV_ERR_STATUS_RX_RBUF_FULL_ERR_SMASK 0x80000000ull
-#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_COR_ERR_SMASK 0x40000ull
-#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR_SMASK 0x10000ull
-#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_ERR_SMASK 0x8000ull
-#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_UNC_ERR_SMASK 0x20000ull
-#define RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_COR_ERR_SMASK 0x4000000000ull
-#define RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_UNC_ERR_SMASK 0x2000000000ull
-#define RCV_ERR_STATUS_RX_RCV_CSR_PARITY_ERR_SMASK 0x100ull
-#define RCV_ERR_STATUS_RX_RCV_DATA_COR_ERR_SMASK 0x20ull
-#define RCV_ERR_STATUS_RX_RCV_DATA_UNC_ERR_SMASK 0x10ull
-#define RCV_ERR_STATUS_RX_RCV_FSM_ENCODING_ERR_SMASK 0x1000ull
-#define RCV_ERR_STATUS_RX_RCV_HDR_COR_ERR_SMASK 0x8ull
-#define RCV_ERR_STATUS_RX_RCV_HDR_UNC_ERR_SMASK 0x4ull
-#define RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_COR_ERR_SMASK 0x80ull
-#define RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_UNC_ERR_SMASK 0x40ull
-#define RCV_HDR_ADDR (RXE + 0x000000100028)
-#define RCV_HDR_CNT (RXE + 0x000000100030)
-#define RCV_HDR_CNT_CNT_MASK 0x1FFull
-#define RCV_HDR_CNT_CNT_SHIFT 0
-#define RCV_HDR_ENT_SIZE (RXE + 0x000000100038)
-#define RCV_HDR_ENT_SIZE_ENT_SIZE_MASK 0x7ull
-#define RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT 0
-#define RCV_HDR_HEAD (RXE + 0x000000300008)
-#define RCV_HDR_HEAD_COUNTER_MASK 0xFFull
-#define RCV_HDR_HEAD_COUNTER_SHIFT 32
-#define RCV_HDR_HEAD_HEAD_MASK 0x7FFFFull
-#define RCV_HDR_HEAD_HEAD_SHIFT 0
-#define RCV_HDR_HEAD_HEAD_SMASK 0x7FFFFull
-#define RCV_HDR_OVFL_CNT (RXE + 0x000000100058)
-#define RCV_HDR_SIZE (RXE + 0x000000100040)
-#define RCV_HDR_SIZE_HDR_SIZE_MASK 0x1Full
-#define RCV_HDR_SIZE_HDR_SIZE_SHIFT 0
-#define RCV_HDR_TAIL (RXE + 0x000000300000)
-#define RCV_HDR_TAIL_ADDR (RXE + 0x000000100048)
-#define RCV_KEY_CTRL (RXE + 0x000000100020)
-#define RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK 0x200000000ull
-#define RCV_KEY_CTRL_JOB_KEY_VALUE_MASK 0xFFFFull
-#define RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT 0
-#define RCV_MULTICAST (RXE + 0x000000000030)
-#define RCV_PARTITION_KEY (RXE + 0x000000000200)
-#define RCV_PARTITION_KEY_PARTITION_KEY_A_MASK 0xFFFFull
-#define RCV_PARTITION_KEY_PARTITION_KEY_B_SHIFT 16
-#define RCV_QP_MAP_TABLE (RXE + 0x000000000100)
-#define RCV_RSM_CFG (RXE + 0x000000000600)
-#define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK 0x1ull
-#define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT 0
-#define RCV_RSM_CFG_PACKET_TYPE_SHIFT 60
-#define RCV_RSM_CFG_OFFSET_SHIFT 32
-#define RCV_RSM_MAP_TABLE (RXE + 0x000000000900)
-#define RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK 0xFFull
-#define RCV_RSM_MATCH (RXE + 0x000000000800)
-#define RCV_RSM_MATCH_MASK1_SHIFT 0
-#define RCV_RSM_MATCH_MASK2_SHIFT 16
-#define RCV_RSM_MATCH_VALUE1_SHIFT 8
-#define RCV_RSM_MATCH_VALUE2_SHIFT 24
-#define RCV_RSM_SELECT (RXE + 0x000000000700)
-#define RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT 0
-#define RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT 16
-#define RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT 32
-#define RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT 44
-#define RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT 48
-#define RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT 60
-#define RCV_STATUS (RXE + 0x000000000008)
-#define RCV_STATUS_RX_PKT_IN_PROGRESS_SMASK 0x1ull
-#define RCV_STATUS_RX_RBUF_INIT_DONE_SMASK 0x200ull
-#define RCV_STATUS_RX_RBUF_PKT_PENDING_SMASK 0x40ull
-#define RCV_TID_CTRL (RXE + 0x000000100018)
-#define RCV_TID_CTRL_TID_BASE_INDEX_MASK 0x1FFFull
-#define RCV_TID_CTRL_TID_BASE_INDEX_SHIFT 0
-#define RCV_TID_CTRL_TID_PAIR_CNT_MASK 0x1FFull
-#define RCV_TID_CTRL_TID_PAIR_CNT_SHIFT 32
-#define RCV_TID_FLOW_TABLE (RXE + 0x000000300800)
-#define RCV_VL15 (RXE + 0x000000000048)
-#define SEND_BTH_QP (TXE + 0x0000000000A0)
-#define SEND_BTH_QP_KDETH_QP_MASK 0xFFull
-#define SEND_BTH_QP_KDETH_QP_SHIFT 16
-#define SEND_CM_CREDIT_USED_STATUS (TXE + 0x000000000510)
-#define SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK \
-               0x1000000000000ull
-#define SEND_CM_CREDIT_USED_STATUS_VL15_RETURN_CREDIT_STATUS_SMASK \
-               0x8000000000000000ull
-#define SEND_CM_CREDIT_USED_STATUS_VL1_RETURN_CREDIT_STATUS_SMASK \
-               0x2000000000000ull
-#define SEND_CM_CREDIT_USED_STATUS_VL2_RETURN_CREDIT_STATUS_SMASK \
-               0x4000000000000ull
-#define SEND_CM_CREDIT_USED_STATUS_VL3_RETURN_CREDIT_STATUS_SMASK \
-               0x8000000000000ull
-#define SEND_CM_CREDIT_USED_STATUS_VL4_RETURN_CREDIT_STATUS_SMASK \
-               0x10000000000000ull
-#define SEND_CM_CREDIT_USED_STATUS_VL5_RETURN_CREDIT_STATUS_SMASK \
-               0x20000000000000ull
-#define SEND_CM_CREDIT_USED_STATUS_VL6_RETURN_CREDIT_STATUS_SMASK \
-               0x40000000000000ull
-#define SEND_CM_CREDIT_USED_STATUS_VL7_RETURN_CREDIT_STATUS_SMASK \
-               0x80000000000000ull
-#define SEND_CM_CREDIT_VL (TXE + 0x000000000600)
-#define SEND_CM_CREDIT_VL15 (TXE + 0x000000000678)
-#define SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT 0
-#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_MASK 0xFFFFull
-#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT 0
-#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SMASK 0xFFFFull
-#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_MASK 0xFFFFull
-#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT 16
-#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SMASK 0xFFFF0000ull
-#define SEND_CM_CTRL (TXE + 0x000000000500)
-#define SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK 0x8ull
-#define SEND_CM_CTRL_RESETCSR 0x0000000000000020ull
-#define SEND_CM_GLOBAL_CREDIT (TXE + 0x000000000508)
-#define SEND_CM_GLOBAL_CREDIT_AU_SHIFT 16
-#define SEND_CM_GLOBAL_CREDIT_RESETCSR 0x0000094000030000ull
-#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_MASK 0xFFFFull
-#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT 0
-#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK 0xFFFFull
-#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_MASK 0xFFFFull
-#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT 32
-#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK 0xFFFF00000000ull
-#define SEND_CM_LOCAL_AU_TABLE0_TO3 (TXE + 0x000000000520)
-#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT 0
-#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT 16
-#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT 32
-#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT 48
-#define SEND_CM_LOCAL_AU_TABLE4_TO7 (TXE + 0x000000000528)
-#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT 0
-#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT 16
-#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT 32
-#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT 48
-#define SEND_CM_REMOTE_AU_TABLE0_TO3 (TXE + 0x000000000530)
-#define SEND_CM_REMOTE_AU_TABLE4_TO7 (TXE + 0x000000000538)
-#define SEND_CM_TIMER_CTRL (TXE + 0x000000000518)
-#define SEND_CONTEXTS (TXE + 0x000000000010)
-#define SEND_CONTEXT_SET_CTRL (TXE + 0x000000000200)
-#define SEND_COUNTER_ARRAY32 (TXE + 0x000000000300)
-#define SEND_COUNTER_ARRAY64 (TXE + 0x000000000400)
-#define SEND_CTRL (TXE + 0x000000000000)
-#define SEND_CTRL_CM_RESET_SMASK 0x4ull
-#define SEND_CTRL_SEND_ENABLE_SMASK 0x1ull
-#define SEND_CTRL_VL_ARBITER_ENABLE_SMASK 0x2ull
-#define SEND_CTXT_CHECK_ENABLE (TXE + 0x000000100080)
-#define SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull
-#define SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK 0x1ull
-#define SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK 0x4ull
-#define SEND_CTXT_CHECK_ENABLE_CHECK_OPCODE_SMASK 0x20ull
-#define SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK 0x8ull
-#define SEND_CTXT_CHECK_ENABLE_CHECK_SLID_SMASK 0x10ull
-#define SEND_CTXT_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK 0x40ull
-#define SEND_CTXT_CHECK_ENABLE_CHECK_VL_SMASK 0x2ull
-#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK 0x20000ull
-#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK \
-               0x200000ull
-#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK 0x800ull
-#define SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK 0x400ull
-#define SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK 0x1000ull
-#define SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK 0x2000ull
-#define SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK \
-               0x100000ull
-#define SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK 0x10000ull
-#define SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK 0x200ull
-#define SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_SMASK 0x100ull
-#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK \
-               0x80000ull
-#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK \
-               0x40000ull
-#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK \
-               0x8000ull
-#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK \
-               0x4000ull
-#define SEND_CTXT_CHECK_JOB_KEY (TXE + 0x000000100090)
-#define SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK 0x100000000ull
-#define SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK 0xFFFF0000ull
-#define SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK 0xFFFFull
-#define SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT 0
-#define SEND_CTXT_CHECK_OPCODE (TXE + 0x0000001000A8)
-#define SEND_CTXT_CHECK_OPCODE_MASK_SHIFT 8
-#define SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT 0
-#define SEND_CTXT_CHECK_PARTITION_KEY (TXE + 0x000000100098)
-#define SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK 0xFFFFull
-#define SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT 0
-#define SEND_CTXT_CHECK_SLID (TXE + 0x0000001000A0)
-#define SEND_CTXT_CHECK_SLID_MASK_MASK 0xFFFFull
-#define SEND_CTXT_CHECK_SLID_MASK_SHIFT 16
-#define SEND_CTXT_CHECK_SLID_VALUE_MASK 0xFFFFull
-#define SEND_CTXT_CHECK_SLID_VALUE_SHIFT 0
-#define SEND_CTXT_CHECK_VL (TXE + 0x000000100088)
-#define SEND_CTXT_CREDIT_CTRL (TXE + 0x000000100010)
-#define SEND_CTXT_CREDIT_CTRL_CREDIT_INTR_SMASK 0x20000ull
-#define SEND_CTXT_CREDIT_CTRL_EARLY_RETURN_SMASK 0x10000ull
-#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_MASK 0x7FFull
-#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SHIFT 0
-#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SMASK 0x7FFull
-#define SEND_CTXT_CREDIT_FORCE (TXE + 0x000000100028)
-#define SEND_CTXT_CREDIT_FORCE_FORCE_RETURN_SMASK 0x1ull
-#define SEND_CTXT_CREDIT_RETURN_ADDR (TXE + 0x000000100020)
-#define SEND_CTXT_CREDIT_RETURN_ADDR_ADDRESS_SMASK 0xFFFFFFFFFFC0ull
-#define SEND_CTXT_CTRL (TXE + 0x000000100000)
-#define SEND_CTXT_CTRL_CTXT_BASE_MASK 0x3FFFull
-#define SEND_CTXT_CTRL_CTXT_BASE_SHIFT 32
-#define SEND_CTXT_CTRL_CTXT_DEPTH_MASK 0x7FFull
-#define SEND_CTXT_CTRL_CTXT_DEPTH_SHIFT 48
-#define SEND_CTXT_CTRL_CTXT_ENABLE_SMASK 0x1ull
-#define SEND_CTXT_ERR_CLEAR (TXE + 0x000000100050)
-#define SEND_CTXT_ERR_MASK (TXE + 0x000000100048)
-#define SEND_CTXT_ERR_STATUS (TXE + 0x000000100040)
-#define SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK 0x2ull
-#define SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK 0x1ull
-#define SEND_CTXT_ERR_STATUS_PIO_WRITE_CROSSES_BOUNDARY_ERR_SMASK 0x4ull
-#define SEND_CTXT_ERR_STATUS_PIO_WRITE_OUT_OF_BOUNDS_ERR_SMASK 0x10ull
-#define SEND_CTXT_ERR_STATUS_PIO_WRITE_OVERFLOW_ERR_SMASK 0x8ull
-#define SEND_CTXT_STATUS (TXE + 0x000000100008)
-#define SEND_CTXT_STATUS_CTXT_HALTED_SMASK 0x1ull
-#define SEND_DMA_BASE_ADDR (TXE + 0x000000200010)
-#define SEND_DMA_CHECK_ENABLE (TXE + 0x000000200080)
-#define SEND_DMA_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull
-#define SEND_DMA_CHECK_ENABLE_CHECK_ENABLE_SMASK 0x1ull
-#define SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK 0x4ull
-#define SEND_DMA_CHECK_ENABLE_CHECK_OPCODE_SMASK 0x20ull
-#define SEND_DMA_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK 0x8ull
-#define SEND_DMA_CHECK_ENABLE_CHECK_SLID_SMASK 0x10ull
-#define SEND_DMA_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK 0x40ull
-#define SEND_DMA_CHECK_ENABLE_CHECK_VL_SMASK 0x2ull
-#define SEND_DMA_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK 0x20000ull
-#define SEND_DMA_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK 0x200000ull
-#define SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK \
-               0x100000ull
-#define SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK 0x200ull
-#define SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_SMASK 0x100ull
-#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK \
-               0x80000ull
-#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK 0x40000ull
-#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK \
-               0x8000ull
-#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK 0x4000ull
-#define SEND_DMA_CHECK_JOB_KEY (TXE + 0x000000200090)
-#define SEND_DMA_CHECK_OPCODE (TXE + 0x0000002000A8)
-#define SEND_DMA_CHECK_PARTITION_KEY (TXE + 0x000000200098)
-#define SEND_DMA_CHECK_SLID (TXE + 0x0000002000A0)
-#define SEND_DMA_CHECK_SLID_MASK_MASK 0xFFFFull
-#define SEND_DMA_CHECK_SLID_MASK_SHIFT 16
-#define SEND_DMA_CHECK_SLID_VALUE_MASK 0xFFFFull
-#define SEND_DMA_CHECK_SLID_VALUE_SHIFT 0
-#define SEND_DMA_CHECK_VL (TXE + 0x000000200088)
-#define SEND_DMA_CTRL (TXE + 0x000000200000)
-#define SEND_DMA_CTRL_SDMA_CLEANUP_SMASK 0x4ull
-#define SEND_DMA_CTRL_SDMA_ENABLE_SMASK 0x1ull
-#define SEND_DMA_CTRL_SDMA_HALT_SMASK 0x2ull
-#define SEND_DMA_CTRL_SDMA_INT_ENABLE_SMASK 0x8ull
-#define SEND_DMA_DESC_CNT (TXE + 0x000000200050)
-#define SEND_DMA_DESC_CNT_CNT_MASK 0xFFFFull
-#define SEND_DMA_DESC_CNT_CNT_SHIFT 0
-#define SEND_DMA_ENG_ERR_CLEAR (TXE + 0x000000200070)
-#define SEND_DMA_ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_MASK 0x1ull
-#define SEND_DMA_ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SHIFT 18
-#define SEND_DMA_ENG_ERR_MASK (TXE + 0x000000200068)
-#define SEND_DMA_ENG_ERR_STATUS (TXE + 0x000000200060)
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_ASSEMBLY_UNC_ERR_SMASK 0x8000ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_DESC_TABLE_UNC_ERR_SMASK 0x4000ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_FIRST_DESC_ERR_SMASK 0x10ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_GEN_MISMATCH_ERR_SMASK 0x2ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK 0x40ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_ADDRESS_ERR_SMASK 0x800ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_LENGTH_ERR_SMASK 0x1000ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK \
-               0x40000ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_SELECT_ERR_SMASK 0x400ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_STORAGE_UNC_ERR_SMASK \
-               0x20000ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_LENGTH_MISMATCH_ERR_SMASK 0x80ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_MEM_READ_ERR_SMASK 0x20ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_PACKET_DESC_OVERFLOW_ERR_SMASK \
-               0x100ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_PACKET_TRACKING_UNC_ERR_SMASK \
-               0x10000ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_TAIL_OUT_OF_BOUNDS_ERR_SMASK 0x8ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_TIMEOUT_ERR_SMASK 0x2000ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_TOO_LONG_ERR_SMASK 0x4ull
-#define SEND_DMA_ENG_ERR_STATUS_SDMA_WRONG_DW_ERR_SMASK 0x1ull
-#define SEND_DMA_ENGINES (TXE + 0x000000000018)
-#define SEND_DMA_ERR_CLEAR (TXE + 0x000000000070)
-#define SEND_DMA_ERR_MASK (TXE + 0x000000000068)
-#define SEND_DMA_ERR_STATUS (TXE + 0x000000000060)
-#define SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK 0x2ull
-#define SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_COR_ERR_SMASK 0x8ull
-#define SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK 0x4ull
-#define SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK 0x1ull
-#define SEND_DMA_HEAD (TXE + 0x000000200028)
-#define SEND_DMA_HEAD_ADDR (TXE + 0x000000200030)
-#define SEND_DMA_LEN_GEN (TXE + 0x000000200018)
-#define SEND_DMA_LEN_GEN_GENERATION_SHIFT 16
-#define SEND_DMA_LEN_GEN_LENGTH_SHIFT 6
-#define SEND_DMA_MEMORY (TXE + 0x0000002000B0)
-#define SEND_DMA_MEMORY_SDMA_MEMORY_CNT_SHIFT 16
-#define SEND_DMA_MEMORY_SDMA_MEMORY_INDEX_SHIFT 0
-#define SEND_DMA_MEM_SIZE (TXE + 0x000000000028)
-#define SEND_DMA_PRIORITY_THLD (TXE + 0x000000200038)
-#define SEND_DMA_RELOAD_CNT (TXE + 0x000000200048)
-#define SEND_DMA_STATUS (TXE + 0x000000200008)
-#define SEND_DMA_STATUS_ENG_CLEANED_UP_SMASK 0x200000000000000ull
-#define SEND_DMA_STATUS_ENG_HALTED_SMASK 0x100000000000000ull
-#define SEND_DMA_TAIL (TXE + 0x000000200020)
-#define SEND_EGRESS_CTXT_STATUS (TXE + 0x000000000800)
-#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK 0x10000ull
-#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT 0
-#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK \
-               0x3FFFull
-#define SEND_EGRESS_ERR_CLEAR (TXE + 0x000000000090)
-#define SEND_EGRESS_ERR_INFO (TXE + 0x000000000F00)
-#define SEND_EGRESS_ERR_INFO_BAD_PKT_LEN_ERR_SMASK 0x20000ull
-#define SEND_EGRESS_ERR_INFO_BYPASS_ERR_SMASK 0x800ull
-#define SEND_EGRESS_ERR_INFO_GRH_ERR_SMASK 0x400ull
-#define SEND_EGRESS_ERR_INFO_JOB_KEY_ERR_SMASK 0x4ull
-#define SEND_EGRESS_ERR_INFO_KDETH_PACKETS_ERR_SMASK 0x1000ull
-#define SEND_EGRESS_ERR_INFO_NON_KDETH_PACKETS_ERR_SMASK 0x2000ull
-#define SEND_EGRESS_ERR_INFO_OPCODE_ERR_SMASK 0x20ull
-#define SEND_EGRESS_ERR_INFO_PARTITION_KEY_ERR_SMASK 0x8ull
-#define SEND_EGRESS_ERR_INFO_PBC_STATIC_RATE_CONTROL_ERR_SMASK 0x100000ull
-#define SEND_EGRESS_ERR_INFO_PBC_TEST_ERR_SMASK 0x10000ull
-#define SEND_EGRESS_ERR_INFO_RAW_ERR_SMASK 0x100ull
-#define SEND_EGRESS_ERR_INFO_RAW_IPV6_ERR_SMASK 0x200ull
-#define SEND_EGRESS_ERR_INFO_SLID_ERR_SMASK 0x10ull
-#define SEND_EGRESS_ERR_INFO_TOO_LONG_BYPASS_PACKETS_ERR_SMASK 0x80000ull
-#define SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK 0x40000ull
-#define SEND_EGRESS_ERR_INFO_TOO_SMALL_BYPASS_PACKETS_ERR_SMASK 0x8000ull
-#define SEND_EGRESS_ERR_INFO_TOO_SMALL_IB_PACKETS_ERR_SMASK 0x4000ull
-#define SEND_EGRESS_ERR_INFO_VL_ERR_SMASK 0x2ull
-#define SEND_EGRESS_ERR_INFO_VL_MAPPING_ERR_SMASK 0x40ull
-#define SEND_EGRESS_ERR_MASK (TXE + 0x000000000088)
-#define SEND_EGRESS_ERR_SOURCE (TXE + 0x000000000F08)
-#define SEND_EGRESS_ERR_STATUS (TXE + 0x000000000080)
-#define SEND_EGRESS_ERR_STATUS_TX_CONFIG_PARITY_ERR_SMASK 0x8000ull
-#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_OVERRUN_ERR_SMASK \
-               0x200000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_PARITY_ERR_SMASK \
-               0x20000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK \
-               0x800000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_COR_ERR_SMASK \
-               0x2000000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_UNC_ERR_SMASK \
-               0x200000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_UNDERRUN_OR_PARITY_ERR_SMASK \
-               0x8ull
-#define SEND_EGRESS_ERR_STATUS_TX_HCRC_INSERTION_ERR_SMASK \
-               0x400000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_ILLEGAL_VL_ERR_SMASK 0x1000ull
-#define SEND_EGRESS_ERR_STATUS_TX_INCORRECT_LINK_STATE_ERR_SMASK 0x20ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_CSR_PARITY_ERR_SMASK 0x2000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO0_COR_ERR_SMASK \
-               0x1000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO0_UNC_OR_PARITY_ERR_SMASK \
-               0x100000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO1_COR_ERR_SMASK \
-               0x2000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO1_UNC_OR_PARITY_ERR_SMASK \
-               0x200000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO2_COR_ERR_SMASK \
-               0x4000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO2_UNC_OR_PARITY_ERR_SMASK \
-               0x400000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO3_COR_ERR_SMASK \
-               0x8000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO3_UNC_OR_PARITY_ERR_SMASK \
-               0x800000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO4_COR_ERR_SMASK \
-               0x10000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO4_UNC_OR_PARITY_ERR_SMASK \
-               0x1000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO5_COR_ERR_SMASK \
-               0x20000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO5_UNC_OR_PARITY_ERR_SMASK \
-               0x2000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO6_COR_ERR_SMASK \
-               0x40000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO6_UNC_OR_PARITY_ERR_SMASK \
-               0x4000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO7_COR_ERR_SMASK \
-               0x80000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO7_UNC_OR_PARITY_ERR_SMASK \
-               0x8000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO8_COR_ERR_SMASK \
-               0x100000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO8_UNC_OR_PARITY_ERR_SMASK \
-               0x10000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_LINKDOWN_ERR_SMASK 0x10ull
-#define SEND_EGRESS_ERR_STATUS_TX_PIO_LAUNCH_INTF_PARITY_ERR_SMASK 0x80ull
-#define SEND_EGRESS_ERR_STATUS_TX_PKT_INTEGRITY_MEM_COR_ERR_SMASK 0x1ull
-#define SEND_EGRESS_ERR_STATUS_TX_PKT_INTEGRITY_MEM_UNC_ERR_SMASK 0x2ull
-#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_COR_ERR_SMASK \
-               0x1000000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_CSR_UNC_ERR_SMASK \
-               0x8000000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_UNC_ERR_SMASK \
-               0x100000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_COR_ERR_SMASK \
-               0x800000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_CSR_UNC_ERR_SMASK \
-               0x4000000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_UNC_ERR_SMASK \
-               0x80000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SB_HDR_COR_ERR_SMASK 0x400000000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SB_HDR_UNC_ERR_SMASK 0x40000000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SBRD_CTL_CSR_PARITY_ERR_SMASK 0x4000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SBRD_CTL_STATE_MACHINE_PARITY_ERR_SMASK \
-               0x800ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA0_DISALLOWED_PACKET_ERR_SMASK \
-               0x10000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA10_DISALLOWED_PACKET_ERR_SMASK \
-               0x4000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA11_DISALLOWED_PACKET_ERR_SMASK \
-               0x8000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA12_DISALLOWED_PACKET_ERR_SMASK \
-               0x10000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA13_DISALLOWED_PACKET_ERR_SMASK \
-               0x20000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA14_DISALLOWED_PACKET_ERR_SMASK \
-               0x40000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA15_DISALLOWED_PACKET_ERR_SMASK \
-               0x80000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA1_DISALLOWED_PACKET_ERR_SMASK \
-               0x20000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA2_DISALLOWED_PACKET_ERR_SMASK \
-               0x40000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA3_DISALLOWED_PACKET_ERR_SMASK \
-               0x80000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA4_DISALLOWED_PACKET_ERR_SMASK \
-               0x100000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA5_DISALLOWED_PACKET_ERR_SMASK \
-               0x200000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA6_DISALLOWED_PACKET_ERR_SMASK \
-               0x400000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA7_DISALLOWED_PACKET_ERR_SMASK \
-               0x800000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA8_DISALLOWED_PACKET_ERR_SMASK \
-               0x1000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA9_DISALLOWED_PACKET_ERR_SMASK \
-               0x2000000ull
-#define SEND_EGRESS_ERR_STATUS_TX_SDMA_LAUNCH_INTF_PARITY_ERR_SMASK \
-               0x100ull
-#define SEND_EGRESS_SEND_DMA_STATUS (TXE + 0x000000000E00)
-#define SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT 0
-#define SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \
-               0x3FFFull
-#define SEND_ERR_CLEAR (TXE + 0x0000000000F0)
-#define SEND_ERR_MASK (TXE + 0x0000000000E8)
-#define SEND_ERR_STATUS (TXE + 0x0000000000E0)
-#define SEND_ERR_STATUS_SEND_CSR_PARITY_ERR_SMASK 0x1ull
-#define SEND_ERR_STATUS_SEND_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull
-#define SEND_ERR_STATUS_SEND_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull
-#define SEND_HIGH_PRIORITY_LIMIT (TXE + 0x000000000030)
-#define SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK 0x3FFFull
-#define SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT 0
-#define SEND_HIGH_PRIORITY_LIST (TXE + 0x000000000180)
-#define SEND_LEN_CHECK0 (TXE + 0x0000000000D0)
-#define SEND_LEN_CHECK0_LEN_VL0_MASK 0xFFFull
-#define SEND_LEN_CHECK0_LEN_VL1_SHIFT 12
-#define SEND_LEN_CHECK1 (TXE + 0x0000000000D8)
-#define SEND_LEN_CHECK1_LEN_VL15_MASK 0xFFFull
-#define SEND_LEN_CHECK1_LEN_VL15_SHIFT 48
-#define SEND_LEN_CHECK1_LEN_VL4_MASK 0xFFFull
-#define SEND_LEN_CHECK1_LEN_VL5_SHIFT 12
-#define SEND_LOW_PRIORITY_LIST (TXE + 0x000000000100)
-#define SEND_LOW_PRIORITY_LIST_VL_MASK 0x7ull
-#define SEND_LOW_PRIORITY_LIST_VL_SHIFT 16
-#define SEND_LOW_PRIORITY_LIST_WEIGHT_MASK 0xFFull
-#define SEND_LOW_PRIORITY_LIST_WEIGHT_SHIFT 0
-#define SEND_PIO_ERR_CLEAR (TXE + 0x000000000050)
-#define SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK 0x20000ull
-#define SEND_PIO_ERR_MASK (TXE + 0x000000000048)
-#define SEND_PIO_ERR_STATUS (TXE + 0x000000000040)
-#define SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK \
-               0x1000000ull
-#define SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK 0x8000ull
-#define SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK 0x4ull
-#define SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK \
-               0x100000000ull
-#define SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_COR_ERR_SMASK 0x100000ull
-#define SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK 0x80000ull
-#define SEND_PIO_ERR_STATUS_PIO_INIT_SM_IN_ERR_SMASK 0x20000ull
-#define SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK \
-               0x200000000ull
-#define SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK 0x20ull
-#define SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK \
-               0x400000000ull
-#define SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK 0x40ull
-#define SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK \
-               0x800000000ull
-#define SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK 0x200ull
-#define SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK 0x40000ull
-#define SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK 0x10000000ull
-#define SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK 0x10000ull
-#define SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK 0x20000000ull
-#define SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK 0x8ull
-#define SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK 0x10ull
-#define SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK 0x80ull
-#define SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK \
-               0x100ull
-#define SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK 0x400ull
-#define SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK 0x400000ull
-#define SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK 0x8000000ull
-#define SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK 0x4000000ull
-#define SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK 0x2000000ull
-#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_COR_ERR_SMASK 0x2000ull
-#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK 0x800ull
-#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_COR_ERR_SMASK 0x4000ull
-#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK 0x1000ull
-#define SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK 0x2ull
-#define SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK 0x1ull
-#define SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK 0x200000ull
-#define SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK 0x800000ull
-#define SEND_PIO_INIT_CTXT (TXE + 0x000000000038)
-#define SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK 0x1ull
-#define SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_MASK 0xFFull
-#define SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_SHIFT 8
-#define SEND_PIO_INIT_CTXT_PIO_INIT_ERR_SMASK 0x8ull
-#define SEND_PIO_INIT_CTXT_PIO_INIT_IN_PROGRESS_SMASK 0x4ull
-#define SEND_PIO_INIT_CTXT_PIO_SINGLE_CTXT_INIT_SMASK 0x2ull
-#define SEND_PIO_MEM_SIZE (TXE + 0x000000000020)
-#define SEND_SC2VLT0 (TXE + 0x0000000000B0)
-#define SEND_SC2VLT0_SC0_SHIFT 0
-#define SEND_SC2VLT0_SC1_SHIFT 8
-#define SEND_SC2VLT0_SC2_SHIFT 16
-#define SEND_SC2VLT0_SC3_SHIFT 24
-#define SEND_SC2VLT0_SC4_SHIFT 32
-#define SEND_SC2VLT0_SC5_SHIFT 40
-#define SEND_SC2VLT0_SC6_SHIFT 48
-#define SEND_SC2VLT0_SC7_SHIFT 56
-#define SEND_SC2VLT1 (TXE + 0x0000000000B8)
-#define SEND_SC2VLT1_SC10_SHIFT 16
-#define SEND_SC2VLT1_SC11_SHIFT 24
-#define SEND_SC2VLT1_SC12_SHIFT 32
-#define SEND_SC2VLT1_SC13_SHIFT 40
-#define SEND_SC2VLT1_SC14_SHIFT 48
-#define SEND_SC2VLT1_SC15_SHIFT 56
-#define SEND_SC2VLT1_SC8_SHIFT 0
-#define SEND_SC2VLT1_SC9_SHIFT 8
-#define SEND_SC2VLT2 (TXE + 0x0000000000C0)
-#define SEND_SC2VLT2_SC16_SHIFT 0
-#define SEND_SC2VLT2_SC17_SHIFT 8
-#define SEND_SC2VLT2_SC18_SHIFT 16
-#define SEND_SC2VLT2_SC19_SHIFT 24
-#define SEND_SC2VLT2_SC20_SHIFT 32
-#define SEND_SC2VLT2_SC21_SHIFT 40
-#define SEND_SC2VLT2_SC22_SHIFT 48
-#define SEND_SC2VLT2_SC23_SHIFT 56
-#define SEND_SC2VLT3 (TXE + 0x0000000000C8)
-#define SEND_SC2VLT3_SC24_SHIFT 0
-#define SEND_SC2VLT3_SC25_SHIFT 8
-#define SEND_SC2VLT3_SC26_SHIFT 16
-#define SEND_SC2VLT3_SC27_SHIFT 24
-#define SEND_SC2VLT3_SC28_SHIFT 32
-#define SEND_SC2VLT3_SC29_SHIFT 40
-#define SEND_SC2VLT3_SC30_SHIFT 48
-#define SEND_SC2VLT3_SC31_SHIFT 56
-#define SEND_STATIC_RATE_CONTROL (TXE + 0x0000000000A8)
-#define SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT 0
-#define SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK 0xFFFFull
-#define PCIE_CFG_REG_PL2 (PCIE + 0x000000000708)
-#define PCIE_CFG_REG_PL3 (PCIE + 0x00000000070C)
-#define PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SHIFT 27
-#define PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SMASK 0x38000000
-#define PCIE_CFG_REG_PL102 (PCIE + 0x000000000898)
-#define PCIE_CFG_REG_PL102_GEN3_EQ_POST_CURSOR_PSET_SHIFT 12
-#define PCIE_CFG_REG_PL102_GEN3_EQ_CURSOR_PSET_SHIFT 6
-#define PCIE_CFG_REG_PL102_GEN3_EQ_PRE_CURSOR_PSET_SHIFT 0
-#define PCIE_CFG_REG_PL103 (PCIE + 0x00000000089C)
-#define PCIE_CFG_REG_PL105 (PCIE + 0x0000000008A4)
-#define PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK 0x1ull
-#define PCIE_CFG_REG_PL2_LOW_PWR_ENT_CNT_SHIFT 24
-#define PCIE_CFG_REG_PL100 (PCIE + 0x000000000890)
-#define PCIE_CFG_REG_PL100_EQ_EIEOS_CNT_SMASK 0x400ull
-#define PCIE_CFG_REG_PL101 (PCIE + 0x000000000894)
-#define PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT 6
-#define PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT 0
-#define PCIE_CFG_REG_PL106 (PCIE + 0x0000000008A8)
-#define PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT 8
-#define PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK 0x20ull
-#define PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK 0x10ull
-#define CCE_INT_BLOCKED (CCE + 0x000000110C00)
-#define SEND_DMA_IDLE_CNT (TXE + 0x000000200040)
-#define SEND_DMA_DESC_FETCHED_CNT (TXE + 0x000000200058)
-#define CCE_MSIX_PBA_OFFSET 0X0110000
-
-#endif          /* DEF_CHIP_REG */
diff --git a/drivers/staging/rdma/hfi1/common.h b/drivers/staging/rdma/hfi1/common.h
deleted file mode 100644 (file)
index e9b6bb3..0000000
+++ /dev/null
@@ -1,408 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifndef _COMMON_H
-#define _COMMON_H
-
-#include <rdma/hfi/hfi1_user.h>
-
-/*
- * This file contains defines, structures, etc. that are used
- * to communicate between kernel and user code.
- */
-
-/* version of protocol header (known to chip also). In the long run,
- * we should be able to generate and accept a range of version numbers;
- * for now we only accept one, and it's compiled in.
- */
-#define IPS_PROTO_VERSION 2
-
-/*
- * These are compile time constants that you may want to enable or disable
- * if you are trying to debug problems with code or performance.
- * HFI1_VERBOSE_TRACING define as 1 if you want additional tracing in
- * fast path code
- * HFI1_TRACE_REGWRITES define as 1 if you want register writes to be
- * traced in fast path code
- * _HFI1_TRACING define as 0 if you want to remove all tracing in a
- * compilation unit
- */
-
-/*
- * If a packet's QP[23:16] bits match this value, then it is
- * a PSM packet and the hardware will expect a KDETH header
- * following the BTH.
- */
-#define DEFAULT_KDETH_QP 0x80
-
-/* driver/hw feature set bitmask */
-#define HFI1_CAP_USER_SHIFT      24
-#define HFI1_CAP_MASK            ((1UL << HFI1_CAP_USER_SHIFT) - 1)
-/* locked flag - if set, only HFI1_CAP_WRITABLE_MASK bits can be set */
-#define HFI1_CAP_LOCKED_SHIFT    63
-#define HFI1_CAP_LOCKED_MASK     0x1ULL
-#define HFI1_CAP_LOCKED_SMASK    (HFI1_CAP_LOCKED_MASK << HFI1_CAP_LOCKED_SHIFT)
-/* extra bits used between kernel and user processes */
-#define HFI1_CAP_MISC_SHIFT      (HFI1_CAP_USER_SHIFT * 2)
-#define HFI1_CAP_MISC_MASK       ((1ULL << (HFI1_CAP_LOCKED_SHIFT - \
-                                          HFI1_CAP_MISC_SHIFT)) - 1)
-
-#define HFI1_CAP_KSET(cap) ({ hfi1_cap_mask |= HFI1_CAP_##cap; hfi1_cap_mask; })
-#define HFI1_CAP_KCLEAR(cap)                                           \
-       ({                                                              \
-               hfi1_cap_mask &= ~HFI1_CAP_##cap;                       \
-               hfi1_cap_mask;                                          \
-       })
-#define HFI1_CAP_USET(cap)                                             \
-       ({                                                              \
-               hfi1_cap_mask |= (HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \
-               hfi1_cap_mask;                                          \
-               })
-#define HFI1_CAP_UCLEAR(cap)                                           \
-       ({                                                              \
-               hfi1_cap_mask &= ~(HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \
-               hfi1_cap_mask;                                          \
-       })
-#define HFI1_CAP_SET(cap)                                              \
-       ({                                                              \
-               hfi1_cap_mask |= (HFI1_CAP_##cap | (HFI1_CAP_##cap <<   \
-                                                 HFI1_CAP_USER_SHIFT)); \
-               hfi1_cap_mask;                                          \
-       })
-#define HFI1_CAP_CLEAR(cap)                                            \
-       ({                                                              \
-               hfi1_cap_mask &= ~(HFI1_CAP_##cap |                     \
-                                 (HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT)); \
-               hfi1_cap_mask;                                          \
-       })
-#define HFI1_CAP_LOCK()                                                        \
-       ({ hfi1_cap_mask |= HFI1_CAP_LOCKED_SMASK; hfi1_cap_mask; })
-#define HFI1_CAP_LOCKED() (!!(hfi1_cap_mask & HFI1_CAP_LOCKED_SMASK))
-/*
- * The set of capability bits that can be changed after initial load
- * This set is the same for kernel and user contexts. However, for
- * user contexts, the set can be further filtered by using the
- * HFI1_CAP_RESERVED_MASK bits.
- */
-#define HFI1_CAP_WRITABLE_MASK   (HFI1_CAP_SDMA_AHG |                  \
-                                 HFI1_CAP_HDRSUPP |                    \
-                                 HFI1_CAP_MULTI_PKT_EGR |              \
-                                 HFI1_CAP_NODROP_RHQ_FULL |            \
-                                 HFI1_CAP_NODROP_EGR_FULL |            \
-                                 HFI1_CAP_ALLOW_PERM_JKEY |            \
-                                 HFI1_CAP_STATIC_RATE_CTRL |           \
-                                 HFI1_CAP_PRINT_UNIMPL |               \
-                                 HFI1_CAP_TID_UNMAP)
-/*
- * A set of capability bits that are "global" and are not allowed to be
- * set in the user bitmask.
- */
-#define HFI1_CAP_RESERVED_MASK   ((HFI1_CAP_SDMA |                     \
-                                 HFI1_CAP_USE_SDMA_HEAD |              \
-                                 HFI1_CAP_EXTENDED_PSN |               \
-                                 HFI1_CAP_PRINT_UNIMPL |               \
-                                 HFI1_CAP_NO_INTEGRITY |               \
-                                 HFI1_CAP_PKEY_CHECK) <<               \
-                                HFI1_CAP_USER_SHIFT)
-/*
- * Set of capabilities that need to be enabled for kernel context in
- * order to be allowed for user contexts, as well.
- */
-#define HFI1_CAP_MUST_HAVE_KERN (HFI1_CAP_STATIC_RATE_CTRL)
-/* Default enabled capabilities (both kernel and user) */
-#define HFI1_CAP_MASK_DEFAULT    (HFI1_CAP_HDRSUPP |                   \
-                                HFI1_CAP_NODROP_RHQ_FULL |             \
-                                HFI1_CAP_NODROP_EGR_FULL |             \
-                                HFI1_CAP_SDMA |                        \
-                                HFI1_CAP_PRINT_UNIMPL |                \
-                                HFI1_CAP_STATIC_RATE_CTRL |            \
-                                HFI1_CAP_PKEY_CHECK |                  \
-                                HFI1_CAP_MULTI_PKT_EGR |               \
-                                HFI1_CAP_EXTENDED_PSN |                \
-                                ((HFI1_CAP_HDRSUPP |                   \
-                                  HFI1_CAP_MULTI_PKT_EGR |             \
-                                  HFI1_CAP_STATIC_RATE_CTRL |          \
-                                  HFI1_CAP_PKEY_CHECK |                \
-                                  HFI1_CAP_EARLY_CREDIT_RETURN) <<     \
-                                 HFI1_CAP_USER_SHIFT))
-/*
- * A bitmask of kernel/global capabilities that should be communicated
- * to user level processes.
- */
-#define HFI1_CAP_K2U (HFI1_CAP_SDMA |                  \
-                    HFI1_CAP_EXTENDED_PSN |            \
-                    HFI1_CAP_PKEY_CHECK |              \
-                    HFI1_CAP_NO_INTEGRITY)
-
-#define HFI1_USER_SWVERSION ((HFI1_USER_SWMAJOR << 16) | HFI1_USER_SWMINOR)
-
-#ifndef HFI1_KERN_TYPE
-#define HFI1_KERN_TYPE 0
-#endif
-
-/*
- * Similarly, this is the kernel version going back to the user.  It's
- * slightly different, in that we want to tell if the driver was built as
- * part of a Intel release, or from the driver from openfabrics.org,
- * kernel.org, or a standard distribution, for support reasons.
- * The high bit is 0 for non-Intel and 1 for Intel-built/supplied.
- *
- * It's returned by the driver to the user code during initialization in the
- * spi_sw_version field of hfi1_base_info, so the user code can in turn
- * check for compatibility with the kernel.
-*/
-#define HFI1_KERN_SWVERSION ((HFI1_KERN_TYPE << 31) | HFI1_USER_SWVERSION)
-
-/*
- * Define the driver version number.  This is something that refers only
- * to the driver itself, not the software interfaces it supports.
- */
-#ifndef HFI1_DRIVER_VERSION_BASE
-#define HFI1_DRIVER_VERSION_BASE "0.9-294"
-#endif
-
-/* create the final driver version string */
-#ifdef HFI1_IDSTR
-#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE " " HFI1_IDSTR
-#else
-#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE
-#endif
-
-/*
- * Diagnostics can send a packet by writing the following
- * struct to the diag packet special file.
- *
- * This allows a custom PBC qword, so that special modes and deliberate
- * changes to CRCs can be used.
- */
-#define _DIAG_PKT_VERS 1
-struct diag_pkt {
-       __u16 version;          /* structure version */
-       __u16 unit;             /* which device */
-       __u16 sw_index;         /* send sw index to use */
-       __u16 len;              /* data length, in bytes */
-       __u16 port;             /* port number */
-       __u16 unused;
-       __u32 flags;            /* call flags */
-       __u64 data;             /* user data pointer */
-       __u64 pbc;              /* PBC for the packet */
-};
-
-/* diag_pkt flags */
-#define F_DIAGPKT_WAIT 0x1     /* wait until packet is sent */
-
-/*
- * The next set of defines are for packet headers, and chip register
- * and memory bits that are visible to and/or used by user-mode software.
- */
-
-/*
- * Receive Header Flags
- */
-#define RHF_PKT_LEN_SHIFT      0
-#define RHF_PKT_LEN_MASK       0xfffull
-#define RHF_PKT_LEN_SMASK (RHF_PKT_LEN_MASK << RHF_PKT_LEN_SHIFT)
-
-#define RHF_RCV_TYPE_SHIFT     12
-#define RHF_RCV_TYPE_MASK      0x7ull
-#define RHF_RCV_TYPE_SMASK (RHF_RCV_TYPE_MASK << RHF_RCV_TYPE_SHIFT)
-
-#define RHF_USE_EGR_BFR_SHIFT  15
-#define RHF_USE_EGR_BFR_MASK   0x1ull
-#define RHF_USE_EGR_BFR_SMASK (RHF_USE_EGR_BFR_MASK << RHF_USE_EGR_BFR_SHIFT)
-
-#define RHF_EGR_INDEX_SHIFT    16
-#define RHF_EGR_INDEX_MASK     0x7ffull
-#define RHF_EGR_INDEX_SMASK (RHF_EGR_INDEX_MASK << RHF_EGR_INDEX_SHIFT)
-
-#define RHF_DC_INFO_SHIFT      27
-#define RHF_DC_INFO_MASK       0x1ull
-#define RHF_DC_INFO_SMASK (RHF_DC_INFO_MASK << RHF_DC_INFO_SHIFT)
-
-#define RHF_RCV_SEQ_SHIFT      28
-#define RHF_RCV_SEQ_MASK       0xfull
-#define RHF_RCV_SEQ_SMASK (RHF_RCV_SEQ_MASK << RHF_RCV_SEQ_SHIFT)
-
-#define RHF_EGR_OFFSET_SHIFT   32
-#define RHF_EGR_OFFSET_MASK    0xfffull
-#define RHF_EGR_OFFSET_SMASK (RHF_EGR_OFFSET_MASK << RHF_EGR_OFFSET_SHIFT)
-#define RHF_HDRQ_OFFSET_SHIFT  44
-#define RHF_HDRQ_OFFSET_MASK   0x1ffull
-#define RHF_HDRQ_OFFSET_SMASK (RHF_HDRQ_OFFSET_MASK << RHF_HDRQ_OFFSET_SHIFT)
-#define RHF_K_HDR_LEN_ERR      (0x1ull << 53)
-#define RHF_DC_UNC_ERR         (0x1ull << 54)
-#define RHF_DC_ERR             (0x1ull << 55)
-#define RHF_RCV_TYPE_ERR_SHIFT 56
-#define RHF_RCV_TYPE_ERR_MASK  0x7ul
-#define RHF_RCV_TYPE_ERR_SMASK (RHF_RCV_TYPE_ERR_MASK << RHF_RCV_TYPE_ERR_SHIFT)
-#define RHF_TID_ERR            (0x1ull << 59)
-#define RHF_LEN_ERR            (0x1ull << 60)
-#define RHF_ECC_ERR            (0x1ull << 61)
-#define RHF_VCRC_ERR           (0x1ull << 62)
-#define RHF_ICRC_ERR           (0x1ull << 63)
-
-#define RHF_ERROR_SMASK 0xffe0000000000000ull          /* bits 63:53 */
-
-/* RHF receive types */
-#define RHF_RCV_TYPE_EXPECTED 0
-#define RHF_RCV_TYPE_EAGER    1
-#define RHF_RCV_TYPE_IB       2 /* normal IB, IB Raw, or IPv6 */
-#define RHF_RCV_TYPE_ERROR    3
-#define RHF_RCV_TYPE_BYPASS   4
-#define RHF_RCV_TYPE_INVALID5 5
-#define RHF_RCV_TYPE_INVALID6 6
-#define RHF_RCV_TYPE_INVALID7 7
-
-/* RHF receive type error - expected packet errors */
-#define RHF_RTE_EXPECTED_FLOW_SEQ_ERR  0x2
-#define RHF_RTE_EXPECTED_FLOW_GEN_ERR  0x4
-
-/* RHF receive type error - eager packet errors */
-#define RHF_RTE_EAGER_NO_ERR           0x0
-
-/* RHF receive type error - IB packet errors */
-#define RHF_RTE_IB_NO_ERR              0x0
-
-/* RHF receive type error - error packet errors */
-#define RHF_RTE_ERROR_NO_ERR           0x0
-#define RHF_RTE_ERROR_OP_CODE_ERR      0x1
-#define RHF_RTE_ERROR_KHDR_MIN_LEN_ERR 0x2
-#define RHF_RTE_ERROR_KHDR_HCRC_ERR    0x3
-#define RHF_RTE_ERROR_KHDR_KVER_ERR    0x4
-#define RHF_RTE_ERROR_CONTEXT_ERR      0x5
-#define RHF_RTE_ERROR_KHDR_TID_ERR     0x6
-
-/* RHF receive type error - bypass packet errors */
-#define RHF_RTE_BYPASS_NO_ERR          0x0
-
-/*
- * This structure contains the first field common to all protocols
- * that employ this chip.
- */
-struct hfi1_message_header {
-       __be16 lrh[4];
-};
-
-/* IB - LRH header constants */
-#define HFI1_LRH_GRH 0x0003      /* 1. word of IB LRH - next header: GRH */
-#define HFI1_LRH_BTH 0x0002      /* 1. word of IB LRH - next header: BTH */
-
-/* misc. */
-#define SIZE_OF_CRC 1
-
-#define LIM_MGMT_P_KEY       0x7FFF
-#define FULL_MGMT_P_KEY      0xFFFF
-
-#define DEFAULT_P_KEY LIM_MGMT_P_KEY
-#define HFI1_AETH_CREDIT_SHIFT 24
-#define HFI1_AETH_CREDIT_MASK 0x1F
-#define HFI1_AETH_CREDIT_INVAL 0x1F
-#define HFI1_MSN_MASK 0xFFFFFF
-#define HFI1_FECN_SHIFT 31
-#define HFI1_FECN_MASK 1
-#define HFI1_FECN_SMASK BIT(HFI1_FECN_SHIFT)
-#define HFI1_BECN_SHIFT 30
-#define HFI1_BECN_MASK 1
-#define HFI1_BECN_SMASK BIT(HFI1_BECN_SHIFT)
-
-static inline __u64 rhf_to_cpu(const __le32 *rbuf)
-{
-       return __le64_to_cpu(*((__le64 *)rbuf));
-}
-
-static inline u64 rhf_err_flags(u64 rhf)
-{
-       return rhf & RHF_ERROR_SMASK;
-}
-
-static inline u32 rhf_rcv_type(u64 rhf)
-{
-       return (rhf >> RHF_RCV_TYPE_SHIFT) & RHF_RCV_TYPE_MASK;
-}
-
-static inline u32 rhf_rcv_type_err(u64 rhf)
-{
-       return (rhf >> RHF_RCV_TYPE_ERR_SHIFT) & RHF_RCV_TYPE_ERR_MASK;
-}
-
-/* return size is in bytes, not DWORDs */
-static inline u32 rhf_pkt_len(u64 rhf)
-{
-       return ((rhf & RHF_PKT_LEN_SMASK) >> RHF_PKT_LEN_SHIFT) << 2;
-}
-
-static inline u32 rhf_egr_index(u64 rhf)
-{
-       return (rhf >> RHF_EGR_INDEX_SHIFT) & RHF_EGR_INDEX_MASK;
-}
-
-static inline u32 rhf_rcv_seq(u64 rhf)
-{
-       return (rhf >> RHF_RCV_SEQ_SHIFT) & RHF_RCV_SEQ_MASK;
-}
-
-/* returned offset is in DWORDS */
-static inline u32 rhf_hdrq_offset(u64 rhf)
-{
-       return (rhf >> RHF_HDRQ_OFFSET_SHIFT) & RHF_HDRQ_OFFSET_MASK;
-}
-
-static inline u64 rhf_use_egr_bfr(u64 rhf)
-{
-       return rhf & RHF_USE_EGR_BFR_SMASK;
-}
-
-static inline u64 rhf_dc_info(u64 rhf)
-{
-       return rhf & RHF_DC_INFO_SMASK;
-}
-
-static inline u32 rhf_egr_buf_offset(u64 rhf)
-{
-       return (rhf >> RHF_EGR_OFFSET_SHIFT) & RHF_EGR_OFFSET_MASK;
-}
-#endif /* _COMMON_H */
diff --git a/drivers/staging/rdma/hfi1/debugfs.c b/drivers/staging/rdma/hfi1/debugfs.c
deleted file mode 100644 (file)
index dbab9d9..0000000
+++ /dev/null
@@ -1,1145 +0,0 @@
-#ifdef CONFIG_DEBUG_FS
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#include <linux/debugfs.h>
-#include <linux/seq_file.h>
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/module.h>
-
-#include "hfi.h"
-#include "debugfs.h"
-#include "device.h"
-#include "qp.h"
-#include "sdma.h"
-
-static struct dentry *hfi1_dbg_root;
-
-#define private2dd(file) (file_inode(file)->i_private)
-#define private2ppd(file) (file_inode(file)->i_private)
-
-#define DEBUGFS_SEQ_FILE_OPS(name) \
-static const struct seq_operations _##name##_seq_ops = { \
-       .start = _##name##_seq_start, \
-       .next  = _##name##_seq_next, \
-       .stop  = _##name##_seq_stop, \
-       .show  = _##name##_seq_show \
-}
-
-#define DEBUGFS_SEQ_FILE_OPEN(name) \
-static int _##name##_open(struct inode *inode, struct file *s) \
-{ \
-       struct seq_file *seq; \
-       int ret; \
-       ret =  seq_open(s, &_##name##_seq_ops); \
-       if (ret) \
-               return ret; \
-       seq = s->private_data; \
-       seq->private = inode->i_private; \
-       return 0; \
-}
-
-#define DEBUGFS_FILE_OPS(name) \
-static const struct file_operations _##name##_file_ops = { \
-       .owner   = THIS_MODULE, \
-       .open    = _##name##_open, \
-       .read    = seq_read, \
-       .llseek  = seq_lseek, \
-       .release = seq_release \
-}
-
-#define DEBUGFS_FILE_CREATE(name, parent, data, ops, mode)     \
-do { \
-       struct dentry *ent; \
-       ent = debugfs_create_file(name, mode, parent, \
-               data, ops); \
-       if (!ent) \
-               pr_warn("create of %s failed\n", name); \
-} while (0)
-
-#define DEBUGFS_SEQ_FILE_CREATE(name, parent, data) \
-       DEBUGFS_FILE_CREATE(#name, parent, data, &_##name##_file_ops, S_IRUGO)
-
-static void *_opcode_stats_seq_start(struct seq_file *s, loff_t *pos)
-__acquires(RCU)
-{
-       struct hfi1_opcode_stats_perctx *opstats;
-
-       rcu_read_lock();
-       if (*pos >= ARRAY_SIZE(opstats->stats))
-               return NULL;
-       return pos;
-}
-
-static void *_opcode_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
-{
-       struct hfi1_opcode_stats_perctx *opstats;
-
-       ++*pos;
-       if (*pos >= ARRAY_SIZE(opstats->stats))
-               return NULL;
-       return pos;
-}
-
-static void _opcode_stats_seq_stop(struct seq_file *s, void *v)
-__releases(RCU)
-{
-       rcu_read_unlock();
-}
-
-static int _opcode_stats_seq_show(struct seq_file *s, void *v)
-{
-       loff_t *spos = v;
-       loff_t i = *spos, j;
-       u64 n_packets = 0, n_bytes = 0;
-       struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
-       struct hfi1_devdata *dd = dd_from_dev(ibd);
-
-       for (j = 0; j < dd->first_user_ctxt; j++) {
-               if (!dd->rcd[j])
-                       continue;
-               n_packets += dd->rcd[j]->opstats->stats[i].n_packets;
-               n_bytes += dd->rcd[j]->opstats->stats[i].n_bytes;
-       }
-       if (!n_packets && !n_bytes)
-               return SEQ_SKIP;
-       seq_printf(s, "%02llx %llu/%llu\n", i,
-                  (unsigned long long)n_packets,
-                  (unsigned long long)n_bytes);
-
-       return 0;
-}
-
-DEBUGFS_SEQ_FILE_OPS(opcode_stats);
-DEBUGFS_SEQ_FILE_OPEN(opcode_stats)
-DEBUGFS_FILE_OPS(opcode_stats);
-
-static void *_ctx_stats_seq_start(struct seq_file *s, loff_t *pos)
-{
-       struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
-       struct hfi1_devdata *dd = dd_from_dev(ibd);
-
-       if (!*pos)
-               return SEQ_START_TOKEN;
-       if (*pos >= dd->first_user_ctxt)
-               return NULL;
-       return pos;
-}
-
-static void *_ctx_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
-{
-       struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
-       struct hfi1_devdata *dd = dd_from_dev(ibd);
-
-       if (v == SEQ_START_TOKEN)
-               return pos;
-
-       ++*pos;
-       if (*pos >= dd->first_user_ctxt)
-               return NULL;
-       return pos;
-}
-
-static void _ctx_stats_seq_stop(struct seq_file *s, void *v)
-{
-       /* nothing allocated */
-}
-
-static int _ctx_stats_seq_show(struct seq_file *s, void *v)
-{
-       loff_t *spos;
-       loff_t i, j;
-       u64 n_packets = 0;
-       struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
-       struct hfi1_devdata *dd = dd_from_dev(ibd);
-
-       if (v == SEQ_START_TOKEN) {
-               seq_puts(s, "Ctx:npkts\n");
-               return 0;
-       }
-
-       spos = v;
-       i = *spos;
-
-       if (!dd->rcd[i])
-               return SEQ_SKIP;
-
-       for (j = 0; j < ARRAY_SIZE(dd->rcd[i]->opstats->stats); j++)
-               n_packets += dd->rcd[i]->opstats->stats[j].n_packets;
-
-       if (!n_packets)
-               return SEQ_SKIP;
-
-       seq_printf(s, "  %llu:%llu\n", i, n_packets);
-       return 0;
-}
-
-DEBUGFS_SEQ_FILE_OPS(ctx_stats);
-DEBUGFS_SEQ_FILE_OPEN(ctx_stats)
-DEBUGFS_FILE_OPS(ctx_stats);
-
-static void *_qp_stats_seq_start(struct seq_file *s, loff_t *pos)
-__acquires(RCU)
-{
-       struct qp_iter *iter;
-       loff_t n = *pos;
-
-       rcu_read_lock();
-       iter = qp_iter_init(s->private);
-       if (!iter)
-               return NULL;
-
-       while (n--) {
-               if (qp_iter_next(iter)) {
-                       kfree(iter);
-                       return NULL;
-               }
-       }
-
-       return iter;
-}
-
-static void *_qp_stats_seq_next(struct seq_file *s, void *iter_ptr,
-                               loff_t *pos)
-{
-       struct qp_iter *iter = iter_ptr;
-
-       (*pos)++;
-
-       if (qp_iter_next(iter)) {
-               kfree(iter);
-               return NULL;
-       }
-
-       return iter;
-}
-
-static void _qp_stats_seq_stop(struct seq_file *s, void *iter_ptr)
-__releases(RCU)
-{
-       rcu_read_unlock();
-}
-
-static int _qp_stats_seq_show(struct seq_file *s, void *iter_ptr)
-{
-       struct qp_iter *iter = iter_ptr;
-
-       if (!iter)
-               return 0;
-
-       qp_iter_print(s, iter);
-
-       return 0;
-}
-
-DEBUGFS_SEQ_FILE_OPS(qp_stats);
-DEBUGFS_SEQ_FILE_OPEN(qp_stats)
-DEBUGFS_FILE_OPS(qp_stats);
-
-/*
- * seq_file start callback for "sdes".
- *
- * Enters an RCU read-side section (left in _sdes_seq_stop()) and validates
- * the requested index against the device's per_sdma array.
- *
- * Return: @pos itself as the iteration token, or NULL when the device has
- * no per_sdma array or *pos is not below dd->num_sdma.
- */
-static void *_sdes_seq_start(struct seq_file *s, loff_t *pos)
-__acquires(RCU)
-{
-       struct hfi1_devdata *dd;
-
-       rcu_read_lock();
-       dd = dd_from_dev((struct hfi1_ibdev *)s->private);
-       if (dd->per_sdma && *pos < dd->num_sdma)
-               return pos;
-
-       return NULL;
-}
-
-/*
- * seq_file next callback for "sdes": move to the following engine index.
- *
- * Return: @pos while the new index is below dd->num_sdma and the per_sdma
- * array exists, NULL otherwise.
- */
-static void *_sdes_seq_next(struct seq_file *s, void *v, loff_t *pos)
-{
-       struct hfi1_devdata *dd;
-
-       dd = dd_from_dev((struct hfi1_ibdev *)s->private);
-       *pos += 1;
-       if (dd->per_sdma && *pos < dd->num_sdma)
-               return pos;
-
-       return NULL;
-}
-
-/*
- * seq_file stop callback for "sdes": leave the RCU read-side section that
- * _sdes_seq_start() entered.  There is nothing to free; the iteration token
- * is the seq_file position itself.
- */
-static void _sdes_seq_stop(struct seq_file *s, void *v)
-__releases(RCU)
-{
-       rcu_read_unlock();
-}
-
-/*
- * seq_file show callback for "sdes": dump the per_sdma entry selected by
- * the index stored in @v.  Always returns 0.
- */
-static int _sdes_seq_show(struct seq_file *s, void *v)
-{
-       struct hfi1_devdata *dd;
-       loff_t idx = *(loff_t *)v;
-
-       dd = dd_from_dev((struct hfi1_ibdev *)s->private);
-       sdma_seqfile_dump_sde(s, &dd->per_sdma[idx]);
-
-       return 0;
-}
-
-/*
- * Instantiate the debugfs plumbing for the "sdes" file from the
- * _sdes_seq_* callbacks above (macros defined outside this excerpt).
- */
-DEBUGFS_SEQ_FILE_OPS(sdes);
-DEBUGFS_SEQ_FILE_OPEN(sdes)
-DEBUGFS_FILE_OPS(sdes);
-
-/*
- * debugfs read handler for "counters": copy the per-device counter values
- * reported by hfi1_read_cntrs() to user space.
- *
- * NOTE(review): simple_read_from_buffer() copies to user memory and may
- * fault while rcu_read_lock() is held -- confirm this is acceptable.
- */
-static ssize_t dev_counters_read(struct file *file, char __user *buf,
-                                size_t count, loff_t *ppos)
-{
-       struct hfi1_devdata *dd;
-       u64 *values;
-       size_t nbytes;
-       ssize_t copied;
-
-       rcu_read_lock();
-       dd = private2dd(file);
-       nbytes = hfi1_read_cntrs(dd, NULL, &values);
-       copied = simple_read_from_buffer(buf, count, ppos, values, nbytes);
-       rcu_read_unlock();
-
-       return copied;
-}
-
-/*
- * debugfs read handler for "counter_names": copy the per-device counter
- * names reported by hfi1_read_cntrs() to user space.
- *
- * NOTE(review): simple_read_from_buffer() copies to user memory and may
- * fault while rcu_read_lock() is held -- confirm this is acceptable.
- */
-static ssize_t dev_names_read(struct file *file, char __user *buf,
-                             size_t count, loff_t *ppos)
-{
-       struct hfi1_devdata *dd;
-       char *name_buf;
-       size_t nbytes;
-       ssize_t copied;
-
-       rcu_read_lock();
-       dd = private2dd(file);
-       nbytes = hfi1_read_cntrs(dd, &name_buf, NULL);
-       copied = simple_read_from_buffer(buf, count, ppos, name_buf, nbytes);
-       rcu_read_unlock();
-
-       return copied;
-}
-
-/*
- * One debugfs file description: the file name (used as a printf-style
- * format for the per-port files, see hfi1_dbg_ibdev_init()) and the
- * file_operations that implement it.
- */
-struct counter_info {
-       char *name;
-       const struct file_operations ops;
-};
-
-/*
- * Could use file_inode(file)->i_ino to figure out which file,
- * instead of separate routine for each, but for now, this works...
- */
-
-/*
- * debugfs read handler for "portcounter_names": copy the per-port counter
- * names to user space.  The names are the same for every port, so they are
- * taken from dd->pport.
- */
-static ssize_t portnames_read(struct file *file, char __user *buf,
-                             size_t count, loff_t *ppos)
-{
-       struct hfi1_devdata *dd;
-       char *name_buf;
-       size_t nbytes;
-       ssize_t copied;
-
-       rcu_read_lock();
-       dd = private2dd(file);
-       nbytes = hfi1_read_portcntrs(dd->pport, &name_buf, NULL);
-       copied = simple_read_from_buffer(buf, count, ppos, name_buf, nbytes);
-       rcu_read_unlock();
-
-       return copied;
-}
-
-/*
- * debugfs read handler for "port<N>counters": copy the counter values of
- * the port attached to this file to user space.
- */
-static ssize_t portcntrs_debugfs_read(struct file *file, char __user *buf,
-                                     size_t count, loff_t *ppos)
-{
-       struct hfi1_pportdata *ppd;
-       u64 *values;
-       size_t nbytes;
-       ssize_t copied;
-
-       rcu_read_lock();
-       ppd = private2ppd(file);
-       nbytes = hfi1_read_portcntrs(ppd, NULL, &values);
-       copied = simple_read_from_buffer(buf, count, ppos, values, nbytes);
-       rcu_read_unlock();
-
-       return copied;
-}
-
-/*
- * Append one line to the text buffer if a dynamic resource flag is set.
- *
- * @scratch0: value read from ASIC_CFG_SCRATCH
- * @p, @size: output buffer and its total size
- * @used:     bytes already written to @p; advanced by what is appended
- * @this_hfi: id of the HFI doing the reporting
- * @hfi:      HFI whose copy of the flag is tested (non-zero selects the
- *            copy shifted up by CR_DYN_SHIFT)
- * @flag:     flag bit(s) as defined for HFI 0
- * @what:     human-readable resource name
- */
-static void check_dyn_flag(u64 scratch0, char *p, int size, int *used,
-                          int this_hfi, int hfi, u32 flag, const char *what)
-{
-       u32 mask = flag << (hfi ? CR_DYN_SHIFT : 0);
-       const char *owner = this_hfi == hfi ? "this" : "other";
-
-       if (!(scratch0 & mask))
-               return;
-
-       *used += scnprintf(p + *used, size - *used,
-                          "  0x%08x - HFI%d %s in use, %s device\n",
-                          mask, hfi, what, owner);
-}
-
-/*
- * debugfs read handler for "asic_flags": render ASIC_CFG_SCRATCH as text.
- *
- * Prints the raw 64-bit scratch value, the permanent CR_THERM_INIT flag if
- * it is set, and each dynamic resource flag (SBus, EPROM, both i2c chains)
- * for HFI 0 and HFI 1, marked as held by "this" or the "other" device.
- *
- * The page-sized text buffer is allocated before, and copied to user space
- * after, the RCU read-side section: a GFP_KERNEL allocation and a user copy
- * may both sleep, which is not permitted under rcu_read_lock().
- *
- * Return: number of bytes copied to @buf, -ENOMEM when the text buffer
- * cannot be allocated, or the error from simple_read_from_buffer().
- */
-static ssize_t asic_flags_read(struct file *file, char __user *buf,
-                              size_t count, loff_t *ppos)
-{
-       struct hfi1_pportdata *ppd;
-       struct hfi1_devdata *dd;
-       u64 scratch0;
-       char *tmp;
-       ssize_t ret;
-       int size = PAGE_SIZE;
-       int used = 0;
-       int this_hfi;
-       int i;
-
-       tmp = kmalloc(size, GFP_KERNEL);
-       if (!tmp)
-               return -ENOMEM;
-
-       /* only the device lookup and the CSR read need the RCU section */
-       rcu_read_lock();
-       ppd = private2ppd(file);
-       dd = ppd->dd;
-       scratch0 = read_csr(dd, ASIC_CFG_SCRATCH);
-       this_hfi = dd->hfi1_id;
-       rcu_read_unlock();
-
-       used += scnprintf(tmp + used, size - used,
-                         "Resource flags: 0x%016llx\n", scratch0);
-
-       /* check permanent flag */
-       if (scratch0 & CR_THERM_INIT) {
-               used += scnprintf(tmp + used, size - used,
-                                 "  0x%08x - thermal monitoring initialized\n",
-                                 (u32)CR_THERM_INIT);
-       }
-
-       /* check each dynamic flag on each HFI */
-       for (i = 0; i < 2; i++) {
-               check_dyn_flag(scratch0, tmp, size, &used, this_hfi, i,
-                              CR_SBUS, "SBus");
-               check_dyn_flag(scratch0, tmp, size, &used, this_hfi, i,
-                              CR_EPROM, "EPROM");
-               check_dyn_flag(scratch0, tmp, size, &used, this_hfi, i,
-                              CR_I2C1, "i2c chain 1");
-               check_dyn_flag(scratch0, tmp, size, &used, this_hfi, i,
-                              CR_I2C2, "i2c chain 2");
-       }
-       used += scnprintf(tmp + used, size - used, "Write bits to clear\n");
-
-       ret = simple_read_from_buffer(buf, count, ppos, tmp, used);
-       kfree(tmp);
-       return ret;
-}
-
-/*
- * debugfs write handler for "asic_flags": clear bits in ASIC_CFG_SCRATCH.
- *
- * The user buffer must hold one integer (any base accepted by kstrtoull with
- * base 0); every bit set in it is cleared in the scratch register.
- *
- * The value is parsed with kstrtoull_from_user() before the RCU read-side
- * section is entered, so no sleeping allocation or user copy happens under
- * rcu_read_lock() and no heap buffer sized by @count is needed.
- *
- * NOTE(review): mutex_lock() below can still block while rcu_read_lock() is
- * held; the locking scheme that protects @dd against debugfs removal should
- * be revisited.
- *
- * Return: @count on success, -EFAULT if @buf cannot be read, or the
- * kstrtoull error for malformed input.
- */
-static ssize_t asic_flags_write(struct file *file, const char __user *buf,
-                               size_t count, loff_t *ppos)
-{
-       struct hfi1_pportdata *ppd;
-       struct hfi1_devdata *dd;
-       unsigned long long value;
-       u64 scratch0;
-       u64 clear;
-       int ret;
-
-       /* read the expected integer straight from user space */
-       ret = kstrtoull_from_user(buf, count, 0, &value);
-       if (ret)
-               return ret;
-       clear = value;
-
-       rcu_read_lock();
-       ppd = private2ppd(file);
-       dd = ppd->dd;
-
-       /* obtain exclusive access */
-       mutex_lock(&dd->asic_data->asic_resource_mutex);
-       acquire_hw_mutex(dd);
-
-       scratch0 = read_csr(dd, ASIC_CFG_SCRATCH);
-       scratch0 &= ~clear;
-       write_csr(dd, ASIC_CFG_SCRATCH, scratch0);
-       /* force write to be visible to other HFI on another OS */
-       (void)read_csr(dd, ASIC_CFG_SCRATCH);
-
-       release_hw_mutex(dd);
-       mutex_unlock(&dd->asic_data->asic_resource_mutex);
-       rcu_read_unlock();
-
-       /* return the number of bytes written */
-       return count;
-}
-
-/*
- * debugfs read handler for "qsfp_dump<N>": read the per-port QSFP data for
- * ppd.
- *
- * qsfp_dump() fills a page-sized buffer; a positive return is taken as the
- * number of valid bytes and copied to user space, anything else is returned
- * unchanged.  The buffer is allocated before, and the user copy performed
- * after, the RCU read-side section because both operations may sleep.
- *
- * Return: bytes copied, the non-positive result of qsfp_dump(), or -ENOMEM.
- */
-static ssize_t qsfp_debugfs_dump(struct file *file, char __user *buf,
-                                size_t count, loff_t *ppos)
-{
-       struct hfi1_pportdata *ppd;
-       char *tmp;
-       int ret;
-
-       tmp = kmalloc(PAGE_SIZE, GFP_KERNEL);
-       if (!tmp)
-               return -ENOMEM;
-
-       rcu_read_lock();
-       ppd = private2ppd(file);
-       ret = qsfp_dump(ppd, tmp, PAGE_SIZE);
-       rcu_read_unlock();
-
-       if (ret > 0)
-               ret = simple_read_from_buffer(buf, count, ppos, tmp, ret);
-       kfree(tmp);
-       return ret;
-}
-
-/*
- * Do an i2c write operation on the chain for the given HFI.
- *
- * The file position encodes the i2c device address in bits 31:16 and the
- * byte offset in bits 15:0.  Address 0 is rejected so that tools which
- * simply stream the file (cp, cat) fail instead of touching the bus.
- *
- * The staging buffer is allocated and filled from user space before the RCU
- * read-side section is entered, because a GFP_KERNEL allocation and
- * copy_from_user() may sleep.
- *
- * Return: bytes written (the file position is advanced by that amount),
- * -EINVAL for address 0, -ENOMEM, -EFAULT, or the negative result of
- * i2c_write().
- */
-static ssize_t __i2c_debugfs_write(struct file *file, const char __user *buf,
-                                  size_t count, loff_t *ppos, u32 target)
-{
-       struct hfi1_pportdata *ppd;
-       char *buff;
-       int ret;
-       int i2c_addr;
-       int offset;
-
-       /* byte offset format: [offsetSize][i2cAddr][offsetHigh][offsetLow] */
-       i2c_addr = (*ppos >> 16) & 0xffff;
-       offset = *ppos & 0xffff;
-
-       /* explicitly reject invalid address 0 to catch cp and cat */
-       if (i2c_addr == 0)
-               return -EINVAL;
-
-       buff = kmalloc(count, GFP_KERNEL);
-       if (!buff)
-               return -ENOMEM;
-
-       if (copy_from_user(buff, buf, count)) {
-               ret = -EFAULT;
-               goto _free;
-       }
-
-       rcu_read_lock();
-       ppd = private2ppd(file);
-       ret = i2c_write(ppd, target, i2c_addr, offset, buff, count);
-       rcu_read_unlock();
-
-       /* on success ret is the byte count; errors leave *ppos untouched */
-       if (ret >= 0)
-               *ppos += ret;
-
- _free:
-       kfree(buff);
-       return ret;
-}
-
-/*
- * Do an i2c write operation on chain for HFI 0.
- * Write handler of the "i2c1" file; passes target 0 to the shared helper.
- */
-static ssize_t i2c1_debugfs_write(struct file *file, const char __user *buf,
-                                 size_t count, loff_t *ppos)
-{
-       return __i2c_debugfs_write(file, buf, count, ppos, 0);
-}
-
-/*
- * Do an i2c write operation on chain for HFI 1.
- * Write handler of the "i2c2" file; passes target 1 to the shared helper.
- */
-static ssize_t i2c2_debugfs_write(struct file *file, const char __user *buf,
-                                 size_t count, loff_t *ppos)
-{
-       return __i2c_debugfs_write(file, buf, count, ppos, 1);
-}
-
-/*
- * Do an i2c read operation on the chain for the given HFI.
- *
- * The file position encodes the i2c device address in bits 31:16 and the
- * byte offset in bits 15:0.  Address 0 is rejected so that tools which
- * simply stream the file (cp, cat) fail instead of touching the bus.
- *
- * The bounce buffer is allocated before, and copied to user space after,
- * the RCU read-side section because a GFP_KERNEL allocation and
- * copy_to_user() may sleep.
- *
- * Return: bytes read, -EINVAL for address 0, -ENOMEM, -EFAULT, or the
- * negative result of i2c_read().  As before, the file position is advanced
- * by the bytes read even when the copy to user space then fails.
- */
-static ssize_t __i2c_debugfs_read(struct file *file, char __user *buf,
-                                 size_t count, loff_t *ppos, u32 target)
-{
-       struct hfi1_pportdata *ppd;
-       char *buff;
-       int ret;
-       int i2c_addr;
-       int offset;
-       int total_read;
-
-       /* byte offset format: [offsetSize][i2cAddr][offsetHigh][offsetLow] */
-       i2c_addr = (*ppos >> 16) & 0xffff;
-       offset = *ppos & 0xffff;
-
-       /* explicitly reject invalid address 0 to catch cp and cat */
-       if (i2c_addr == 0)
-               return -EINVAL;
-
-       buff = kmalloc(count, GFP_KERNEL);
-       if (!buff)
-               return -ENOMEM;
-
-       rcu_read_lock();
-       ppd = private2ppd(file);
-       total_read = i2c_read(ppd, target, i2c_addr, offset, buff, count);
-       rcu_read_unlock();
-
-       if (total_read < 0) {
-               ret = total_read;
-               goto _free;
-       }
-
-       *ppos += total_read;
-
-       if (copy_to_user(buf, buff, total_read)) {
-               ret = -EFAULT;
-               goto _free;
-       }
-
-       ret = total_read;
-
- _free:
-       kfree(buff);
-       return ret;
-}
-
-/*
- * Do an i2c read operation on chain for HFI 0.
- * Read handler of the "i2c1" file; passes target 0 to the shared helper.
- */
-static ssize_t i2c1_debugfs_read(struct file *file, char __user *buf,
-                                size_t count, loff_t *ppos)
-{
-       return __i2c_debugfs_read(file, buf, count, ppos, 0);
-}
-
-/*
- * Do an i2c read operation on chain for HFI 1.
- * Read handler of the "i2c2" file; passes target 1 to the shared helper.
- */
-static ssize_t i2c2_debugfs_read(struct file *file, char __user *buf,
-                                size_t count, loff_t *ppos)
-{
-       return __i2c_debugfs_read(file, buf, count, ppos, 1);
-}
-
-/*
- * Do a QSFP write operation on the i2c chain for the given HFI.
- *
- * The write must end within QSFP_PAGESIZE * 4 bytes of the start of the
- * file.  The staging buffer is allocated and filled from user space before
- * the RCU read-side section is entered, because a GFP_KERNEL allocation and
- * copy_from_user() may sleep.
- *
- * Return: bytes written (the file position is advanced by that amount),
- * -EINVAL when the range is out of bounds, -ENOMEM, -EFAULT, or the
- * negative result of qsfp_write().
- */
-static ssize_t __qsfp_debugfs_write(struct file *file, const char __user *buf,
-                                   size_t count, loff_t *ppos, u32 target)
-{
-       struct hfi1_pportdata *ppd;
-       char *buff;
-       int ret;
-
-       /* base page + page00-page03 */
-       if (*ppos + count > QSFP_PAGESIZE * 4)
-               return -EINVAL;
-
-       buff = kmalloc(count, GFP_KERNEL);
-       if (!buff)
-               return -ENOMEM;
-
-       if (copy_from_user(buff, buf, count)) {
-               ret = -EFAULT;
-               goto _free;
-       }
-
-       rcu_read_lock();
-       ppd = private2ppd(file);
-       ret = qsfp_write(ppd, target, *ppos, buff, count);
-       rcu_read_unlock();
-
-       /* on success ret is the byte count; errors leave *ppos untouched */
-       if (ret >= 0)
-               *ppos += ret;
-
- _free:
-       kfree(buff);
-       return ret;
-}
-
-/*
- * Do a QSFP write operation on i2c chain for HFI 0.
- * Write handler of the "qsfp1" file; passes target 0 to the shared helper.
- */
-static ssize_t qsfp1_debugfs_write(struct file *file, const char __user *buf,
-                                  size_t count, loff_t *ppos)
-{
-       return __qsfp_debugfs_write(file, buf, count, ppos, 0);
-}
-
-/*
- * Do a QSFP write operation on i2c chain for HFI 1.
- * Write handler of the "qsfp2" file; passes target 1 to the shared helper.
- */
-static ssize_t qsfp2_debugfs_write(struct file *file, const char __user *buf,
-                                  size_t count, loff_t *ppos)
-{
-       return __qsfp_debugfs_write(file, buf, count, ppos, 1);
-}
-
-/*
- * Do a QSFP read operation on the i2c chain for the given HFI.
- *
- * The read must end within QSFP_PAGESIZE * 4 bytes of the start of the
- * file.  The bounce buffer is allocated before, and copied to user space
- * after, the RCU read-side section because a GFP_KERNEL allocation and
- * copy_to_user() may sleep.
- *
- * Return: bytes read, -EINVAL when the range is out of bounds, -ENOMEM,
- * -EFAULT, or the negative result of qsfp_read().  As before, the file
- * position is advanced by the bytes read even when the copy to user space
- * then fails.
- */
-static ssize_t __qsfp_debugfs_read(struct file *file, char __user *buf,
-                                  size_t count, loff_t *ppos, u32 target)
-{
-       struct hfi1_pportdata *ppd;
-       char *buff;
-       int ret;
-       int total_read;
-
-       /* base page + page00-page03 */
-       if (*ppos + count > QSFP_PAGESIZE * 4)
-               return -EINVAL;
-
-       buff = kmalloc(count, GFP_KERNEL);
-       if (!buff)
-               return -ENOMEM;
-
-       rcu_read_lock();
-       ppd = private2ppd(file);
-       total_read = qsfp_read(ppd, target, *ppos, buff, count);
-       rcu_read_unlock();
-
-       if (total_read < 0) {
-               ret = total_read;
-               goto _free;
-       }
-
-       *ppos += total_read;
-
-       if (copy_to_user(buf, buff, total_read)) {
-               ret = -EFAULT;
-               goto _free;
-       }
-
-       ret = total_read;
-
- _free:
-       kfree(buff);
-       return ret;
-}
-
-/*
- * Do a QSFP read operation on i2c chain for HFI 0.
- * Read handler of the "qsfp1" file; passes target 0 to the shared helper.
- */
-static ssize_t qsfp1_debugfs_read(struct file *file, char __user *buf,
-                                 size_t count, loff_t *ppos)
-{
-       return __qsfp_debugfs_read(file, buf, count, ppos, 0);
-}
-
-/*
- * Do a QSFP read operation on i2c chain for HFI 1.
- * Read handler of the "qsfp2" file; passes target 1 to the shared helper.
- */
-static ssize_t qsfp2_debugfs_read(struct file *file, char __user *buf,
-                                 size_t count, loff_t *ppos)
-{
-       return __qsfp_debugfs_read(file, buf, count, ppos, 1);
-}
-
-/*
- * Common open routine for the raw i2c debugfs files.
- *
- * Takes a reference on this module and then acquires the chip resource
- * i2c_target(@target) for the device behind @fp.  If the resource cannot be
- * acquired the module reference is dropped again.
- *
- * Return: 0 on success, -ENODEV if the module reference cannot be taken, or
- * the error from acquire_chip_resource().
- */
-static int __i2c_debugfs_open(struct inode *in, struct file *fp, u32 target)
-{
-       struct hfi1_pportdata *ppd;
-       int rc;
-
-       if (!try_module_get(THIS_MODULE))
-               return -ENODEV;
-
-       ppd = private2ppd(fp);
-       rc = acquire_chip_resource(ppd->dd, i2c_target(target), 0);
-       if (!rc)
-               return 0;
-
-       /* failed - release the module */
-       module_put(THIS_MODULE);
-       return rc;
-}
-
-/* Open handler of the "i2c1" file: shared open routine with target 0. */
-static int i2c1_debugfs_open(struct inode *in, struct file *fp)
-{
-       return __i2c_debugfs_open(in, fp, 0);
-}
-
-/* Open handler of the "i2c2" file: shared open routine with target 1. */
-static int i2c2_debugfs_open(struct inode *in, struct file *fp)
-{
-       return __i2c_debugfs_open(in, fp, 1);
-}
-
-/*
- * Common release routine for the raw i2c debugfs files: give back the chip
- * resource i2c_target(@target) and the module reference taken at open time.
- * Always returns 0.
- */
-static int __i2c_debugfs_release(struct inode *in, struct file *fp, u32 target)
-{
-       struct hfi1_pportdata *ppd = private2ppd(fp);
-
-       release_chip_resource(ppd->dd, i2c_target(target));
-       module_put(THIS_MODULE);
-
-       return 0;
-}
-
-/* Release handler of the "i2c1" file: shared release routine, target 0. */
-static int i2c1_debugfs_release(struct inode *in, struct file *fp)
-{
-       return __i2c_debugfs_release(in, fp, 0);
-}
-
-/* Release handler of the "i2c2" file: shared release routine, target 1. */
-static int i2c2_debugfs_release(struct inode *in, struct file *fp)
-{
-       return __i2c_debugfs_release(in, fp, 1);
-}
-
-/*
- * Common open routine for the QSFP debugfs files.
- *
- * Takes a reference on this module and then acquires the chip resource
- * i2c_target(@target) for the device behind @fp.  If the resource cannot be
- * acquired the module reference is dropped again.
- *
- * Return: 0 on success, -ENODEV if the module reference cannot be taken, or
- * the error from acquire_chip_resource().
- */
-static int __qsfp_debugfs_open(struct inode *in, struct file *fp, u32 target)
-{
-       struct hfi1_pportdata *ppd;
-       int rc;
-
-       if (!try_module_get(THIS_MODULE))
-               return -ENODEV;
-
-       ppd = private2ppd(fp);
-       rc = acquire_chip_resource(ppd->dd, i2c_target(target), 0);
-       if (!rc)
-               return 0;
-
-       /* failed - release the module */
-       module_put(THIS_MODULE);
-       return rc;
-}
-
-/* Open handler of the "qsfp1" file: shared open routine with target 0. */
-static int qsfp1_debugfs_open(struct inode *in, struct file *fp)
-{
-       return __qsfp_debugfs_open(in, fp, 0);
-}
-
-/* Open handler of the "qsfp2" file: shared open routine with target 1. */
-static int qsfp2_debugfs_open(struct inode *in, struct file *fp)
-{
-       return __qsfp_debugfs_open(in, fp, 1);
-}
-
-/*
- * Common release routine for the QSFP debugfs files: give back the chip
- * resource i2c_target(@target) and the module reference taken at open time.
- * Always returns 0.
- */
-static int __qsfp_debugfs_release(struct inode *in, struct file *fp, u32 target)
-{
-       struct hfi1_pportdata *ppd = private2ppd(fp);
-
-       release_chip_resource(ppd->dd, i2c_target(target));
-       module_put(THIS_MODULE);
-
-       return 0;
-}
-
-/* Release handler of the "qsfp1" file: shared release routine, target 0. */
-static int qsfp1_debugfs_release(struct inode *in, struct file *fp)
-{
-       return __qsfp_debugfs_release(in, fp, 0);
-}
-
-/* Release handler of the "qsfp2" file: shared release routine, target 1. */
-static int qsfp2_debugfs_release(struct inode *in, struct file *fp)
-{
-       return __qsfp_debugfs_release(in, fp, 1);
-}
-
-/*
- * Initializer for a struct counter_info entry whose file only needs read
- * and/or write handlers; llseek is always generic_file_llseek.  Pass NULL
- * for a handler the file does not support.
- */
-#define DEBUGFS_OPS(nm, readroutine, writeroutine)     \
-{ \
-       .name = nm, \
-       .ops = { \
-               .read = readroutine, \
-               .write = writeroutine, \
-               .llseek = generic_file_llseek, \
-       }, \
-}
-
-/*
- * Like DEBUGFS_OPS, but for files that also need their own open and
- * release handlers (used below for the i2c and qsfp files, which hold a
- * chip resource while open).
- */
-#define DEBUGFS_XOPS(nm, readf, writef, openf, releasef) \
-{ \
-       .name = nm, \
-       .ops = { \
-               .read = readf, \
-               .write = writef, \
-               .llseek = generic_file_llseek, \
-               .open = openf, \
-               .release = releasef \
-       }, \
-}
-
-/*
- * Per-device, read-only debugfs files.  hfi1_dbg_ibdev_init() creates one
- * file per entry with the devdata pointer as private data.
- */
-static const struct counter_info cntr_ops[] = {
-       DEBUGFS_OPS("counter_names", dev_names_read, NULL),
-       DEBUGFS_OPS("counters", dev_counters_read, NULL),
-       DEBUGFS_OPS("portcounter_names", portnames_read, NULL),
-};
-
-/*
- * Per-port debugfs files.  Each name is used as an snprintf() format by
- * hfi1_dbg_ibdev_init() with the 1-based port number as its argument, so
- * only the names containing %d end up port-specific.  Entries with a write
- * handler are created owner-writable.
- */
-static const struct counter_info port_cntr_ops[] = {
-       DEBUGFS_OPS("port%dcounters", portcntrs_debugfs_read, NULL),
-       DEBUGFS_XOPS("i2c1", i2c1_debugfs_read, i2c1_debugfs_write,
-                    i2c1_debugfs_open, i2c1_debugfs_release),
-       DEBUGFS_XOPS("i2c2", i2c2_debugfs_read, i2c2_debugfs_write,
-                    i2c2_debugfs_open, i2c2_debugfs_release),
-       DEBUGFS_OPS("qsfp_dump%d", qsfp_debugfs_dump, NULL),
-       DEBUGFS_XOPS("qsfp1", qsfp1_debugfs_read, qsfp1_debugfs_write,
-                    qsfp1_debugfs_open, qsfp1_debugfs_release),
-       DEBUGFS_XOPS("qsfp2", qsfp2_debugfs_read, qsfp2_debugfs_write,
-                    qsfp2_debugfs_open, qsfp2_debugfs_release),
-       DEBUGFS_OPS("asic_flags", asic_flags_read, asic_flags_write),
-};
-
-/*
- * Create the debugfs directory and files for one device.
- *
- * Under hfi1_dbg_root this creates a directory "<class_name>_<unit>" and a
- * symlink "<unit>" pointing at it, then populates the directory with the
- * seq_file based statistics, the per-device counter files from cntr_ops[]
- * and, for every port, the files from port_cntr_ops[].
- *
- * Does nothing when the debugfs root was never created.  A failure to
- * create the directory or the symlink is logged and the function returns
- * without creating any files.
- */
-void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd)
-{
-       /* sized for the longest generated name; longer ones are truncated */
-       char name[sizeof("port0counters") + 1];
-       char link[10];
-       struct hfi1_devdata *dd = dd_from_dev(ibd);
-       struct hfi1_pportdata *ppd;
-       int unit = dd->unit;
-       int i, j;
-
-       if (!hfi1_dbg_root)
-               return;
-       snprintf(name, sizeof(name), "%s_%d", class_name(), unit);
-       snprintf(link, sizeof(link), "%d", unit);
-       ibd->hfi1_ibdev_dbg = debugfs_create_dir(name, hfi1_dbg_root);
-       if (!ibd->hfi1_ibdev_dbg) {
-               pr_warn("create of %s failed\n", name);
-               return;
-       }
-       ibd->hfi1_ibdev_link =
-               debugfs_create_symlink(link, hfi1_dbg_root, name);
-       if (!ibd->hfi1_ibdev_link) {
-               pr_warn("create of %s symlink failed\n", name);
-               return;
-       }
-       /* NOTE(review): the *_CREATE macros are defined outside this excerpt */
-       DEBUGFS_SEQ_FILE_CREATE(opcode_stats, ibd->hfi1_ibdev_dbg, ibd);
-       DEBUGFS_SEQ_FILE_CREATE(ctx_stats, ibd->hfi1_ibdev_dbg, ibd);
-       DEBUGFS_SEQ_FILE_CREATE(qp_stats, ibd->hfi1_ibdev_dbg, ibd);
-       DEBUGFS_SEQ_FILE_CREATE(sdes, ibd->hfi1_ibdev_dbg, ibd);
-       /* dev counter files */
-       for (i = 0; i < ARRAY_SIZE(cntr_ops); i++)
-               DEBUGFS_FILE_CREATE(cntr_ops[i].name,
-                                   ibd->hfi1_ibdev_dbg,
-                                   dd,
-                                   &cntr_ops[i].ops, S_IRUGO);
-       /* per port files */
-       for (ppd = dd->pport, j = 0; j < dd->num_pports; j++, ppd++)
-               for (i = 0; i < ARRAY_SIZE(port_cntr_ops); i++) {
-                       /* the table name is a format; ports count from 1 */
-                       snprintf(name,
-                                sizeof(name),
-                                port_cntr_ops[i].name,
-                                j + 1);
-                       /* files with a write handler are owner-writable */
-                       DEBUGFS_FILE_CREATE(name,
-                                           ibd->hfi1_ibdev_dbg,
-                                           ppd,
-                                           &port_cntr_ops[i].ops,
-                                           !port_cntr_ops[i].ops.write ?
-                                           S_IRUGO : S_IRUGO | S_IWUSR);
-               }
-}
-
-/*
- * Tear down the debugfs entries of one device.
- *
- * When the debugfs root exists, the symlink and the whole device directory
- * are removed.  In every case the directory pointer is cleared and
- * synchronize_rcu() is called before returning, so handlers that entered
- * their rcu_read_lock() section earlier have left it.
- */
-void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd)
-{
-       if (hfi1_dbg_root) {
-               debugfs_remove(ibd->hfi1_ibdev_link);
-               debugfs_remove_recursive(ibd->hfi1_ibdev_dbg);
-       }
-
-       ibd->hfi1_ibdev_dbg = NULL;
-       synchronize_rcu();
-}
-
-/*
- * Driver stats field names, one line per stat, single string.  Used by
- * programs like hfistats to print the stats in a way which works for
- * different versions of drivers, without changing program source.
- * If hfi1_ib_stats changes, this needs to change.  Names need to be
- * 12 chars or less (w/o newline), for proper display by hfistats utility.
- *
- * The "driver_stats" file emits one u64 per entry of this table, in the
- * same order.
- */
-static const char * const hfi1_statnames[] = {
-       /* must be element 0: index 0 is special-cased as the interrupt total */
-       "KernIntr",
-       "ErrorIntr",
-       "Tx_Errs",
-       "Rcv_Errs",
-       "H/W_Errs",
-       "NoPIOBufs",
-       "CtxtsOpen",
-       "RcvLen_Errs",
-       "EgrBufFull",
-       "EgrHdrFull"
-};
-
-/*
- * seq_file start callback for "driver_stats_names".  Enters an RCU
- * read-side section (left in the stop callback) and returns @pos as the
- * iteration token, or NULL once *pos is past the end of hfi1_statnames[].
- */
-static void *_driver_stats_names_seq_start(struct seq_file *s, loff_t *pos)
-__acquires(RCU)
-{
-       rcu_read_lock();
-       return *pos < ARRAY_SIZE(hfi1_statnames) ? pos : NULL;
-}
-
-/*
- * seq_file next callback for "driver_stats_names": step to the following
- * name.  Returns @pos, or NULL after the last entry of hfi1_statnames[].
- */
-static void *_driver_stats_names_seq_next(
-       struct seq_file *s,
-       void *v,
-       loff_t *pos)
-{
-       *pos += 1;
-       return *pos < ARRAY_SIZE(hfi1_statnames) ? pos : NULL;
-}
-
-/*
- * seq_file stop callback for "driver_stats_names": leave the RCU read-side
- * section entered by the start callback.
- */
-static void _driver_stats_names_seq_stop(struct seq_file *s, void *v)
-__releases(RCU)
-{
-       rcu_read_unlock();
-}
-
-/*
- * seq_file show callback for "driver_stats_names": print the name selected
- * by the index in @v, followed by a newline.  Always returns 0.
- */
-static int _driver_stats_names_seq_show(struct seq_file *s, void *v)
-{
-       loff_t idx = *(loff_t *)v;
-
-       seq_printf(s, "%s\n", hfi1_statnames[idx]);
-       return 0;
-}
-
-/*
- * Instantiate the debugfs plumbing for the "driver_stats_names" file from
- * the callbacks above (macros defined outside this excerpt).
- */
-DEBUGFS_SEQ_FILE_OPS(driver_stats_names);
-DEBUGFS_SEQ_FILE_OPEN(driver_stats_names)
-DEBUGFS_FILE_OPS(driver_stats_names);
-
-/*
- * seq_file start callback for "driver_stats".  Enters an RCU read-side
- * section (left in the stop callback) and returns @pos as the iteration
- * token, or NULL once *pos is past the end of hfi1_statnames[].
- */
-static void *_driver_stats_seq_start(struct seq_file *s, loff_t *pos)
-__acquires(RCU)
-{
-       rcu_read_lock();
-       return *pos < ARRAY_SIZE(hfi1_statnames) ? pos : NULL;
-}
-
-/*
- * seq_file next callback for "driver_stats": step to the following stat.
- * Returns @pos, or NULL after the last entry of hfi1_statnames[].
- */
-static void *_driver_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
-{
-       *pos += 1;
-       return *pos < ARRAY_SIZE(hfi1_statnames) ? pos : NULL;
-}
-
-/*
- * seq_file stop callback for "driver_stats": leave the RCU read-side
- * section entered by the start callback.
- */
-static void _driver_stats_seq_stop(struct seq_file *s, void *v)
-__releases(RCU)
-{
-       rcu_read_unlock();
-}
-
-/*
- * Sum the interrupt counters of every device on hfi1_dev_list.  The list is
- * walked with hfi1_devs_lock held and interrupts disabled.
- */
-static u64 hfi1_sps_ints(void)
-{
-       struct hfi1_devdata *dd;
-       unsigned long irq_flags;
-       u64 total = 0;
-
-       spin_lock_irqsave(&hfi1_devs_lock, irq_flags);
-       list_for_each_entry(dd, &hfi1_dev_list, list)
-               total += get_all_cpu_total(dd->int_counter);
-       spin_unlock_irqrestore(&hfi1_devs_lock, irq_flags);
-
-       return total;
-}
-
-/*
- * seq_file show callback for "driver_stats": emit one stat as a raw u64
- * (binary, not text) directly into the seq_file buffer.
- *
- * Index 0 is the interrupt total computed by hfi1_sps_ints(); every other
- * index is read from hfi1_stats viewed as an array of u64.
- *
- * Return: 0 after committing sizeof(u64) bytes, or SEQ_SKIP when the
- * buffer has less than sizeof(u64) bytes of room.
- */
-static int _driver_stats_seq_show(struct seq_file *s, void *v)
-{
-       u64 *raw_stats = (u64 *)&hfi1_stats;
-       loff_t *idx = v;
-       char *out;
-       u64 value;
-
-       if (seq_get_buf(s, &out) < sizeof(u64))
-               return SEQ_SKIP;
-
-       /* special case for interrupts */
-       value = *idx == 0 ? hfi1_sps_ints() : raw_stats[*idx];
-       *(u64 *)out = value;
-       seq_commit(s, sizeof(u64));
-
-       return 0;
-}
-
-/*
- * Instantiate the debugfs plumbing for the "driver_stats" file from the
- * callbacks above (macros defined outside this excerpt).
- */
-DEBUGFS_SEQ_FILE_OPS(driver_stats);
-DEBUGFS_SEQ_FILE_OPEN(driver_stats)
-DEBUGFS_FILE_OPS(driver_stats);
-
-/*
- * Create the driver-wide debugfs root directory (named DRIVER_NAME) and the
- * two driver-level statistics files in it.
- *
- * If the directory cannot be created a warning is logged and no files are
- * created: with a NULL parent they would otherwise be placed at the top of
- * debugfs instead of under the driver's directory.  hfi1_dbg_root stays
- * NULL in that case, which the per-device init/exit routines check for.
- */
-void hfi1_dbg_init(void)
-{
-       hfi1_dbg_root = debugfs_create_dir(DRIVER_NAME, NULL);
-       if (!hfi1_dbg_root) {
-               pr_warn("init of debugfs failed\n");
-               return;
-       }
-       DEBUGFS_SEQ_FILE_CREATE(driver_stats_names, hfi1_dbg_root, NULL);
-       DEBUGFS_SEQ_FILE_CREATE(driver_stats, hfi1_dbg_root, NULL);
-}
-
-/*
- * Remove the driver-wide debugfs root with everything below it and forget
- * the pointer so later init/exit calls see that no root exists.
- */
-void hfi1_dbg_exit(void)
-{
-       debugfs_remove_recursive(hfi1_dbg_root);
-       hfi1_dbg_root = NULL;
-}
-
-#endif
diff --git a/drivers/staging/rdma/hfi1/debugfs.h b/drivers/staging/rdma/hfi1/debugfs.h
deleted file mode 100644 (file)
index b6fb681..0000000
+++ /dev/null
@@ -1,75 +0,0 @@
-#ifndef _HFI1_DEBUGFS_H
-#define _HFI1_DEBUGFS_H
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-struct hfi1_ibdev;
-#ifdef CONFIG_DEBUG_FS
-void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd);
-void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd);
-void hfi1_dbg_init(void);
-void hfi1_dbg_exit(void);
-#else
-/* no-op stubs; static inline so each includer gets its own private copy */
-static inline void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd)
-{
-}
-
-static inline void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd)
-{
-}
-
-static inline void hfi1_dbg_init(void)
-{
-}
-
-static inline void hfi1_dbg_exit(void)
-{
-}
-
-#endif
-
-#endif                          /* _HFI1_DEBUGFS_H */
diff --git a/drivers/staging/rdma/hfi1/device.c b/drivers/staging/rdma/hfi1/device.c
deleted file mode 100644 (file)
index c05c39d..0000000
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/cdev.h>
-#include <linux/module.h>
-#include <linux/device.h>
-#include <linux/fs.h>
-
-#include "hfi.h"
-#include "device.h"
-
-/*
- * File-scope state set up by dev_init() and torn down by dev_cleanup():
- * the device class for root-only nodes, the class for user-accessible
- * nodes, and the base of the allocated character device number range.
- */
-static struct class *class;
-static struct class *user_class;
-static dev_t hfi1_dev;
-
-/*
- * Register a character device and create its device node.
- *
- * @minor:           minor number within the driver's chrdev region
- * @name:            device name; used verbatim, never as a format string
- * @fops:            file operations for the device
- * @cdev:            caller-provided cdev to initialise and add
- * @devp:            receives the created struct device, or NULL on failure
- * @user_accessible: create the node in the user-accessible class instead of
- *                   the root-only class
- *
- * The name is handed to kobject_set_name() through an explicit "%s" format
- * so that a '%' in @name cannot be interpreted as a conversion, and a
- * failure to set the name is reported instead of being ignored.
- *
- * Return: 0 on success or a negative errno; on failure the cdev is not left
- * registered.
- */
-int hfi1_cdev_init(int minor, const char *name,
-                  const struct file_operations *fops,
-                  struct cdev *cdev, struct device **devp,
-                  bool user_accessible)
-{
-       const dev_t dev = MKDEV(MAJOR(hfi1_dev), minor);
-       struct device *device = NULL;
-       int ret;
-
-       cdev_init(cdev, fops);
-       cdev->owner = THIS_MODULE;
-       ret = kobject_set_name(&cdev->kobj, "%s", name);
-       if (ret < 0) {
-               pr_err("Could not add cdev for minor %d, %s (err %d)\n",
-                      minor, name, -ret);
-               goto done;
-       }
-
-       ret = cdev_add(cdev, dev, 1);
-       if (ret < 0) {
-               pr_err("Could not add cdev for minor %d, %s (err %d)\n",
-                      minor, name, -ret);
-               goto done;
-       }
-
-       if (user_accessible)
-               device = device_create(user_class, NULL, dev, NULL, "%s", name);
-       else
-               device = device_create(class, NULL, dev, NULL, "%s", name);
-
-       if (!IS_ERR(device))
-               goto done;
-       ret = PTR_ERR(device);
-       device = NULL;
-       pr_err("Could not create device for minor %d, %s (err %d)\n",
-              minor, name, -ret);
-       cdev_del(cdev);
-done:
-       *devp = device;
-       return ret;
-}
-
-/*
- * Undo hfi1_cdev_init(): unregister the device, clear *@devp and delete the
- * cdev.  Does nothing when *@devp is already NULL, so it is safe to call
- * for a device that was never created.
- */
-void hfi1_cdev_cleanup(struct cdev *cdev, struct device **devp)
-{
-       struct device *device = *devp;
-
-       if (!device)
-               return;
-
-       device_unregister(device);
-       *devp = NULL;
-
-       cdev_del(cdev);
-}
-
-/* Name of the device class used for the root-only device nodes. */
-static const char *hfi1_class_name = "hfi1";
-
-/* Accessor for the class name string. */
-const char *class_name(void)
-{
-       return hfi1_class_name;
-}
-
-/*
- * devnode callback for the root-only class: request mode 0600 when the
- * caller asks for a mode and name the node after the device.
- *
- * Return: a kasprintf()-allocated copy of the device name (NULL if the
- * allocation fails).
- */
-static char *hfi1_devnode(struct device *dev, umode_t *mode)
-{
-       if (mode != NULL)
-               *mode = 0600;
-
-       return kasprintf(GFP_KERNEL, "%s", dev_name(dev));
-}
-
-/* Name of the device class used for the user-accessible device nodes. */
-static const char *hfi1_class_name_user = "hfi1_user";
-/* Accessor for the user-accessible class name string. */
-static const char *class_name_user(void)
-{
-       return hfi1_class_name_user;
-}
-
-/*
- * devnode callback for the user-accessible class: request mode 0666 when
- * the caller asks for a mode and name the node after the device.
- *
- * Return: a kasprintf()-allocated copy of the device name (NULL if the
- * allocation fails).
- */
-static char *hfi1_user_devnode(struct device *dev, umode_t *mode)
-{
-       if (mode != NULL)
-               *mode = 0666;
-
-       return kasprintf(GFP_KERNEL, "%s", dev_name(dev));
-}
-
-/*
- * Module-init time setup of the character device infrastructure: allocate
- * HFI1_NMINORS device numbers and create the root-only and user-accessible
- * device classes with their devnode callbacks.
- *
- * Return: the result of alloc_chrdev_region() on success, or a negative
- * errno.  On failure the device number region is released again; when the
- * second class cannot be created the first one is destroyed and both class
- * pointers are reset to NULL.
- */
-int __init dev_init(void)
-{
-       int rc;
-
-       rc = alloc_chrdev_region(&hfi1_dev, 0, HFI1_NMINORS, DRIVER_NAME);
-       if (rc < 0) {
-               pr_err("Could not allocate chrdev region (err %d)\n", -rc);
-               return rc;
-       }
-
-       class = class_create(THIS_MODULE, class_name());
-       if (IS_ERR(class)) {
-               rc = PTR_ERR(class);
-               pr_err("Could not create device class (err %d)\n", -rc);
-               goto release_region;
-       }
-       class->devnode = hfi1_devnode;
-
-       user_class = class_create(THIS_MODULE, class_name_user());
-       if (IS_ERR(user_class)) {
-               rc = PTR_ERR(user_class);
-               pr_err("Could not create device class for user accessible files (err %d)\n",
-                      -rc);
-               class_destroy(class);
-               class = NULL;
-               user_class = NULL;
-               goto release_region;
-       }
-       user_class->devnode = hfi1_user_devnode;
-
-       return rc;
-
-release_region:
-       unregister_chrdev_region(hfi1_dev, HFI1_NMINORS);
-       return rc;
-}
-
-/*
- * Undo dev_init(): destroy both device classes, reset their pointers to
- * NULL and release the character device number region.
- */
-void dev_cleanup(void)
-{
-       class_destroy(class);
-       class = NULL;
-
-       class_destroy(user_class);
-       user_class = NULL;
-
-       unregister_chrdev_region(hfi1_dev, HFI1_NMINORS);
-}
diff --git a/drivers/staging/rdma/hfi1/device.h b/drivers/staging/rdma/hfi1/device.h
deleted file mode 100644 (file)
index 5bb3e83..0000000
+++ /dev/null
@@ -1,59 +0,0 @@
-#ifndef _HFI1_DEVICE_H
-#define _HFI1_DEVICE_H
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-int hfi1_cdev_init(int minor, const char *name,
-                  const struct file_operations *fops,
-                  struct cdev *cdev, struct device **devp,
-                  bool user_accessible);
-void hfi1_cdev_cleanup(struct cdev *cdev, struct device **devp);
-const char *class_name(void);
-int __init dev_init(void);
-void dev_cleanup(void);
-
-#endif                          /* _HFI1_DEVICE_H */
diff --git a/drivers/staging/rdma/hfi1/diag.c b/drivers/staging/rdma/hfi1/diag.c
deleted file mode 100644 (file)
index bb2409a..0000000
+++ /dev/null
@@ -1,1925 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-/*
- * This file contains support for diagnostic functions.  It is accessed by
- * opening the hfi1_diag device, normally minor number 129.  Diagnostic use
- * of the chip may render the chip or board unusable until the driver
- * is unloaded, or in some cases, until the system is rebooted.
- *
- * Accesses to the chip through this interface are not similar to going
- * through the /sys/bus/pci resource mmap interface.
- */
-
-#include <linux/io.h>
-#include <linux/pci.h>
-#include <linux/poll.h>
-#include <linux/vmalloc.h>
-#include <linux/export.h>
-#include <linux/fs.h>
-#include <linux/uaccess.h>
-#include <linux/module.h>
-#include <rdma/ib_smi.h>
-#include "hfi.h"
-#include "device.h"
-#include "common.h"
-#include "verbs_txreq.h"
-#include "trace.h"
-
-#undef pr_fmt
-#define pr_fmt(fmt) DRIVER_NAME ": " fmt
-#define snoop_dbg(fmt, ...) \
-       hfi1_cdbg(SNOOP, fmt, ##__VA_ARGS__)
-
-/* Snoop option mask */
-#define SNOOP_DROP_SEND                BIT(0)
-#define SNOOP_USE_METADATA     BIT(1)
-#define SNOOP_SET_VL0TOVL15     BIT(2)
-
-static u8 snoop_flags;
-
-/*
- * Extract packet length from LRH header.
- * This is in Dwords so multiply by 4 to get size in bytes
- */
-#define HFI1_GET_PKT_LEN(x)      (((be16_to_cpu((x)->lrh[2]) & 0xFFF)) << 2)
-
-enum hfi1_filter_status {
-       HFI1_FILTER_HIT,
-       HFI1_FILTER_ERR,
-       HFI1_FILTER_MISS
-};
-
-/* snoop processing functions */
-rhf_rcv_function_ptr snoop_rhf_rcv_functions[8] = {
-       [RHF_RCV_TYPE_EXPECTED] = snoop_recv_handler,
-       [RHF_RCV_TYPE_EAGER]    = snoop_recv_handler,
-       [RHF_RCV_TYPE_IB]       = snoop_recv_handler,
-       [RHF_RCV_TYPE_ERROR]    = snoop_recv_handler,
-       [RHF_RCV_TYPE_BYPASS]   = snoop_recv_handler,
-       [RHF_RCV_TYPE_INVALID5] = process_receive_invalid,
-       [RHF_RCV_TYPE_INVALID6] = process_receive_invalid,
-       [RHF_RCV_TYPE_INVALID7] = process_receive_invalid
-};
-
-/* Snoop packet structure */
-struct snoop_packet {
-       struct list_head list;
-       u32 total_len;
-       u8 data[];
-};
-
-/* Do not make these an enum or it will blow up the capture_md */
-#define PKT_DIR_EGRESS 0x0
-#define PKT_DIR_INGRESS 0x1
-
-/* Packet capture metadata returned to the user with the packet. */
-struct capture_md {
-       u8 port;
-       u8 dir;
-       u8 reserved[6];
-       union {
-               u64 pbc;
-               u64 rhf;
-       } u;
-};
-
-static atomic_t diagpkt_count = ATOMIC_INIT(0);
-static struct cdev diagpkt_cdev;
-static struct device *diagpkt_device;
-
-static ssize_t diagpkt_write(struct file *fp, const char __user *data,
-                            size_t count, loff_t *off);
-
-static const struct file_operations diagpkt_file_ops = {
-       .owner = THIS_MODULE,
-       .write = diagpkt_write,
-       .llseek = noop_llseek,
-};
-
-/*
- * This is used for communication with user space for snoop extended IOCTLs
- */
-struct hfi1_link_info {
-       __be64 node_guid;
-       u8 port_mode;
-       u8 port_state;
-       u16 link_speed_active;
-       u16 link_width_active;
-       u16 vl15_init;
-       u8 port_number;
-       /*
-        * Add padding to make this a full IB SMP payload. Note: changing the
-        * size of this structure will make the IOCTLs created with _IOWR
-        * change.
-        * Be sure to run tests on all IOCTLs when making changes to this
-        * structure.
-        */
-       u8 res[47];
-};
-
-/*
- * This starts our ioctl sequence numbers *way* off from the ones
- * defined in ib_core.
- */
-#define SNOOP_CAPTURE_VERSION 0x1
-
-#define IB_IOCTL_MAGIC          0x1b /* See Documentation/ioctl-number.txt */
-#define HFI1_SNOOP_IOC_MAGIC IB_IOCTL_MAGIC
-#define HFI1_SNOOP_IOC_BASE_SEQ 0x80
-
-#define HFI1_SNOOP_IOCGETLINKSTATE \
-       _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ)
-#define HFI1_SNOOP_IOCSETLINKSTATE \
-       _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 1)
-#define HFI1_SNOOP_IOCCLEARQUEUE \
-       _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 2)
-#define HFI1_SNOOP_IOCCLEARFILTER \
-       _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 3)
-#define HFI1_SNOOP_IOCSETFILTER \
-       _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 4)
-#define HFI1_SNOOP_IOCGETVERSION \
-       _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 5)
-#define HFI1_SNOOP_IOCSET_OPTS \
-       _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 6)
-
-/*
- * These offsets +6/+7 could change, but these are already known and used
- * IOCTL numbers so don't change them without a good reason.
- */
-#define HFI1_SNOOP_IOCGETLINKSTATE_EXTRA \
-       _IOWR(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 6, \
-               struct hfi1_link_info)
-#define HFI1_SNOOP_IOCSETLINKSTATE_EXTRA \
-       _IOWR(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 7, \
-               struct hfi1_link_info)
-
-static int hfi1_snoop_open(struct inode *in, struct file *fp);
-static ssize_t hfi1_snoop_read(struct file *fp, char __user *data,
-                              size_t pkt_len, loff_t *off);
-static ssize_t hfi1_snoop_write(struct file *fp, const char __user *data,
-                               size_t count, loff_t *off);
-static long hfi1_ioctl(struct file *fp, unsigned int cmd, unsigned long arg);
-static unsigned int hfi1_snoop_poll(struct file *fp,
-                                   struct poll_table_struct *wait);
-static int hfi1_snoop_release(struct inode *in, struct file *fp);
-
-struct hfi1_packet_filter_command {
-       int opcode;
-       int length;
-       void *value_ptr;
-};
-
-/* Can't re-use PKT_DIR_*GRESS here because 0 means no packets for this */
-#define HFI1_SNOOP_INGRESS 0x1
-#define HFI1_SNOOP_EGRESS  0x2
-
-enum hfi1_packet_filter_opcodes {
-       FILTER_BY_LID,
-       FILTER_BY_DLID,
-       FILTER_BY_MAD_MGMT_CLASS,
-       FILTER_BY_QP_NUMBER,
-       FILTER_BY_PKT_TYPE,
-       FILTER_BY_SERVICE_LEVEL,
-       FILTER_BY_PKEY,
-       FILTER_BY_DIRECTION,
-};
-
-static const struct file_operations snoop_file_ops = {
-       .owner = THIS_MODULE,
-       .open = hfi1_snoop_open,
-       .read = hfi1_snoop_read,
-       .unlocked_ioctl = hfi1_ioctl,
-       .poll = hfi1_snoop_poll,
-       .write = hfi1_snoop_write,
-       .release = hfi1_snoop_release
-};
-
-struct hfi1_filter_array {
-       int (*filter)(void *, void *, void *);
-};
-
-static int hfi1_filter_lid(void *ibhdr, void *packet_data, void *value);
-static int hfi1_filter_dlid(void *ibhdr, void *packet_data, void *value);
-static int hfi1_filter_mad_mgmt_class(void *ibhdr, void *packet_data,
-                                     void *value);
-static int hfi1_filter_qp_number(void *ibhdr, void *packet_data, void *value);
-static int hfi1_filter_ibpacket_type(void *ibhdr, void *packet_data,
-                                    void *value);
-static int hfi1_filter_ib_service_level(void *ibhdr, void *packet_data,
-                                       void *value);
-static int hfi1_filter_ib_pkey(void *ibhdr, void *packet_data, void *value);
-static int hfi1_filter_direction(void *ibhdr, void *packet_data, void *value);
-
-static const struct hfi1_filter_array hfi1_filters[] = {
-       { hfi1_filter_lid },
-       { hfi1_filter_dlid },
-       { hfi1_filter_mad_mgmt_class },
-       { hfi1_filter_qp_number },
-       { hfi1_filter_ibpacket_type },
-       { hfi1_filter_ib_service_level },
-       { hfi1_filter_ib_pkey },
-       { hfi1_filter_direction },
-};
-
-#define HFI1_MAX_FILTERS       ARRAY_SIZE(hfi1_filters)
-#define HFI1_DIAG_MINOR_BASE   129
-
-static int hfi1_snoop_add(struct hfi1_devdata *dd, const char *name);
-
-int hfi1_diag_add(struct hfi1_devdata *dd)
-{
-       char name[16];
-       int ret = 0;
-
-       snprintf(name, sizeof(name), "%s_diagpkt%d", class_name(),
-                dd->unit);
-       /*
-        * Do this for each device as opposed to the normal diagpkt
-        * interface which is one per host
-        */
-       ret = hfi1_snoop_add(dd, name);
-       if (ret)
-               dd_dev_err(dd, "Unable to init snoop/capture device");
-
-       snprintf(name, sizeof(name), "%s_diagpkt", class_name());
-       if (atomic_inc_return(&diagpkt_count) == 1) {
-               ret = hfi1_cdev_init(HFI1_DIAGPKT_MINOR, name,
-                                    &diagpkt_file_ops, &diagpkt_cdev,
-                                    &diagpkt_device, false);
-       }
-
-       return ret;
-}
-
-/* this must be called w/ dd->snoop_in_lock held */
-static void drain_snoop_list(struct list_head *queue)
-{
-       struct list_head *pos, *q;
-       struct snoop_packet *packet;
-
-       list_for_each_safe(pos, q, queue) {
-               packet = list_entry(pos, struct snoop_packet, list);
-               list_del(pos);
-               kfree(packet);
-       }
-}
-
-static void hfi1_snoop_remove(struct hfi1_devdata *dd)
-{
-       unsigned long flags = 0;
-
-       spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
-       drain_snoop_list(&dd->hfi1_snoop.queue);
-       hfi1_cdev_cleanup(&dd->hfi1_snoop.cdev, &dd->hfi1_snoop.class_dev);
-       spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-}
-
-void hfi1_diag_remove(struct hfi1_devdata *dd)
-{
-       hfi1_snoop_remove(dd);
-       if (atomic_dec_and_test(&diagpkt_count))
-               hfi1_cdev_cleanup(&diagpkt_cdev, &diagpkt_device);
-       hfi1_cdev_cleanup(&dd->diag_cdev, &dd->diag_device);
-}
-
-/*
- * Allocated structure shared between the credit return mechanism and
- * diagpkt_send().
- */
-struct diagpkt_wait {
-       struct completion credits_returned;
-       int code;
-       atomic_t count;
-};
-
-/*
- * When each side is finished with the structure, they call this.
- * The last user frees the structure.
- */
-static void put_diagpkt_wait(struct diagpkt_wait *wait)
-{
-       if (atomic_dec_and_test(&wait->count))
-               kfree(wait);
-}
-
-/*
- * Callback from the credit return code.  Set the complete, which
- * will let diapkt_send() continue.
- */
-static void diagpkt_complete(void *arg, int code)
-{
-       struct diagpkt_wait *wait = (struct diagpkt_wait *)arg;
-
-       wait->code = code;
-       complete(&wait->credits_returned);
-       put_diagpkt_wait(wait); /* finished with the structure */
-}
-
-/**
- * diagpkt_send - send a packet
- * @dp: diag packet descriptor
- */
-static ssize_t diagpkt_send(struct diag_pkt *dp)
-{
-       struct hfi1_devdata *dd;
-       struct send_context *sc;
-       struct pio_buf *pbuf;
-       u32 *tmpbuf = NULL;
-       ssize_t ret = 0;
-       u32 pkt_len, total_len;
-       pio_release_cb credit_cb = NULL;
-       void *credit_arg = NULL;
-       struct diagpkt_wait *wait = NULL;
-       int trycount = 0;
-
-       dd = hfi1_lookup(dp->unit);
-       if (!dd || !(dd->flags & HFI1_PRESENT) || !dd->kregbase) {
-               ret = -ENODEV;
-               goto bail;
-       }
-       if (!(dd->flags & HFI1_INITTED)) {
-               /* no hardware, freeze, etc. */
-               ret = -ENODEV;
-               goto bail;
-       }
-
-       if (dp->version != _DIAG_PKT_VERS) {
-               dd_dev_err(dd, "Invalid version %u for diagpkt_write\n",
-                          dp->version);
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       /* send count must be an exact number of dwords */
-       if (dp->len & 3) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       /* there is only port 1 */
-       if (dp->port != 1) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       /* need a valid context */
-       if (dp->sw_index >= dd->num_send_contexts) {
-               ret = -EINVAL;
-               goto bail;
-       }
-       /* can only use kernel contexts */
-       if (dd->send_contexts[dp->sw_index].type != SC_KERNEL &&
-           dd->send_contexts[dp->sw_index].type != SC_VL15) {
-               ret = -EINVAL;
-               goto bail;
-       }
-       /* must be allocated */
-       sc = dd->send_contexts[dp->sw_index].sc;
-       if (!sc) {
-               ret = -EINVAL;
-               goto bail;
-       }
-       /* must be enabled */
-       if (!(sc->flags & SCF_ENABLED)) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       /* allocate a buffer and copy the data in */
-       tmpbuf = vmalloc(dp->len);
-       if (!tmpbuf) {
-               ret = -ENOMEM;
-               goto bail;
-       }
-
-       if (copy_from_user(tmpbuf,
-                          (const void __user *)(unsigned long)dp->data,
-                          dp->len)) {
-               ret = -EFAULT;
-               goto bail;
-       }
-
-       /*
-        * pkt_len is how much data we have to write, includes header and data.
-        * total_len is length of the packet in Dwords plus the PBC should not
-        * include the CRC.
-        */
-       pkt_len = dp->len >> 2;
-       total_len = pkt_len + 2; /* PBC + packet */
-
-       /* if 0, fill in a default */
-       if (dp->pbc == 0) {
-               struct hfi1_pportdata *ppd = dd->pport;
-
-               hfi1_cdbg(PKT, "Generating PBC");
-               dp->pbc = create_pbc(ppd, 0, 0, 0, total_len);
-       } else {
-               hfi1_cdbg(PKT, "Using passed in PBC");
-       }
-
-       hfi1_cdbg(PKT, "Egress PBC content is 0x%llx", dp->pbc);
-
-       /*
-        * The caller wants to wait until the packet is sent and to
-        * check for errors.  The best we can do is wait until
-        * the buffer credits are returned and check if any packet
-        * error has occurred.  If there are any late errors, this
-        * could miss it.  If there are other senders who generate
-        * an error, this may find it.  However, in general, it
-        * should catch most.
-        */
-       if (dp->flags & F_DIAGPKT_WAIT) {
-               /* always force a credit return */
-               dp->pbc |= PBC_CREDIT_RETURN;
-               /* turn on credit return interrupts */
-               sc_add_credit_return_intr(sc);
-               wait = kmalloc(sizeof(*wait), GFP_KERNEL);
-               if (!wait) {
-                       ret = -ENOMEM;
-                       goto bail;
-               }
-               init_completion(&wait->credits_returned);
-               atomic_set(&wait->count, 2);
-               wait->code = PRC_OK;
-
-               credit_cb = diagpkt_complete;
-               credit_arg = wait;
-       }
-
-retry:
-       pbuf = sc_buffer_alloc(sc, total_len, credit_cb, credit_arg);
-       if (!pbuf) {
-               if (trycount == 0) {
-                       /* force a credit return and try again */
-                       sc_return_credits(sc);
-                       trycount = 1;
-                       goto retry;
-               }
-               /*
-                * No send buffer means no credit callback.  Undo
-                * the wait set-up that was done above.  We free wait
-                * because the callback will never be called.
-                */
-               if (dp->flags & F_DIAGPKT_WAIT) {
-                       sc_del_credit_return_intr(sc);
-                       kfree(wait);
-                       wait = NULL;
-               }
-               ret = -ENOSPC;
-               goto bail;
-       }
-
-       pio_copy(dd, pbuf, dp->pbc, tmpbuf, pkt_len);
-       /* no flush needed as the HW knows the packet size */
-
-       ret = sizeof(*dp);
-
-       if (dp->flags & F_DIAGPKT_WAIT) {
-               /* wait for credit return */
-               ret = wait_for_completion_interruptible(
-                                               &wait->credits_returned);
-               /*
-                * If the wait returns an error, the wait was interrupted,
-                * e.g. with a ^C in the user program.  The callback is
-                * still pending.  This is OK as the wait structure is
-                * kmalloc'ed and the structure will free itself when
-                * all users are done with it.
-                *
-                * A context disable occurs on a send context restart, so
-                * include that in the list of errors below to check for.
-                * NOTE: PRC_FILL_ERR is at best informational and cannot
-                * be depended on.
-                */
-               if (!ret && (((wait->code & PRC_STATUS_ERR) ||
-                             (wait->code & PRC_FILL_ERR) ||
-                             (wait->code & PRC_SC_DISABLE))))
-                       ret = -EIO;
-
-               put_diagpkt_wait(wait); /* finished with the structure */
-               sc_del_credit_return_intr(sc);
-       }
-
-bail:
-       vfree(tmpbuf);
-       return ret;
-}
-
-static ssize_t diagpkt_write(struct file *fp, const char __user *data,
-                            size_t count, loff_t *off)
-{
-       struct hfi1_devdata *dd;
-       struct send_context *sc;
-       u8 vl;
-
-       struct diag_pkt dp;
-
-       if (count != sizeof(dp))
-               return -EINVAL;
-
-       if (copy_from_user(&dp, data, sizeof(dp)))
-               return -EFAULT;
-
-       /*
-       * The Send Context is derived from the PbcVL value
-       * if PBC is populated
-       */
-       if (dp.pbc) {
-               dd = hfi1_lookup(dp.unit);
-               if (!dd)
-                       return -ENODEV;
-               vl = (dp.pbc >> PBC_VL_SHIFT) & PBC_VL_MASK;
-               sc = dd->vld[vl].sc;
-               if (sc) {
-                       dp.sw_index = sc->sw_index;
-                       hfi1_cdbg(
-                              PKT,
-                              "Packet sent over VL %d via Send Context %u(%u)",
-                              vl, sc->sw_index, sc->hw_context);
-               }
-       }
-
-       return diagpkt_send(&dp);
-}
-
-static int hfi1_snoop_add(struct hfi1_devdata *dd, const char *name)
-{
-       int ret = 0;
-
-       dd->hfi1_snoop.mode_flag = 0;
-       spin_lock_init(&dd->hfi1_snoop.snoop_lock);
-       INIT_LIST_HEAD(&dd->hfi1_snoop.queue);
-       init_waitqueue_head(&dd->hfi1_snoop.waitq);
-
-       ret = hfi1_cdev_init(HFI1_SNOOP_CAPTURE_BASE + dd->unit, name,
-                            &snoop_file_ops,
-                            &dd->hfi1_snoop.cdev, &dd->hfi1_snoop.class_dev,
-                            false);
-
-       if (ret) {
-               dd_dev_err(dd, "Couldn't create %s device: %d", name, ret);
-               hfi1_cdev_cleanup(&dd->hfi1_snoop.cdev,
-                                 &dd->hfi1_snoop.class_dev);
-       }
-
-       return ret;
-}
-
-static struct hfi1_devdata *hfi1_dd_from_sc_inode(struct inode *in)
-{
-       int unit = iminor(in) - HFI1_SNOOP_CAPTURE_BASE;
-       struct hfi1_devdata *dd;
-
-       dd = hfi1_lookup(unit);
-       return dd;
-}
-
-/* clear or restore send context integrity checks */
-static void adjust_integrity_checks(struct hfi1_devdata *dd)
-{
-       struct send_context *sc;
-       unsigned long sc_flags;
-       int i;
-
-       spin_lock_irqsave(&dd->sc_lock, sc_flags);
-       for (i = 0; i < dd->num_send_contexts; i++) {
-               int enable;
-
-               sc = dd->send_contexts[i].sc;
-
-               if (!sc)
-                       continue;       /* not allocated */
-
-               enable = likely(!HFI1_CAP_IS_KSET(NO_INTEGRITY)) &&
-                        dd->hfi1_snoop.mode_flag != HFI1_PORT_SNOOP_MODE;
-
-               set_pio_integrity(sc);
-
-               if (enable) /* take HFI_CAP_* flags into account */
-                       hfi1_init_ctxt(sc);
-       }
-       spin_unlock_irqrestore(&dd->sc_lock, sc_flags);
-}
-
-static int hfi1_snoop_open(struct inode *in, struct file *fp)
-{
-       int ret;
-       int mode_flag = 0;
-       unsigned long flags = 0;
-       struct hfi1_devdata *dd;
-       struct list_head *queue;
-
-       mutex_lock(&hfi1_mutex);
-
-       dd = hfi1_dd_from_sc_inode(in);
-       if (!dd) {
-               ret = -ENODEV;
-               goto bail;
-       }
-
-       /*
-        * File mode determines snoop or capture. Some existing user
-        * applications expect the capture device to be able to be opened RDWR
-        * because they expect a dedicated capture device. For this reason we
-        * support a module param to force capture mode even if the file open
-        * mode matches snoop.
-        */
-       if ((fp->f_flags & O_ACCMODE) == O_RDONLY) {
-               snoop_dbg("Capture Enabled");
-               mode_flag = HFI1_PORT_CAPTURE_MODE;
-       } else if ((fp->f_flags & O_ACCMODE) == O_RDWR) {
-               snoop_dbg("Snoop Enabled");
-               mode_flag = HFI1_PORT_SNOOP_MODE;
-       } else {
-               snoop_dbg("Invalid");
-               ret =  -EINVAL;
-               goto bail;
-       }
-       queue = &dd->hfi1_snoop.queue;
-
-       /*
-        * We are not supporting snoop and capture at the same time.
-        */
-       spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
-       if (dd->hfi1_snoop.mode_flag) {
-               ret = -EBUSY;
-               spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-               goto bail;
-       }
-
-       dd->hfi1_snoop.mode_flag = mode_flag;
-       drain_snoop_list(queue);
-
-       dd->hfi1_snoop.filter_callback = NULL;
-       dd->hfi1_snoop.filter_value = NULL;
-
-       /*
-        * Send side packet integrity checks are not helpful when snooping so
-        * disable and re-enable when we stop snooping.
-        */
-       if (mode_flag == HFI1_PORT_SNOOP_MODE) {
-               /* clear after snoop mode is on */
-               adjust_integrity_checks(dd); /* clear */
-
-               /*
-                * We also do not want to be doing the DLID LMC check for
-                * ingressed packets.
-                */
-               dd->hfi1_snoop.dcc_cfg = read_csr(dd, DCC_CFG_PORT_CONFIG1);
-               write_csr(dd, DCC_CFG_PORT_CONFIG1,
-                         (dd->hfi1_snoop.dcc_cfg >> 32) << 32);
-       }
-
-       /*
-        * As soon as we set these function pointers the recv and send handlers
-        * are active. This is a race condition so we must make sure to drain
-        * the queue and init filter values above. Technically we should add
-        * locking here but all that will happen is on recv a packet will get
-        * allocated and get stuck on the snoop_lock before getting added to the
-        * queue. Same goes for send.
-        */
-       dd->rhf_rcv_function_map = snoop_rhf_rcv_functions;
-       dd->process_pio_send = snoop_send_pio_handler;
-       dd->process_dma_send = snoop_send_pio_handler;
-       dd->pio_inline_send = snoop_inline_pio_send;
-
-       spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-       ret = 0;
-
-bail:
-       mutex_unlock(&hfi1_mutex);
-
-       return ret;
-}
-
-static int hfi1_snoop_release(struct inode *in, struct file *fp)
-{
-       unsigned long flags = 0;
-       struct hfi1_devdata *dd;
-       int mode_flag;
-
-       dd = hfi1_dd_from_sc_inode(in);
-       if (!dd)
-               return -ENODEV;
-
-       spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
-
-       /* clear the snoop mode before re-adjusting send context CSRs */
-       mode_flag = dd->hfi1_snoop.mode_flag;
-       dd->hfi1_snoop.mode_flag = 0;
-
-       /*
-        * Drain the queue and clear the filters we are done with it. Don't
-        * forget to restore the packet integrity checks
-        */
-       drain_snoop_list(&dd->hfi1_snoop.queue);
-       if (mode_flag == HFI1_PORT_SNOOP_MODE) {
-               /* restore after snoop mode is clear */
-               adjust_integrity_checks(dd); /* restore */
-
-               /*
-                * Also should probably reset the DCC_CONFIG1 register for DLID
-                * checking on incoming packets again. Use the value saved when
-                * opening the snoop device.
-                */
-               write_csr(dd, DCC_CFG_PORT_CONFIG1, dd->hfi1_snoop.dcc_cfg);
-       }
-
-       dd->hfi1_snoop.filter_callback = NULL;
-       kfree(dd->hfi1_snoop.filter_value);
-       dd->hfi1_snoop.filter_value = NULL;
-
-       /*
-        * User is done snooping and capturing, return control to the normal
-        * handler. Re-enable SDMA handling.
-        */
-       dd->rhf_rcv_function_map = dd->normal_rhf_rcv_functions;
-       dd->process_pio_send = hfi1_verbs_send_pio;
-       dd->process_dma_send = hfi1_verbs_send_dma;
-       dd->pio_inline_send = pio_copy;
-
-       spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-
-       snoop_dbg("snoop/capture device released");
-
-       return 0;
-}
-
-static unsigned int hfi1_snoop_poll(struct file *fp,
-                                   struct poll_table_struct *wait)
-{
-       int ret = 0;
-       unsigned long flags = 0;
-
-       struct hfi1_devdata *dd;
-
-       dd = hfi1_dd_from_sc_inode(fp->f_inode);
-       if (!dd)
-               return -ENODEV;
-
-       spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
-
-       poll_wait(fp, &dd->hfi1_snoop.waitq, wait);
-       if (!list_empty(&dd->hfi1_snoop.queue))
-               ret |= POLLIN | POLLRDNORM;
-
-       spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-       return ret;
-}
-
-static ssize_t hfi1_snoop_write(struct file *fp, const char __user *data,
-                               size_t count, loff_t *off)
-{
-       struct diag_pkt dpkt;
-       struct hfi1_devdata *dd;
-       size_t ret;
-       u8 byte_two, sl, sc5, sc4, vl, byte_one;
-       struct send_context *sc;
-       u32 len;
-       u64 pbc;
-       struct hfi1_ibport *ibp;
-       struct hfi1_pportdata *ppd;
-
-       dd = hfi1_dd_from_sc_inode(fp->f_inode);
-       if (!dd)
-               return -ENODEV;
-
-       ppd = dd->pport;
-       snoop_dbg("received %lu bytes from user", count);
-
-       memset(&dpkt, 0, sizeof(struct diag_pkt));
-       dpkt.version = _DIAG_PKT_VERS;
-       dpkt.unit = dd->unit;
-       dpkt.port = 1;
-
-       if (likely(!(snoop_flags & SNOOP_USE_METADATA))) {
-               /*
-               * We need to generate the PBC and not let diagpkt_send do it,
-               * to do this we need the VL and the length in dwords.
-               * The VL can be determined by using the SL and looking up the
-               * SC. Then the SC can be converted into VL. The exception to
-               * this is those packets which are from an SMI queue pair.
-               * Since we can't detect anything about the QP here we have to
-               * rely on the SC. If its 0xF then we assume its SMI and
-               * do not look at the SL.
-               */
-               if (copy_from_user(&byte_one, data, 1))
-                       return -EINVAL;
-
-               if (copy_from_user(&byte_two, data + 1, 1))
-                       return -EINVAL;
-
-               sc4 = (byte_one >> 4) & 0xf;
-               if (sc4 == 0xF) {
-                       snoop_dbg("Detected VL15 packet ignoring SL in packet");
-                       vl = sc4;
-               } else {
-                       sl = (byte_two >> 4) & 0xf;
-                       ibp = to_iport(&dd->verbs_dev.rdi.ibdev, 1);
-                       sc5 = ibp->sl_to_sc[sl];
-                       vl = sc_to_vlt(dd, sc5);
-                       if (vl != sc4) {
-                               snoop_dbg("VL %d does not match SC %d of packet",
-                                         vl, sc4);
-                               return -EINVAL;
-                       }
-               }
-
-               sc = dd->vld[vl].sc; /* Look up the context based on VL */
-               if (sc) {
-                       dpkt.sw_index = sc->sw_index;
-                       snoop_dbg("Sending on context %u(%u)", sc->sw_index,
-                                 sc->hw_context);
-               } else {
-                       snoop_dbg("Could not find context for vl %d", vl);
-                       return -EINVAL;
-               }
-
-               len = (count >> 2) + 2; /* Add in PBC */
-               pbc = create_pbc(ppd, 0, 0, vl, len);
-       } else {
-               if (copy_from_user(&pbc, data, sizeof(pbc)))
-                       return -EINVAL;
-               vl = (pbc >> PBC_VL_SHIFT) & PBC_VL_MASK;
-               sc = dd->vld[vl].sc; /* Look up the context based on VL */
-               if (sc) {
-                       dpkt.sw_index = sc->sw_index;
-               } else {
-                       snoop_dbg("Could not find context for vl %d", vl);
-                       return -EINVAL;
-               }
-               data += sizeof(pbc);
-               count -= sizeof(pbc);
-       }
-       dpkt.len = count;
-       dpkt.data = (unsigned long)data;
-
-       snoop_dbg("PBC: vl=0x%llx Length=0x%llx",
-                 (pbc >> 12) & 0xf,
-                 (pbc & 0xfff));
-
-       dpkt.pbc = pbc;
-       ret = diagpkt_send(&dpkt);
-       /*
-        * diagpkt_send only returns number of bytes in the diagpkt so patch
-        * that up here before returning.
-        */
-       if (ret == sizeof(dpkt))
-               return count;
-
-       return ret;
-}
-
-static ssize_t hfi1_snoop_read(struct file *fp, char __user *data,
-                              size_t pkt_len, loff_t *off)
-{
-       ssize_t ret = 0;
-       unsigned long flags = 0;
-       struct snoop_packet *packet = NULL;
-       struct hfi1_devdata *dd;
-
-       dd = hfi1_dd_from_sc_inode(fp->f_inode);
-       if (!dd)
-               return -ENODEV;
-
-       spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
-
-       while (list_empty(&dd->hfi1_snoop.queue)) {
-               spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-
-               if (fp->f_flags & O_NONBLOCK)
-                       return -EAGAIN;
-
-               if (wait_event_interruptible(
-                               dd->hfi1_snoop.waitq,
-                               !list_empty(&dd->hfi1_snoop.queue)))
-                       return -EINTR;
-
-               spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
-       }
-
-       if (!list_empty(&dd->hfi1_snoop.queue)) {
-               packet = list_entry(dd->hfi1_snoop.queue.next,
-                                   struct snoop_packet, list);
-               list_del(&packet->list);
-               spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-               if (pkt_len >= packet->total_len) {
-                       if (copy_to_user(data, packet->data,
-                                        packet->total_len))
-                               ret = -EFAULT;
-                       else
-                               ret = packet->total_len;
-               } else {
-                       ret = -EINVAL;
-               }
-
-               kfree(packet);
-       } else {
-               spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-       }
-
-       return ret;
-}
-
-/**
- * hfi1_assign_snoop_link_credits -- Set up credits for VL15 and others
- * @ppd : ptr to hfi1 port data
- * @value : options from user space
- *
- * Assumes the rest of the CM credit registers are zero from a
- * previous global or credit reset.
- * Leave shared count at zero for both global and all vls.
- * In snoop mode ideally we don't use shared credits
- * Reserve 8.5k for VL15
- * If total credits less than 8.5kbytes return error.
- * Divide the rest of the credits across VL0 to VL7 and if
- * each of these levels has less than 34 credits (at least 2048 + 128 bytes)
- * return with an error.
- * The credit registers will be reset to zero on link negotiation or link up
- * so this function should be activated from user space only if the port has
- * gone past link negotiation and link up.
- *
- * Return -- 0 if successful else error condition
- *
- */
-static long hfi1_assign_snoop_link_credits(struct hfi1_pportdata *ppd,
-                                          int value)
-{
-#define  OPA_MIN_PER_VL_CREDITS  34  /* 2048 + 128 bytes */
-       struct buffer_control t;
-       int i;
-       struct hfi1_devdata *dd = ppd->dd;
-       u16  total_credits = (value >> 16) & 0xffff;
-       u16  vl15_credits = dd->vl15_init / 2;
-       u16  per_vl_credits;
-       __be16 be_per_vl_credits;
-
-       if (!(ppd->host_link_state & HLS_UP))
-               goto err_exit;
-       if (total_credits  <  vl15_credits)
-               goto err_exit;
-
-       per_vl_credits = (total_credits - vl15_credits) / TXE_NUM_DATA_VL;
-
-       if (per_vl_credits < OPA_MIN_PER_VL_CREDITS)
-               goto err_exit;
-
-       memset(&t, 0, sizeof(t));
-       be_per_vl_credits = cpu_to_be16(per_vl_credits);
-
-       for (i = 0; i < TXE_NUM_DATA_VL; i++)
-               t.vl[i].dedicated = be_per_vl_credits;
-
-       t.vl[15].dedicated  = cpu_to_be16(vl15_credits);
-       return set_buffer_control(ppd, &t);
-
-err_exit:
-       snoop_dbg("port_state = 0x%x, total_credits = %d, vl15_credits = %d",
-                 ppd->host_link_state, total_credits, vl15_credits);
-
-       return -EINVAL;
-}
-
-static long hfi1_ioctl(struct file *fp, unsigned int cmd, unsigned long arg)
-{
-       struct hfi1_devdata *dd;
-       void *filter_value = NULL;
-       long ret = 0;
-       int value = 0;
-       u8 phys_state = 0;
-       u8 link_state = 0;
-       u16 dev_state = 0;
-       unsigned long flags = 0;
-       unsigned long *argp = NULL;
-       struct hfi1_packet_filter_command filter_cmd = {0};
-       int mode_flag = 0;
-       struct hfi1_pportdata *ppd = NULL;
-       unsigned int index;
-       struct hfi1_link_info link_info;
-       int read_cmd, write_cmd, read_ok, write_ok;
-
-       dd = hfi1_dd_from_sc_inode(fp->f_inode);
-       if (!dd)
-               return -ENODEV;
-
-       mode_flag = dd->hfi1_snoop.mode_flag;
-       read_cmd = _IOC_DIR(cmd) & _IOC_READ;
-       write_cmd = _IOC_DIR(cmd) & _IOC_WRITE;
-       write_ok = access_ok(VERIFY_WRITE, (void __user *)arg, _IOC_SIZE(cmd));
-       read_ok = access_ok(VERIFY_READ, (void __user *)arg, _IOC_SIZE(cmd));
-
-       if ((read_cmd && !write_ok) || (write_cmd && !read_ok))
-               return -EFAULT;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-
-       if ((mode_flag & HFI1_PORT_CAPTURE_MODE) &&
-           (cmd != HFI1_SNOOP_IOCCLEARQUEUE) &&
-           (cmd != HFI1_SNOOP_IOCCLEARFILTER) &&
-           (cmd != HFI1_SNOOP_IOCSETFILTER))
-               /* Capture devices are allowed only 3 operations
-                * 1.Clear capture queue
-                * 2.Clear capture filter
-                * 3.Set capture filter
-                * Other are invalid.
-                */
-               return -EINVAL;
-
-       switch (cmd) {
-       case HFI1_SNOOP_IOCSETLINKSTATE_EXTRA:
-               memset(&link_info, 0, sizeof(link_info));
-
-               if (copy_from_user(&link_info,
-                                  (struct hfi1_link_info __user *)arg,
-                                  sizeof(link_info)))
-                       return -EFAULT;
-
-               value = link_info.port_state;
-               index = link_info.port_number;
-               if (index > dd->num_pports - 1)
-                       return -EINVAL;
-
-               ppd = &dd->pport[index];
-               if (!ppd)
-                       return -EINVAL;
-
-               /* What we want to transition to */
-               phys_state = (value >> 4) & 0xF;
-               link_state = value & 0xF;
-               snoop_dbg("Setting link state 0x%x", value);
-
-               switch (link_state) {
-               case IB_PORT_NOP:
-                       if (phys_state == 0)
-                               break;
-                               /* fall through */
-               case IB_PORT_DOWN:
-                       switch (phys_state) {
-                       case 0:
-                               dev_state = HLS_DN_DOWNDEF;
-                               break;
-                       case 2:
-                               dev_state = HLS_DN_POLL;
-                               break;
-                       case 3:
-                               dev_state = HLS_DN_DISABLE;
-                               break;
-                       default:
-                               return -EINVAL;
-                       }
-                       ret = set_link_state(ppd, dev_state);
-                       break;
-               case IB_PORT_ARMED:
-                       ret = set_link_state(ppd, HLS_UP_ARMED);
-                       if (!ret)
-                               send_idle_sma(dd, SMA_IDLE_ARM);
-                       break;
-               case IB_PORT_ACTIVE:
-                       ret = set_link_state(ppd, HLS_UP_ACTIVE);
-                       if (!ret)
-                               send_idle_sma(dd, SMA_IDLE_ACTIVE);
-                       break;
-               default:
-                       return -EINVAL;
-               }
-
-               if (ret)
-                       break;
-               /* fall through */
-       case HFI1_SNOOP_IOCGETLINKSTATE:
-       case HFI1_SNOOP_IOCGETLINKSTATE_EXTRA:
-               if (cmd == HFI1_SNOOP_IOCGETLINKSTATE_EXTRA) {
-                       memset(&link_info, 0, sizeof(link_info));
-                       if (copy_from_user(&link_info,
-                                          (struct hfi1_link_info __user *)arg,
-                                          sizeof(link_info)))
-                               return -EFAULT;
-                       index = link_info.port_number;
-               } else {
-                       ret = __get_user(index, (int __user *)arg);
-                       if (ret !=  0)
-                               break;
-               }
-
-               if (index > dd->num_pports - 1)
-                       return -EINVAL;
-
-               ppd = &dd->pport[index];
-               if (!ppd)
-                       return -EINVAL;
-
-               value = hfi1_ibphys_portstate(ppd);
-               value <<= 4;
-               value |= driver_lstate(ppd);
-
-               snoop_dbg("Link port | Link State: %d", value);
-
-               if ((cmd == HFI1_SNOOP_IOCGETLINKSTATE_EXTRA) ||
-                   (cmd == HFI1_SNOOP_IOCSETLINKSTATE_EXTRA)) {
-                       link_info.port_state = value;
-                       link_info.node_guid = cpu_to_be64(ppd->guid);
-                       link_info.link_speed_active =
-                                               ppd->link_speed_active;
-                       link_info.link_width_active =
-                                               ppd->link_width_active;
-                       if (copy_to_user((struct hfi1_link_info __user *)arg,
-                                        &link_info, sizeof(link_info)))
-                               return -EFAULT;
-               } else {
-                       ret = __put_user(value, (int __user *)arg);
-               }
-               break;
-
-       case HFI1_SNOOP_IOCCLEARQUEUE:
-               snoop_dbg("Clearing snoop queue");
-               spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
-               drain_snoop_list(&dd->hfi1_snoop.queue);
-               spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-               break;
-
-       case HFI1_SNOOP_IOCCLEARFILTER:
-               snoop_dbg("Clearing filter");
-               spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
-               if (dd->hfi1_snoop.filter_callback) {
-                       /* Drain packets first */
-                       drain_snoop_list(&dd->hfi1_snoop.queue);
-                       dd->hfi1_snoop.filter_callback = NULL;
-               }
-               kfree(dd->hfi1_snoop.filter_value);
-               dd->hfi1_snoop.filter_value = NULL;
-               spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-               break;
-
-       case HFI1_SNOOP_IOCSETFILTER:
-               snoop_dbg("Setting filter");
-               /* just copy command structure */
-               argp = (unsigned long *)arg;
-               if (copy_from_user(&filter_cmd, (void __user *)argp,
-                                  sizeof(filter_cmd)))
-                       return -EFAULT;
-
-               if (filter_cmd.opcode >= HFI1_MAX_FILTERS) {
-                       pr_alert("Invalid opcode in request\n");
-                       return -EINVAL;
-               }
-
-               snoop_dbg("Opcode %d Len %d Ptr %p",
-                         filter_cmd.opcode, filter_cmd.length,
-                         filter_cmd.value_ptr);
-
-               filter_value = kcalloc(filter_cmd.length, sizeof(u8),
-                                      GFP_KERNEL);
-               if (!filter_value)
-                       return -ENOMEM;
-
-               /* copy remaining data from userspace */
-               if (copy_from_user((u8 *)filter_value,
-                                  (void __user *)filter_cmd.value_ptr,
-                                  filter_cmd.length)) {
-                       kfree(filter_value);
-                       return -EFAULT;
-               }
-               /* Drain packets first */
-               spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
-               drain_snoop_list(&dd->hfi1_snoop.queue);
-               dd->hfi1_snoop.filter_callback =
-                       hfi1_filters[filter_cmd.opcode].filter;
-               /* just in case we see back to back sets */
-               kfree(dd->hfi1_snoop.filter_value);
-               dd->hfi1_snoop.filter_value = filter_value;
-               spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-               break;
-       case HFI1_SNOOP_IOCGETVERSION:
-               value = SNOOP_CAPTURE_VERSION;
-               snoop_dbg("Getting version: %d", value);
-               ret = __put_user(value, (int __user *)arg);
-               break;
-       case HFI1_SNOOP_IOCSET_OPTS:
-               snoop_flags = 0;
-               ret = __get_user(value, (int __user *)arg);
-               if (ret != 0)
-                       break;
-
-               snoop_dbg("Setting snoop option %d", value);
-               if (value & SNOOP_DROP_SEND)
-                       snoop_flags |= SNOOP_DROP_SEND;
-               if (value & SNOOP_USE_METADATA)
-                       snoop_flags |= SNOOP_USE_METADATA;
-               if (value & (SNOOP_SET_VL0TOVL15)) {
-                       ppd = &dd->pport[0];  /* first port will do */
-                       ret = hfi1_assign_snoop_link_credits(ppd, value);
-               }
-               break;
-       default:
-               return -ENOTTY;
-       }
-
-       return ret;
-}
-
-static void snoop_list_add_tail(struct snoop_packet *packet,
-                               struct hfi1_devdata *dd)
-{
-       unsigned long flags = 0;
-
-       spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
-       if (likely((dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE) ||
-                  (dd->hfi1_snoop.mode_flag & HFI1_PORT_CAPTURE_MODE))) {
-               list_add_tail(&packet->list, &dd->hfi1_snoop.queue);
-               snoop_dbg("Added packet to list");
-       }
-
-       /*
-        * Technically we can could have closed the snoop device while waiting
-        * on the above lock and it is gone now. The snoop mode_flag will
-        * prevent us from adding the packet to the queue though.
-        */
-
-       spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
-       wake_up_interruptible(&dd->hfi1_snoop.waitq);
-}
-
-static inline int hfi1_filter_check(void *val, const char *msg)
-{
-       if (!val) {
-               snoop_dbg("Error invalid %s value for filter", msg);
-               return HFI1_FILTER_ERR;
-       }
-       return 0;
-}
-
-static int hfi1_filter_lid(void *ibhdr, void *packet_data, void *value)
-{
-       struct hfi1_ib_header *hdr;
-       int ret;
-
-       ret = hfi1_filter_check(ibhdr, "header");
-       if (ret)
-               return ret;
-       ret = hfi1_filter_check(value, "user");
-       if (ret)
-               return ret;
-       hdr = (struct hfi1_ib_header *)ibhdr;
-
-       if (*((u16 *)value) == be16_to_cpu(hdr->lrh[3])) /* matches slid */
-               return HFI1_FILTER_HIT; /* matched */
-
-       return HFI1_FILTER_MISS; /* Not matched */
-}
-
-static int hfi1_filter_dlid(void *ibhdr, void *packet_data, void *value)
-{
-       struct hfi1_ib_header *hdr;
-       int ret;
-
-       ret = hfi1_filter_check(ibhdr, "header");
-       if (ret)
-               return ret;
-       ret = hfi1_filter_check(value, "user");
-       if (ret)
-               return ret;
-
-       hdr = (struct hfi1_ib_header *)ibhdr;
-
-       if (*((u16 *)value) == be16_to_cpu(hdr->lrh[1]))
-               return HFI1_FILTER_HIT;
-
-       return HFI1_FILTER_MISS;
-}
-
-/* Not valid for outgoing packets, send handler passes null for data*/
-static int hfi1_filter_mad_mgmt_class(void *ibhdr, void *packet_data,
-                                     void *value)
-{
-       struct hfi1_ib_header *hdr;
-       struct hfi1_other_headers *ohdr = NULL;
-       struct ib_smp *smp = NULL;
-       u32 qpn = 0;
-       int ret;
-
-       ret = hfi1_filter_check(ibhdr, "header");
-       if (ret)
-               return ret;
-       ret = hfi1_filter_check(packet_data, "packet_data");
-       if (ret)
-               return ret;
-       ret = hfi1_filter_check(value, "user");
-       if (ret)
-               return ret;
-
-       hdr = (struct hfi1_ib_header *)ibhdr;
-
-       /* Check for GRH */
-       if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH)
-               ohdr = &hdr->u.oth; /* LRH + BTH + DETH */
-       else
-               ohdr = &hdr->u.l.oth; /* LRH + GRH + BTH + DETH */
-
-       qpn = be32_to_cpu(ohdr->bth[1]) & 0x00FFFFFF;
-       if (qpn <= 1) {
-               smp = (struct ib_smp *)packet_data;
-               if (*((u8 *)value) == smp->mgmt_class)
-                       return HFI1_FILTER_HIT;
-               else
-                       return HFI1_FILTER_MISS;
-       }
-       return HFI1_FILTER_ERR;
-}
-
-static int hfi1_filter_qp_number(void *ibhdr, void *packet_data, void *value)
-{
-       struct hfi1_ib_header *hdr;
-       struct hfi1_other_headers *ohdr = NULL;
-       int ret;
-
-       ret = hfi1_filter_check(ibhdr, "header");
-       if (ret)
-               return ret;
-       ret = hfi1_filter_check(value, "user");
-       if (ret)
-               return ret;
-
-       hdr = (struct hfi1_ib_header *)ibhdr;
-
-       /* Check for GRH */
-       if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH)
-               ohdr = &hdr->u.oth; /* LRH + BTH + DETH */
-       else
-               ohdr = &hdr->u.l.oth; /* LRH + GRH + BTH + DETH */
-       if (*((u32 *)value) == (be32_to_cpu(ohdr->bth[1]) & 0x00FFFFFF))
-               return HFI1_FILTER_HIT;
-
-       return HFI1_FILTER_MISS;
-}
-
-static int hfi1_filter_ibpacket_type(void *ibhdr, void *packet_data,
-                                    void *value)
-{
-       u32 lnh = 0;
-       u8 opcode = 0;
-       struct hfi1_ib_header *hdr;
-       struct hfi1_other_headers *ohdr = NULL;
-       int ret;
-
-       ret = hfi1_filter_check(ibhdr, "header");
-       if (ret)
-               return ret;
-       ret = hfi1_filter_check(value, "user");
-       if (ret)
-               return ret;
-
-       hdr = (struct hfi1_ib_header *)ibhdr;
-
-       lnh = (be16_to_cpu(hdr->lrh[0]) & 3);
-
-       if (lnh == HFI1_LRH_BTH)
-               ohdr = &hdr->u.oth;
-       else if (lnh == HFI1_LRH_GRH)
-               ohdr = &hdr->u.l.oth;
-       else
-               return HFI1_FILTER_ERR;
-
-       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
-
-       if (*((u8 *)value) == ((opcode >> 5) & 0x7))
-               return HFI1_FILTER_HIT;
-
-       return HFI1_FILTER_MISS;
-}
-
-static int hfi1_filter_ib_service_level(void *ibhdr, void *packet_data,
-                                       void *value)
-{
-       struct hfi1_ib_header *hdr;
-       int ret;
-
-       ret = hfi1_filter_check(ibhdr, "header");
-       if (ret)
-               return ret;
-       ret = hfi1_filter_check(value, "user");
-       if (ret)
-               return ret;
-
-       hdr = (struct hfi1_ib_header *)ibhdr;
-
-       if ((*((u8 *)value)) == ((be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF))
-               return HFI1_FILTER_HIT;
-
-       return HFI1_FILTER_MISS;
-}
-
-static int hfi1_filter_ib_pkey(void *ibhdr, void *packet_data, void *value)
-{
-       u32 lnh = 0;
-       struct hfi1_ib_header *hdr;
-       struct hfi1_other_headers *ohdr = NULL;
-       int ret;
-
-       ret = hfi1_filter_check(ibhdr, "header");
-       if (ret)
-               return ret;
-       ret = hfi1_filter_check(value, "user");
-       if (ret)
-               return ret;
-
-       hdr = (struct hfi1_ib_header *)ibhdr;
-
-       lnh = (be16_to_cpu(hdr->lrh[0]) & 3);
-       if (lnh == HFI1_LRH_BTH)
-               ohdr = &hdr->u.oth;
-       else if (lnh == HFI1_LRH_GRH)
-               ohdr = &hdr->u.l.oth;
-       else
-               return HFI1_FILTER_ERR;
-
-       /* P_key is 16-bit entity, however top most bit indicates
-        * type of membership. 0 for limited and 1 for Full.
-        * Limited members cannot accept information from other
-        * Limited members, but communication is allowed between
-        * every other combination of membership.
-        * Hence we'll omit comparing top-most bit while filtering
-        */
-
-       if ((*(u16 *)value & 0x7FFF) ==
-               ((be32_to_cpu(ohdr->bth[0])) & 0x7FFF))
-               return HFI1_FILTER_HIT;
-
-       return HFI1_FILTER_MISS;
-}
-
-/*
- * If packet_data is NULL then this is coming from one of the send functions.
- * Thus we know if its an ingressed or egressed packet.
- */
-static int hfi1_filter_direction(void *ibhdr, void *packet_data, void *value)
-{
-       u8 user_dir = *(u8 *)value;
-       int ret;
-
-       ret = hfi1_filter_check(value, "user");
-       if (ret)
-               return ret;
-
-       if (packet_data) {
-               /* Incoming packet */
-               if (user_dir & HFI1_SNOOP_INGRESS)
-                       return HFI1_FILTER_HIT;
-       } else {
-               /* Outgoing packet */
-               if (user_dir & HFI1_SNOOP_EGRESS)
-                       return HFI1_FILTER_HIT;
-       }
-
-       return HFI1_FILTER_MISS;
-}
-
-/*
- * Allocate a snoop packet. The structure that is stored in the ring buffer, not
- * to be confused with an hfi packet type.
- */
-static struct snoop_packet *allocate_snoop_packet(u32 hdr_len,
-                                                 u32 data_len,
-                                                 u32 md_len)
-{
-       struct snoop_packet *packet;
-
-       packet = kzalloc(sizeof(*packet) + hdr_len + data_len
-                        + md_len,
-                        GFP_ATOMIC | __GFP_NOWARN);
-       if (likely(packet))
-               INIT_LIST_HEAD(&packet->list);
-
-       return packet;
-}
-
-/*
- * Instead of having snoop and capture code intermixed with the recv functions,
- * both the interrupt handler and hfi1_ib_rcv() we are going to hijack the call
- * and land in here for snoop/capture but if not enabled the call will go
- * through as before. This gives us a single point to constrain all of the snoop
- * snoop recv logic. There is nothing special that needs to happen for bypass
- * packets. This routine should not try to look into the packet. It just copied
- * it. There is no guarantee for filters when it comes to bypass packets as
- * there is no specific support. Bottom line is this routine does now even know
- * what a bypass packet is.
- */
-int snoop_recv_handler(struct hfi1_packet *packet)
-{
-       struct hfi1_pportdata *ppd = packet->rcd->ppd;
-       struct hfi1_ib_header *hdr = packet->hdr;
-       int header_size = packet->hlen;
-       void *data = packet->ebuf;
-       u32 tlen = packet->tlen;
-       struct snoop_packet *s_packet = NULL;
-       int ret;
-       int snoop_mode = 0;
-       u32 md_len = 0;
-       struct capture_md md;
-
-       snoop_dbg("PACKET IN: hdr size %d tlen %d data %p", header_size, tlen,
-                 data);
-
-       trace_snoop_capture(ppd->dd, header_size, hdr, tlen - header_size,
-                           data);
-
-       if (!ppd->dd->hfi1_snoop.filter_callback) {
-               snoop_dbg("filter not set");
-               ret = HFI1_FILTER_HIT;
-       } else {
-               ret = ppd->dd->hfi1_snoop.filter_callback(hdr, data,
-                                       ppd->dd->hfi1_snoop.filter_value);
-       }
-
-       switch (ret) {
-       case HFI1_FILTER_ERR:
-               snoop_dbg("Error in filter call");
-               break;
-       case HFI1_FILTER_MISS:
-               snoop_dbg("Filter Miss");
-               break;
-       case HFI1_FILTER_HIT:
-
-               if (ppd->dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE)
-                       snoop_mode = 1;
-               if ((snoop_mode == 0) ||
-                   unlikely(snoop_flags & SNOOP_USE_METADATA))
-                       md_len = sizeof(struct capture_md);
-
-               s_packet = allocate_snoop_packet(header_size,
-                                                tlen - header_size,
-                                                md_len);
-
-               if (unlikely(!s_packet)) {
-                       dd_dev_warn_ratelimited(ppd->dd, "Unable to allocate snoop/capture packet\n");
-                       break;
-               }
-
-               if (md_len > 0) {
-                       memset(&md, 0, sizeof(struct capture_md));
-                       md.port = 1;
-                       md.dir = PKT_DIR_INGRESS;
-                       md.u.rhf = packet->rhf;
-                       memcpy(s_packet->data, &md, md_len);
-               }
-
-               /* We should always have a header */
-               if (hdr) {
-                       memcpy(s_packet->data + md_len, hdr, header_size);
-               } else {
-                       dd_dev_err(ppd->dd, "Unable to copy header to snoop/capture packet\n");
-                       kfree(s_packet);
-                       break;
-               }
-
-               /*
-                * Packets with no data are possible. If there is no data needed
-                * to take care of the last 4 bytes which are normally included
-                * with data buffers and are included in tlen.  Since we kzalloc
-                * the buffer we do not need to set any values but if we decide
-                * not to use kzalloc we should zero them.
-                */
-               if (data)
-                       memcpy(s_packet->data + header_size + md_len, data,
-                              tlen - header_size);
-
-               s_packet->total_len = tlen + md_len;
-               snoop_list_add_tail(s_packet, ppd->dd);
-
-               /*
-                * If we are snooping the packet not capturing then throw away
-                * after adding to the list.
-                */
-               snoop_dbg("Capturing packet");
-               if (ppd->dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE) {
-                       snoop_dbg("Throwing packet away");
-                       /*
-                        * If we are dropping the packet we still may need to
-                        * handle the case where error flags are set, this is
-                        * normally done by the type specific handler but that
-                        * won't be called in this case.
-                        */
-                       if (unlikely(rhf_err_flags(packet->rhf)))
-                               handle_eflags(packet);
-
-                       /* throw the packet on the floor */
-                       return RHF_RCV_CONTINUE;
-               }
-               break;
-       default:
-               break;
-       }
-
-       /*
-        * We do not care what type of packet came in here - just pass it off
-        * to the normal handler.
-        */
-       return ppd->dd->normal_rhf_rcv_functions[rhf_rcv_type(packet->rhf)]
-                       (packet);
-}
-
-/*
- * Handle snooping and capturing packets when sdma is being used.
- */
-int snoop_send_dma_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
-                          u64 pbc)
-{
-       pr_alert("Snooping/Capture of Send DMA Packets Is Not Supported!\n");
-       snoop_dbg("Unsupported Operation");
-       return hfi1_verbs_send_dma(qp, ps, 0);
-}
-
-/*
- * Handle snooping and capturing packets when pio is being used. Does not handle
- * bypass packets. The only way to send a bypass packet currently is to use the
- * diagpkt interface. When that interface is enable snoop/capture is not.
- */
-int snoop_send_pio_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
-                          u64 pbc)
-{
-       u32 hdrwords = qp->s_hdrwords;
-       struct rvt_sge_state *ss = qp->s_cur_sge;
-       u32 len = qp->s_cur_size;
-       u32 dwords = (len + 3) >> 2;
-       u32 plen = hdrwords + dwords + 2; /* includes pbc */
-       struct hfi1_pportdata *ppd = ps->ppd;
-       struct snoop_packet *s_packet = NULL;
-       u32 *hdr = (u32 *)&ps->s_txreq->phdr.hdr;
-       u32 length = 0;
-       struct rvt_sge_state temp_ss;
-       void *data = NULL;
-       void *data_start = NULL;
-       int ret;
-       int snoop_mode = 0;
-       int md_len = 0;
-       struct capture_md md;
-       u32 vl;
-       u32 hdr_len = hdrwords << 2;
-       u32 tlen = HFI1_GET_PKT_LEN(&ps->s_txreq->phdr.hdr);
-
-       md.u.pbc = 0;
-
-       snoop_dbg("PACKET OUT: hdrword %u len %u plen %u dwords %u tlen %u",
-                 hdrwords, len, plen, dwords, tlen);
-       if (ppd->dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE)
-               snoop_mode = 1;
-       if ((snoop_mode == 0) ||
-           unlikely(snoop_flags & SNOOP_USE_METADATA))
-               md_len = sizeof(struct capture_md);
-
-       /* not using ss->total_len as arg 2 b/c that does not count CRC */
-       s_packet = allocate_snoop_packet(hdr_len, tlen - hdr_len, md_len);
-
-       if (unlikely(!s_packet)) {
-               dd_dev_warn_ratelimited(ppd->dd, "Unable to allocate snoop/capture packet\n");
-               goto out;
-       }
-
-       s_packet->total_len = tlen + md_len;
-
-       if (md_len > 0) {
-               memset(&md, 0, sizeof(struct capture_md));
-               md.port = 1;
-               md.dir = PKT_DIR_EGRESS;
-               if (likely(pbc == 0)) {
-                       vl = be16_to_cpu(ps->s_txreq->phdr.hdr.lrh[0]) >> 12;
-                       md.u.pbc = create_pbc(ppd, 0, qp->s_srate, vl, plen);
-               } else {
-                       md.u.pbc = 0;
-               }
-               memcpy(s_packet->data, &md, md_len);
-       } else {
-               md.u.pbc = pbc;
-       }
-
-       /* Copy header */
-       if (likely(hdr)) {
-               memcpy(s_packet->data + md_len, hdr, hdr_len);
-       } else {
-               dd_dev_err(ppd->dd,
-                          "Unable to copy header to snoop/capture packet\n");
-               kfree(s_packet);
-               goto out;
-       }
-
-       if (ss) {
-               data = s_packet->data + hdr_len + md_len;
-               data_start = data;
-
-               /*
-                * Copy SGE State
-                * The update_sge() function below will not modify the
-                * individual SGEs in the array. It will make a copy each time
-                * and operate on that. So we only need to copy this instance
-                * and it won't impact PIO.
-                */
-               temp_ss = *ss;
-               length = len;
-
-               snoop_dbg("Need to copy %d bytes", length);
-               while (length) {
-                       void *addr = temp_ss.sge.vaddr;
-                       u32 slen = temp_ss.sge.length;
-
-                       if (slen > length) {
-                               slen = length;
-                               snoop_dbg("slen %d > len %d", slen, length);
-                       }
-                       snoop_dbg("copy %d to %p", slen, addr);
-                       memcpy(data, addr, slen);
-                       update_sge(&temp_ss, slen);
-                       length -= slen;
-                       data += slen;
-                       snoop_dbg("data is now %p bytes left %d", data, length);
-               }
-               snoop_dbg("Completed SGE copy");
-       }
-
-       /*
-        * Why do the filter check down here? Because the event tracing has its
-        * own filtering and we need to have the walked the SGE list.
-        */
-       if (!ppd->dd->hfi1_snoop.filter_callback) {
-               snoop_dbg("filter not set\n");
-               ret = HFI1_FILTER_HIT;
-       } else {
-               ret = ppd->dd->hfi1_snoop.filter_callback(
-                                       &ps->s_txreq->phdr.hdr,
-                                       NULL,
-                                       ppd->dd->hfi1_snoop.filter_value);
-       }
-
-       switch (ret) {
-       case HFI1_FILTER_ERR:
-               snoop_dbg("Error in filter call");
-               /* fall through */
-       case HFI1_FILTER_MISS:
-               snoop_dbg("Filter Miss");
-               kfree(s_packet);
-               break;
-       case HFI1_FILTER_HIT:
-               snoop_dbg("Capturing packet");
-               snoop_list_add_tail(s_packet, ppd->dd);
-
-               if (unlikely((snoop_flags & SNOOP_DROP_SEND) &&
-                            (ppd->dd->hfi1_snoop.mode_flag &
-                             HFI1_PORT_SNOOP_MODE))) {
-                       unsigned long flags;
-
-                       snoop_dbg("Dropping packet");
-                       if (qp->s_wqe) {
-                               spin_lock_irqsave(&qp->s_lock, flags);
-                               hfi1_send_complete(
-                                       qp,
-                                       qp->s_wqe,
-                                       IB_WC_SUCCESS);
-                               spin_unlock_irqrestore(&qp->s_lock, flags);
-                       } else if (qp->ibqp.qp_type == IB_QPT_RC) {
-                               spin_lock_irqsave(&qp->s_lock, flags);
-                               hfi1_rc_send_complete(qp,
-                                                     &ps->s_txreq->phdr.hdr);
-                               spin_unlock_irqrestore(&qp->s_lock, flags);
-                       }
-
-                       /*
-                        * If snoop is dropping the packet we need to put the
-                        * txreq back because no one else will.
-                        */
-                       hfi1_put_txreq(ps->s_txreq);
-                       return 0;
-               }
-               break;
-       default:
-               kfree(s_packet);
-               break;
-       }
-out:
-       return hfi1_verbs_send_pio(qp, ps, md.u.pbc);
-}
-
-/*
- * Callers of this must pass a hfi1_ib_header type for the from ptr. Currently
- * this can be used anywhere, but the intention is for inline ACKs for RC and
- * CCA packets. We don't restrict this usage though.
- */
-void snoop_inline_pio_send(struct hfi1_devdata *dd, struct pio_buf *pbuf,
-                          u64 pbc, const void *from, size_t count)
-{
-       int snoop_mode = 0;
-       int md_len = 0;
-       struct capture_md md;
-       struct snoop_packet *s_packet = NULL;
-
-       /*
-        * count is in dwords so we need to convert to bytes.
-        * We also need to account for CRC which would be tacked on by hardware.
-        */
-       int packet_len = (count << 2) + 4;
-       int ret;
-
-       snoop_dbg("ACK OUT: len %d", packet_len);
-
-       if (!dd->hfi1_snoop.filter_callback) {
-               snoop_dbg("filter not set");
-               ret = HFI1_FILTER_HIT;
-       } else {
-               ret = dd->hfi1_snoop.filter_callback(
-                               (struct hfi1_ib_header *)from,
-                               NULL,
-                               dd->hfi1_snoop.filter_value);
-       }
-
-       switch (ret) {
-       case HFI1_FILTER_ERR:
-               snoop_dbg("Error in filter call");
-               /* fall through */
-       case HFI1_FILTER_MISS:
-               snoop_dbg("Filter Miss");
-               break;
-       case HFI1_FILTER_HIT:
-               snoop_dbg("Capturing packet");
-               if (dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE)
-                       snoop_mode = 1;
-               if ((snoop_mode == 0) ||
-                   unlikely(snoop_flags & SNOOP_USE_METADATA))
-                       md_len = sizeof(struct capture_md);
-
-               s_packet = allocate_snoop_packet(packet_len, 0, md_len);
-
-               if (unlikely(!s_packet)) {
-                       dd_dev_warn_ratelimited(dd, "Unable to allocate snoop/capture packet\n");
-                       goto inline_pio_out;
-               }
-
-               s_packet->total_len = packet_len + md_len;
-
-               /* Fill in the metadata for the packet */
-               if (md_len > 0) {
-                       memset(&md, 0, sizeof(struct capture_md));
-                       md.port = 1;
-                       md.dir = PKT_DIR_EGRESS;
-                       md.u.pbc = pbc;
-                       memcpy(s_packet->data, &md, md_len);
-               }
-
-               /* Add the packet data which is a single buffer */
-               memcpy(s_packet->data + md_len, from, packet_len);
-
-               snoop_list_add_tail(s_packet, dd);
-
-               if (unlikely((snoop_flags & SNOOP_DROP_SEND) && snoop_mode)) {
-                       snoop_dbg("Dropping packet");
-                       return;
-               }
-               break;
-       default:
-               break;
-       }
-
-inline_pio_out:
-       pio_copy(dd, pbuf, pbc, from, count);
-}
diff --git a/drivers/staging/rdma/hfi1/dma.c b/drivers/staging/rdma/hfi1/dma.c
deleted file mode 100644 (file)
index 7e8dab8..0000000
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#include <linux/types.h>
-#include <linux/scatterlist.h>
-
-#include "verbs.h"
-
-#define BAD_DMA_ADDRESS ((u64)0)
-
-/*
- * The following functions implement driver specific replacements
- * for the ib_dma_*() functions.
- *
- * These functions return kernel virtual addresses instead of
- * device bus addresses since the driver uses the CPU to copy
- * data instead of using hardware DMA.
- */
-
-static int hfi1_mapping_error(struct ib_device *dev, u64 dma_addr)
-{
-       return dma_addr == BAD_DMA_ADDRESS;
-}
-
-static u64 hfi1_dma_map_single(struct ib_device *dev, void *cpu_addr,
-                              size_t size, enum dma_data_direction direction)
-{
-       if (WARN_ON(!valid_dma_direction(direction)))
-               return BAD_DMA_ADDRESS;
-
-       return (u64)cpu_addr;
-}
-
-static void hfi1_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size,
-                                 enum dma_data_direction direction)
-{
-       /* This is a stub, nothing to be done here */
-}
-
-static u64 hfi1_dma_map_page(struct ib_device *dev, struct page *page,
-                            unsigned long offset, size_t size,
-                           enum dma_data_direction direction)
-{
-       u64 addr;
-
-       if (WARN_ON(!valid_dma_direction(direction)))
-               return BAD_DMA_ADDRESS;
-
-       if (offset + size > PAGE_SIZE)
-               return BAD_DMA_ADDRESS;
-
-       addr = (u64)page_address(page);
-       if (addr)
-               addr += offset;
-
-       return addr;
-}
-
-static void hfi1_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size,
-                               enum dma_data_direction direction)
-{
-       /* This is a stub, nothing to be done here */
-}
-
-static int hfi1_map_sg(struct ib_device *dev, struct scatterlist *sgl,
-                      int nents, enum dma_data_direction direction)
-{
-       struct scatterlist *sg;
-       u64 addr;
-       int i;
-       int ret = nents;
-
-       if (WARN_ON(!valid_dma_direction(direction)))
-               return BAD_DMA_ADDRESS;
-
-       for_each_sg(sgl, sg, nents, i) {
-               addr = (u64)page_address(sg_page(sg));
-               if (!addr) {
-                       ret = 0;
-                       break;
-               }
-               sg->dma_address = addr + sg->offset;
-#ifdef CONFIG_NEED_SG_DMA_LENGTH
-               sg->dma_length = sg->length;
-#endif
-       }
-       return ret;
-}
-
-static void hfi1_unmap_sg(struct ib_device *dev,
-                         struct scatterlist *sg, int nents,
-                        enum dma_data_direction direction)
-{
-       /* This is a stub, nothing to be done here */
-}
-
-static void hfi1_sync_single_for_cpu(struct ib_device *dev, u64 addr,
-                                    size_t size, enum dma_data_direction dir)
-{
-}
-
-static void hfi1_sync_single_for_device(struct ib_device *dev, u64 addr,
-                                       size_t size,
-                                       enum dma_data_direction dir)
-{
-}
-
-static void *hfi1_dma_alloc_coherent(struct ib_device *dev, size_t size,
-                                    u64 *dma_handle, gfp_t flag)
-{
-       struct page *p;
-       void *addr = NULL;
-
-       p = alloc_pages(flag, get_order(size));
-       if (p)
-               addr = page_address(p);
-       if (dma_handle)
-               *dma_handle = (u64)addr;
-       return addr;
-}
-
-static void hfi1_dma_free_coherent(struct ib_device *dev, size_t size,
-                                  void *cpu_addr, u64 dma_handle)
-{
-       free_pages((unsigned long)cpu_addr, get_order(size));
-}
-
-struct ib_dma_mapping_ops hfi1_dma_mapping_ops = {
-       .mapping_error = hfi1_mapping_error,
-       .map_single = hfi1_dma_map_single,
-       .unmap_single = hfi1_dma_unmap_single,
-       .map_page = hfi1_dma_map_page,
-       .unmap_page = hfi1_dma_unmap_page,
-       .map_sg = hfi1_map_sg,
-       .unmap_sg = hfi1_unmap_sg,
-       .sync_single_for_cpu = hfi1_sync_single_for_cpu,
-       .sync_single_for_device = hfi1_sync_single_for_device,
-       .alloc_coherent = hfi1_dma_alloc_coherent,
-       .free_coherent = hfi1_dma_free_coherent
-};
diff --git a/drivers/staging/rdma/hfi1/driver.c b/drivers/staging/rdma/hfi1/driver.c
deleted file mode 100644 (file)
index 700c6fa..0000000
+++ /dev/null
@@ -1,1404 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/spinlock.h>
-#include <linux/pci.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-#include <linux/netdevice.h>
-#include <linux/vmalloc.h>
-#include <linux/module.h>
-#include <linux/prefetch.h>
-#include <rdma/ib_verbs.h>
-
-#include "hfi.h"
-#include "trace.h"
-#include "qp.h"
-#include "sdma.h"
-
-#undef pr_fmt
-#define pr_fmt(fmt) DRIVER_NAME ": " fmt
-
-/*
- * The size has to be longer than this string, so we can append
- * board/chip information to it in the initialization code.
- */
-const char ib_hfi1_version[] = HFI1_DRIVER_VERSION "\n";
-
-DEFINE_SPINLOCK(hfi1_devs_lock);
-LIST_HEAD(hfi1_dev_list);
-DEFINE_MUTEX(hfi1_mutex);      /* general driver use */
-
-unsigned int hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
-module_param_named(max_mtu, hfi1_max_mtu, uint, S_IRUGO);
-MODULE_PARM_DESC(max_mtu, "Set max MTU bytes, default is " __stringify(
-                HFI1_DEFAULT_MAX_MTU));
-
-unsigned int hfi1_cu = 1;
-module_param_named(cu, hfi1_cu, uint, S_IRUGO);
-MODULE_PARM_DESC(cu, "Credit return units");
-
-unsigned long hfi1_cap_mask = HFI1_CAP_MASK_DEFAULT;
-static int hfi1_caps_set(const char *, const struct kernel_param *);
-static int hfi1_caps_get(char *, const struct kernel_param *);
-static const struct kernel_param_ops cap_ops = {
-       .set = hfi1_caps_set,
-       .get = hfi1_caps_get
-};
-module_param_cb(cap_mask, &cap_ops, &hfi1_cap_mask, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(cap_mask, "Bit mask of enabled/disabled HW features");
-
-MODULE_LICENSE("Dual BSD/GPL");
-MODULE_DESCRIPTION("Intel Omni-Path Architecture driver");
-MODULE_VERSION(HFI1_DRIVER_VERSION);
-
-/*
- * MAX_PKT_RCV is the max # if packets processed per receive interrupt.
- */
-#define MAX_PKT_RECV 64
-#define EGR_HEAD_UPDATE_THRESHOLD 16
-
-struct hfi1_ib_stats hfi1_stats;
-
-static int hfi1_caps_set(const char *val, const struct kernel_param *kp)
-{
-       int ret = 0;
-       unsigned long *cap_mask_ptr = (unsigned long *)kp->arg,
-               cap_mask = *cap_mask_ptr, value, diff,
-               write_mask = ((HFI1_CAP_WRITABLE_MASK << HFI1_CAP_USER_SHIFT) |
-                             HFI1_CAP_WRITABLE_MASK);
-
-       ret = kstrtoul(val, 0, &value);
-       if (ret) {
-               pr_warn("Invalid module parameter value for 'cap_mask'\n");
-               goto done;
-       }
-       /* Get the changed bits (except the locked bit) */
-       diff = value ^ (cap_mask & ~HFI1_CAP_LOCKED_SMASK);
-
-       /* Remove any bits that are not allowed to change after driver load */
-       if (HFI1_CAP_LOCKED() && (diff & ~write_mask)) {
-               pr_warn("Ignoring non-writable capability bits %#lx\n",
-                       diff & ~write_mask);
-               diff &= write_mask;
-       }
-
-       /* Mask off any reserved bits */
-       diff &= ~HFI1_CAP_RESERVED_MASK;
-       /* Clear any previously set and changing bits */
-       cap_mask &= ~diff;
-       /* Update the bits with the new capability */
-       cap_mask |= (value & diff);
-       /* Check for any kernel/user restrictions */
-       diff = (cap_mask & (HFI1_CAP_MUST_HAVE_KERN << HFI1_CAP_USER_SHIFT)) ^
-               ((cap_mask & HFI1_CAP_MUST_HAVE_KERN) << HFI1_CAP_USER_SHIFT);
-       cap_mask &= ~diff;
-       /* Set the bitmask to the final set */
-       *cap_mask_ptr = cap_mask;
-done:
-       return ret;
-}
-
-static int hfi1_caps_get(char *buffer, const struct kernel_param *kp)
-{
-       unsigned long cap_mask = *(unsigned long *)kp->arg;
-
-       cap_mask &= ~HFI1_CAP_LOCKED_SMASK;
-       cap_mask |= ((cap_mask & HFI1_CAP_K2U) << HFI1_CAP_USER_SHIFT);
-
-       return scnprintf(buffer, PAGE_SIZE, "0x%lx", cap_mask);
-}
-
-const char *get_unit_name(int unit)
-{
-       static char iname[16];
-
-       snprintf(iname, sizeof(iname), DRIVER_NAME "_%u", unit);
-       return iname;
-}
-
-const char *get_card_name(struct rvt_dev_info *rdi)
-{
-       struct hfi1_ibdev *ibdev = container_of(rdi, struct hfi1_ibdev, rdi);
-       struct hfi1_devdata *dd = container_of(ibdev,
-                                              struct hfi1_devdata, verbs_dev);
-       return get_unit_name(dd->unit);
-}
-
-struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi)
-{
-       struct hfi1_ibdev *ibdev = container_of(rdi, struct hfi1_ibdev, rdi);
-       struct hfi1_devdata *dd = container_of(ibdev,
-                                              struct hfi1_devdata, verbs_dev);
-       return dd->pcidev;
-}
-
-/*
- * Return count of units with at least one port ACTIVE.
- */
-int hfi1_count_active_units(void)
-{
-       struct hfi1_devdata *dd;
-       struct hfi1_pportdata *ppd;
-       unsigned long flags;
-       int pidx, nunits_active = 0;
-
-       spin_lock_irqsave(&hfi1_devs_lock, flags);
-       list_for_each_entry(dd, &hfi1_dev_list, list) {
-               if (!(dd->flags & HFI1_PRESENT) || !dd->kregbase)
-                       continue;
-               for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-                       ppd = dd->pport + pidx;
-                       if (ppd->lid && ppd->linkup) {
-                               nunits_active++;
-                               break;
-                       }
-               }
-       }
-       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
-       return nunits_active;
-}
-
-/*
- * Return count of all units, optionally return in arguments
- * the number of usable (present) units, and the number of
- * ports that are up.
- */
-int hfi1_count_units(int *npresentp, int *nupp)
-{
-       int nunits = 0, npresent = 0, nup = 0;
-       struct hfi1_devdata *dd;
-       unsigned long flags;
-       int pidx;
-       struct hfi1_pportdata *ppd;
-
-       spin_lock_irqsave(&hfi1_devs_lock, flags);
-
-       list_for_each_entry(dd, &hfi1_dev_list, list) {
-               nunits++;
-               if ((dd->flags & HFI1_PRESENT) && dd->kregbase)
-                       npresent++;
-               for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-                       ppd = dd->pport + pidx;
-                       if (ppd->lid && ppd->linkup)
-                               nup++;
-               }
-       }
-
-       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
-
-       if (npresentp)
-               *npresentp = npresent;
-       if (nupp)
-               *nupp = nup;
-
-       return nunits;
-}
-
-/*
- * Get address of eager buffer from it's index (allocated in chunks, not
- * contiguous).
- */
-static inline void *get_egrbuf(const struct hfi1_ctxtdata *rcd, u64 rhf,
-                              u8 *update)
-{
-       u32 idx = rhf_egr_index(rhf), offset = rhf_egr_buf_offset(rhf);
-
-       *update |= !(idx & (rcd->egrbufs.threshold - 1)) && !offset;
-       return (void *)(((u64)(rcd->egrbufs.rcvtids[idx].addr)) +
-                       (offset * RCV_BUF_BLOCK_SIZE));
-}
-
-/*
- * Validate and encode the a given RcvArray Buffer size.
- * The function will check whether the given size falls within
- * allowed size ranges for the respective type and, optionally,
- * return the proper encoding.
- */
-inline int hfi1_rcvbuf_validate(u32 size, u8 type, u16 *encoded)
-{
-       if (unlikely(!PAGE_ALIGNED(size)))
-               return 0;
-       if (unlikely(size < MIN_EAGER_BUFFER))
-               return 0;
-       if (size >
-           (type == PT_EAGER ? MAX_EAGER_BUFFER : MAX_EXPECTED_BUFFER))
-               return 0;
-       if (encoded)
-               *encoded = ilog2(size / PAGE_SIZE) + 1;
-       return 1;
-}
-
-static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd,
-                      struct hfi1_packet *packet)
-{
-       struct hfi1_message_header *rhdr = packet->hdr;
-       u32 rte = rhf_rcv_type_err(packet->rhf);
-       int lnh = be16_to_cpu(rhdr->lrh[0]) & 3;
-       struct hfi1_ibport *ibp = &ppd->ibport_data;
-       struct hfi1_devdata *dd = ppd->dd;
-       struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
-
-       if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR))
-               return;
-
-       if (packet->rhf & RHF_TID_ERR) {
-               /* For TIDERR and RC QPs preemptively schedule a NAK */
-               struct hfi1_ib_header *hdr = (struct hfi1_ib_header *)rhdr;
-               struct hfi1_other_headers *ohdr = NULL;
-               u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */
-               u16 lid  = be16_to_cpu(hdr->lrh[1]);
-               u32 qp_num;
-               u32 rcv_flags = 0;
-
-               /* Sanity check packet */
-               if (tlen < 24)
-                       goto drop;
-
-               /* Check for GRH */
-               if (lnh == HFI1_LRH_BTH) {
-                       ohdr = &hdr->u.oth;
-               } else if (lnh == HFI1_LRH_GRH) {
-                       u32 vtf;
-
-                       ohdr = &hdr->u.l.oth;
-                       if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR)
-                               goto drop;
-                       vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow);
-                       if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
-                               goto drop;
-                       rcv_flags |= HFI1_HAS_GRH;
-               } else {
-                       goto drop;
-               }
-               /* Get the destination QP number. */
-               qp_num = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
-               if (lid < be16_to_cpu(IB_MULTICAST_LID_BASE)) {
-                       struct rvt_qp *qp;
-                       unsigned long flags;
-
-                       rcu_read_lock();
-                       qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
-                       if (!qp) {
-                               rcu_read_unlock();
-                               goto drop;
-                       }
-
-                       /*
-                        * Handle only RC QPs - for other QP types drop error
-                        * packet.
-                        */
-                       spin_lock_irqsave(&qp->r_lock, flags);
-
-                       /* Check for valid receive state. */
-                       if (!(ib_rvt_state_ops[qp->state] &
-                             RVT_PROCESS_RECV_OK)) {
-                               ibp->rvp.n_pkt_drops++;
-                       }
-
-                       switch (qp->ibqp.qp_type) {
-                       case IB_QPT_RC:
-                               hfi1_rc_hdrerr(
-                                       rcd,
-                                       hdr,
-                                       rcv_flags,
-                                       qp);
-                               break;
-                       default:
-                               /* For now don't handle any other QP types */
-                               break;
-                       }
-
-                       spin_unlock_irqrestore(&qp->r_lock, flags);
-                       rcu_read_unlock();
-               } /* Unicast QP */
-       } /* Valid packet with TIDErr */
-
-       /* handle "RcvTypeErr" flags */
-       switch (rte) {
-       case RHF_RTE_ERROR_OP_CODE_ERR:
-       {
-               u32 opcode;
-               void *ebuf = NULL;
-               __be32 *bth = NULL;
-
-               if (rhf_use_egr_bfr(packet->rhf))
-                       ebuf = packet->ebuf;
-
-               if (!ebuf)
-                       goto drop; /* this should never happen */
-
-               if (lnh == HFI1_LRH_BTH)
-                       bth = (__be32 *)ebuf;
-               else if (lnh == HFI1_LRH_GRH)
-                       bth = (__be32 *)((char *)ebuf + sizeof(struct ib_grh));
-               else
-                       goto drop;
-
-               opcode = be32_to_cpu(bth[0]) >> 24;
-               opcode &= 0xff;
-
-               if (opcode == IB_OPCODE_CNP) {
-                       /*
-                        * Only in pre-B0 h/w is the CNP_OPCODE handled
-                        * via this code path.
-                        */
-                       struct rvt_qp *qp = NULL;
-                       u32 lqpn, rqpn;
-                       u16 rlid;
-                       u8 svc_type, sl, sc5;
-
-                       sc5  = (be16_to_cpu(rhdr->lrh[0]) >> 12) & 0xf;
-                       if (rhf_dc_info(packet->rhf))
-                               sc5 |= 0x10;
-                       sl = ibp->sc_to_sl[sc5];
-
-                       lqpn = be32_to_cpu(bth[1]) & RVT_QPN_MASK;
-                       rcu_read_lock();
-                       qp = rvt_lookup_qpn(rdi, &ibp->rvp, lqpn);
-                       if (!qp) {
-                               rcu_read_unlock();
-                               goto drop;
-                       }
-
-                       switch (qp->ibqp.qp_type) {
-                       case IB_QPT_UD:
-                               rlid = 0;
-                               rqpn = 0;
-                               svc_type = IB_CC_SVCTYPE_UD;
-                               break;
-                       case IB_QPT_UC:
-                               rlid = be16_to_cpu(rhdr->lrh[3]);
-                               rqpn = qp->remote_qpn;
-                               svc_type = IB_CC_SVCTYPE_UC;
-                               break;
-                       default:
-                               goto drop;
-                       }
-
-                       process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
-                       rcu_read_unlock();
-               }
-
-               packet->rhf &= ~RHF_RCV_TYPE_ERR_SMASK;
-               break;
-       }
-       default:
-               break;
-       }
-
-drop:
-       return;
-}
-
-static inline void init_packet(struct hfi1_ctxtdata *rcd,
-                              struct hfi1_packet *packet)
-{
-       packet->rsize = rcd->rcvhdrqentsize; /* words */
-       packet->maxcnt = rcd->rcvhdrq_cnt * packet->rsize; /* words */
-       packet->rcd = rcd;
-       packet->updegr = 0;
-       packet->etail = -1;
-       packet->rhf_addr = get_rhf_addr(rcd);
-       packet->rhf = rhf_to_cpu(packet->rhf_addr);
-       packet->rhqoff = rcd->head;
-       packet->numpkt = 0;
-       packet->rcv_flags = 0;
-}
-
-static void process_ecn(struct rvt_qp *qp, struct hfi1_ib_header *hdr,
-                       struct hfi1_other_headers *ohdr,
-                       u64 rhf, u32 bth1, struct ib_grh *grh)
-{
-       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
-       u32 rqpn = 0;
-       u16 rlid;
-       u8 sc5, svc_type;
-
-       switch (qp->ibqp.qp_type) {
-       case IB_QPT_SMI:
-       case IB_QPT_GSI:
-       case IB_QPT_UD:
-               rlid = be16_to_cpu(hdr->lrh[3]);
-               rqpn = be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK;
-               svc_type = IB_CC_SVCTYPE_UD;
-               break;
-       case IB_QPT_UC:
-               rlid = qp->remote_ah_attr.dlid;
-               rqpn = qp->remote_qpn;
-               svc_type = IB_CC_SVCTYPE_UC;
-               break;
-       case IB_QPT_RC:
-               rlid = qp->remote_ah_attr.dlid;
-               rqpn = qp->remote_qpn;
-               svc_type = IB_CC_SVCTYPE_RC;
-               break;
-       default:
-               return;
-       }
-
-       sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
-       if (rhf_dc_info(rhf))
-               sc5 |= 0x10;
-
-       if (bth1 & HFI1_FECN_SMASK) {
-               u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]);
-               u16 dlid = be16_to_cpu(hdr->lrh[1]);
-
-               return_cnp(ibp, qp, rqpn, pkey, dlid, rlid, sc5, grh);
-       }
-
-       if (bth1 & HFI1_BECN_SMASK) {
-               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-               u32 lqpn = bth1 & RVT_QPN_MASK;
-               u8 sl = ibp->sc_to_sl[sc5];
-
-               process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
-       }
-}
-
-struct ps_mdata {
-       struct hfi1_ctxtdata *rcd;
-       u32 rsize;
-       u32 maxcnt;
-       u32 ps_head;
-       u32 ps_tail;
-       u32 ps_seq;
-};
-
-static inline void init_ps_mdata(struct ps_mdata *mdata,
-                                struct hfi1_packet *packet)
-{
-       struct hfi1_ctxtdata *rcd = packet->rcd;
-
-       mdata->rcd = rcd;
-       mdata->rsize = packet->rsize;
-       mdata->maxcnt = packet->maxcnt;
-       mdata->ps_head = packet->rhqoff;
-
-       if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
-               mdata->ps_tail = get_rcvhdrtail(rcd);
-               if (rcd->ctxt == HFI1_CTRL_CTXT)
-                       mdata->ps_seq = rcd->seq_cnt;
-               else
-                       mdata->ps_seq = 0; /* not used with DMA_RTAIL */
-       } else {
-               mdata->ps_tail = 0; /* used only with DMA_RTAIL*/
-               mdata->ps_seq = rcd->seq_cnt;
-       }
-}
-
-static inline int ps_done(struct ps_mdata *mdata, u64 rhf,
-                         struct hfi1_ctxtdata *rcd)
-{
-       if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL))
-               return mdata->ps_head == mdata->ps_tail;
-       return mdata->ps_seq != rhf_rcv_seq(rhf);
-}
-
-static inline int ps_skip(struct ps_mdata *mdata, u64 rhf,
-                         struct hfi1_ctxtdata *rcd)
-{
-       /*
-        * Control context can potentially receive an invalid rhf.
-        * Drop such packets.
-        */
-       if ((rcd->ctxt == HFI1_CTRL_CTXT) && (mdata->ps_head != mdata->ps_tail))
-               return mdata->ps_seq != rhf_rcv_seq(rhf);
-
-       return 0;
-}
-
-static inline void update_ps_mdata(struct ps_mdata *mdata,
-                                  struct hfi1_ctxtdata *rcd)
-{
-       mdata->ps_head += mdata->rsize;
-       if (mdata->ps_head >= mdata->maxcnt)
-               mdata->ps_head = 0;
-
-       /* Control context must do seq counting */
-       if (!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ||
-           (rcd->ctxt == HFI1_CTRL_CTXT)) {
-               if (++mdata->ps_seq > 13)
-                       mdata->ps_seq = 1;
-       }
-}
-
-/*
- * prescan_rxq - search through the receive queue looking for packets
- * containing Excplicit Congestion Notifications (FECNs, or BECNs).
- * When an ECN is found, process the Congestion Notification, and toggle
- * it off.
- * This is declared as a macro to allow quick checking of the port to avoid
- * the overhead of a function call if not enabled.
- */
-#define prescan_rxq(rcd, packet) \
-       do { \
-               if (rcd->ppd->cc_prescan) \
-                       __prescan_rxq(packet); \
-       } while (0)
-static void __prescan_rxq(struct hfi1_packet *packet)
-{
-       struct hfi1_ctxtdata *rcd = packet->rcd;
-       struct ps_mdata mdata;
-
-       init_ps_mdata(&mdata, packet);
-
-       while (1) {
-               struct hfi1_devdata *dd = rcd->dd;
-               struct hfi1_ibport *ibp = &rcd->ppd->ibport_data;
-               __le32 *rhf_addr = (__le32 *)rcd->rcvhdrq + mdata.ps_head +
-                                        dd->rhf_offset;
-               struct rvt_qp *qp;
-               struct hfi1_ib_header *hdr;
-               struct hfi1_other_headers *ohdr;
-               struct ib_grh *grh = NULL;
-               struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
-               u64 rhf = rhf_to_cpu(rhf_addr);
-               u32 etype = rhf_rcv_type(rhf), qpn, bth1;
-               int is_ecn = 0;
-               u8 lnh;
-
-               if (ps_done(&mdata, rhf, rcd))
-                       break;
-
-               if (ps_skip(&mdata, rhf, rcd))
-                       goto next;
-
-               if (etype != RHF_RCV_TYPE_IB)
-                       goto next;
-
-               hdr = (struct hfi1_ib_header *)
-                       hfi1_get_msgheader(dd, rhf_addr);
-               lnh = be16_to_cpu(hdr->lrh[0]) & 3;
-
-               if (lnh == HFI1_LRH_BTH) {
-                       ohdr = &hdr->u.oth;
-               } else if (lnh == HFI1_LRH_GRH) {
-                       ohdr = &hdr->u.l.oth;
-                       grh = &hdr->u.l.grh;
-               } else {
-                       goto next; /* just in case */
-               }
-               bth1 = be32_to_cpu(ohdr->bth[1]);
-               is_ecn = !!(bth1 & (HFI1_FECN_SMASK | HFI1_BECN_SMASK));
-
-               if (!is_ecn)
-                       goto next;
-
-               qpn = bth1 & RVT_QPN_MASK;
-               rcu_read_lock();
-               qp = rvt_lookup_qpn(rdi, &ibp->rvp, qpn);
-
-               if (!qp) {
-                       rcu_read_unlock();
-                       goto next;
-               }
-
-               process_ecn(qp, hdr, ohdr, rhf, bth1, grh);
-               rcu_read_unlock();
-
-               /* turn off BECN, FECN */
-               bth1 &= ~(HFI1_FECN_SMASK | HFI1_BECN_SMASK);
-               ohdr->bth[1] = cpu_to_be32(bth1);
-next:
-               update_ps_mdata(&mdata, rcd);
-       }
-}
-
-static inline int skip_rcv_packet(struct hfi1_packet *packet, int thread)
-{
-       int ret = RCV_PKT_OK;
-
-       /* Set up for the next packet */
-       packet->rhqoff += packet->rsize;
-       if (packet->rhqoff >= packet->maxcnt)
-               packet->rhqoff = 0;
-
-       packet->numpkt++;
-       if (unlikely((packet->numpkt & (MAX_PKT_RECV - 1)) == 0)) {
-               if (thread) {
-                       cond_resched();
-               } else {
-                       ret = RCV_PKT_LIMIT;
-                       this_cpu_inc(*packet->rcd->dd->rcv_limit);
-               }
-       }
-
-       packet->rhf_addr = (__le32 *)packet->rcd->rcvhdrq + packet->rhqoff +
-                                    packet->rcd->dd->rhf_offset;
-       packet->rhf = rhf_to_cpu(packet->rhf_addr);
-
-       return ret;
-}
-
-static inline int process_rcv_packet(struct hfi1_packet *packet, int thread)
-{
-       int ret = RCV_PKT_OK;
-
-       packet->hdr = hfi1_get_msgheader(packet->rcd->dd,
-                                        packet->rhf_addr);
-       packet->hlen = (u8 *)packet->rhf_addr - (u8 *)packet->hdr;
-       packet->etype = rhf_rcv_type(packet->rhf);
-       /* total length */
-       packet->tlen = rhf_pkt_len(packet->rhf); /* in bytes */
-       /* retrieve eager buffer details */
-       packet->ebuf = NULL;
-       if (rhf_use_egr_bfr(packet->rhf)) {
-               packet->etail = rhf_egr_index(packet->rhf);
-               packet->ebuf = get_egrbuf(packet->rcd, packet->rhf,
-                                &packet->updegr);
-               /*
-                * Prefetch the contents of the eager buffer.  It is
-                * OK to send a negative length to prefetch_range().
-                * The +2 is the size of the RHF.
-                */
-               prefetch_range(packet->ebuf,
-                              packet->tlen - ((packet->rcd->rcvhdrqentsize -
-                                              (rhf_hdrq_offset(packet->rhf)
-                                               + 2)) * 4));
-       }
-
-       /*
-        * Call a type specific handler for the packet. We
-        * should be able to trust that etype won't be beyond
-        * the range of valid indexes. If so something is really
-        * wrong and we can probably just let things come
-        * crashing down. There is no need to eat another
-        * comparison in this performance critical code.
-        */
-       packet->rcd->dd->rhf_rcv_function_map[packet->etype](packet);
-       packet->numpkt++;
-
-       /* Set up for the next packet */
-       packet->rhqoff += packet->rsize;
-       if (packet->rhqoff >= packet->maxcnt)
-               packet->rhqoff = 0;
-
-       if (unlikely((packet->numpkt & (MAX_PKT_RECV - 1)) == 0)) {
-               if (thread) {
-                       cond_resched();
-               } else {
-                       ret = RCV_PKT_LIMIT;
-                       this_cpu_inc(*packet->rcd->dd->rcv_limit);
-               }
-       }
-
-       packet->rhf_addr = (__le32 *)packet->rcd->rcvhdrq + packet->rhqoff +
-                                     packet->rcd->dd->rhf_offset;
-       packet->rhf = rhf_to_cpu(packet->rhf_addr);
-
-       return ret;
-}
-
-static inline void process_rcv_update(int last, struct hfi1_packet *packet)
-{
-       /*
-        * Update head regs etc., every 16 packets, if not last pkt,
-        * to help prevent rcvhdrq overflows, when many packets
-        * are processed and queue is nearly full.
-        * Don't request an interrupt for intermediate updates.
-        */
-       if (!last && !(packet->numpkt & 0xf)) {
-               update_usrhead(packet->rcd, packet->rhqoff, packet->updegr,
-                              packet->etail, 0, 0);
-               packet->updegr = 0;
-       }
-       packet->rcv_flags = 0;
-}
-
-static inline void finish_packet(struct hfi1_packet *packet)
-{
-       /*
-        * Nothing we need to free for the packet.
-        *
-        * The only thing we need to do is a final update and call for an
-        * interrupt
-        */
-       update_usrhead(packet->rcd, packet->rcd->head, packet->updegr,
-                      packet->etail, rcv_intr_dynamic, packet->numpkt);
-}
-
-static inline void process_rcv_qp_work(struct hfi1_packet *packet)
-{
-       struct hfi1_ctxtdata *rcd;
-       struct rvt_qp *qp, *nqp;
-
-       rcd = packet->rcd;
-       rcd->head = packet->rhqoff;
-
-       /*
-        * Iterate over all QPs waiting to respond.
-        * The list won't change since the IRQ is only run on one CPU.
-        */
-       list_for_each_entry_safe(qp, nqp, &rcd->qp_wait_list, rspwait) {
-               list_del_init(&qp->rspwait);
-               if (qp->r_flags & RVT_R_RSP_NAK) {
-                       qp->r_flags &= ~RVT_R_RSP_NAK;
-                       hfi1_send_rc_ack(rcd, qp, 0);
-               }
-               if (qp->r_flags & RVT_R_RSP_SEND) {
-                       unsigned long flags;
-
-                       qp->r_flags &= ~RVT_R_RSP_SEND;
-                       spin_lock_irqsave(&qp->s_lock, flags);
-                       if (ib_rvt_state_ops[qp->state] &
-                                       RVT_PROCESS_OR_FLUSH_SEND)
-                               hfi1_schedule_send(qp);
-                       spin_unlock_irqrestore(&qp->s_lock, flags);
-               }
-               if (atomic_dec_and_test(&qp->refcount))
-                       wake_up(&qp->wait);
-       }
-}
-
-/*
- * Handle receive interrupts when using the no dma rtail option.
- */
-int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd, int thread)
-{
-       u32 seq;
-       int last = RCV_PKT_OK;
-       struct hfi1_packet packet;
-
-       init_packet(rcd, &packet);
-       seq = rhf_rcv_seq(packet.rhf);
-       if (seq != rcd->seq_cnt) {
-               last = RCV_PKT_DONE;
-               goto bail;
-       }
-
-       prescan_rxq(rcd, &packet);
-
-       while (last == RCV_PKT_OK) {
-               last = process_rcv_packet(&packet, thread);
-               seq = rhf_rcv_seq(packet.rhf);
-               if (++rcd->seq_cnt > 13)
-                       rcd->seq_cnt = 1;
-               if (seq != rcd->seq_cnt)
-                       last = RCV_PKT_DONE;
-               process_rcv_update(last, &packet);
-       }
-       process_rcv_qp_work(&packet);
-bail:
-       finish_packet(&packet);
-       return last;
-}
-
-int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd, int thread)
-{
-       u32 hdrqtail;
-       int last = RCV_PKT_OK;
-       struct hfi1_packet packet;
-
-       init_packet(rcd, &packet);
-       hdrqtail = get_rcvhdrtail(rcd);
-       if (packet.rhqoff == hdrqtail) {
-               last = RCV_PKT_DONE;
-               goto bail;
-       }
-       smp_rmb();  /* prevent speculative reads of dma'ed hdrq */
-
-       prescan_rxq(rcd, &packet);
-
-       while (last == RCV_PKT_OK) {
-               last = process_rcv_packet(&packet, thread);
-               if (packet.rhqoff == hdrqtail)
-                       last = RCV_PKT_DONE;
-               process_rcv_update(last, &packet);
-       }
-       process_rcv_qp_work(&packet);
-bail:
-       finish_packet(&packet);
-       return last;
-}
-
-static inline void set_all_nodma_rtail(struct hfi1_devdata *dd)
-{
-       int i;
-
-       for (i = HFI1_CTRL_CTXT + 1; i < dd->first_user_ctxt; i++)
-               dd->rcd[i]->do_interrupt =
-                       &handle_receive_interrupt_nodma_rtail;
-}
-
-static inline void set_all_dma_rtail(struct hfi1_devdata *dd)
-{
-       int i;
-
-       for (i = HFI1_CTRL_CTXT + 1; i < dd->first_user_ctxt; i++)
-               dd->rcd[i]->do_interrupt =
-                       &handle_receive_interrupt_dma_rtail;
-}
-
-void set_all_slowpath(struct hfi1_devdata *dd)
-{
-       int i;
-
-       /* HFI1_CTRL_CTXT must always use the slow path interrupt handler */
-       for (i = HFI1_CTRL_CTXT + 1; i < dd->first_user_ctxt; i++)
-               dd->rcd[i]->do_interrupt = &handle_receive_interrupt;
-}
-
-static inline int set_armed_to_active(struct hfi1_ctxtdata *rcd,
-                                     struct hfi1_packet packet,
-                                     struct hfi1_devdata *dd)
-{
-       struct work_struct *lsaw = &rcd->ppd->linkstate_active_work;
-       struct hfi1_message_header *hdr = hfi1_get_msgheader(packet.rcd->dd,
-                                                            packet.rhf_addr);
-
-       if (hdr2sc(hdr, packet.rhf) != 0xf) {
-               int hwstate = read_logical_state(dd);
-
-               if (hwstate != LSTATE_ACTIVE) {
-                       dd_dev_info(dd, "Unexpected link state %d\n", hwstate);
-                       return 0;
-               }
-
-               queue_work(rcd->ppd->hfi1_wq, lsaw);
-               return 1;
-       }
-       return 0;
-}
-
-/*
- * handle_receive_interrupt - receive a packet
- * @rcd: the context
- *
- * Called from interrupt handler for errors or receive interrupt.
- * This is the slow path interrupt handler.
- */
-int handle_receive_interrupt(struct hfi1_ctxtdata *rcd, int thread)
-{
-       struct hfi1_devdata *dd = rcd->dd;
-       u32 hdrqtail;
-       int needset, last = RCV_PKT_OK;
-       struct hfi1_packet packet;
-       int skip_pkt = 0;
-
-       /* Control context will always use the slow path interrupt handler */
-       needset = (rcd->ctxt == HFI1_CTRL_CTXT) ? 0 : 1;
-
-       init_packet(rcd, &packet);
-
-       if (!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
-               u32 seq = rhf_rcv_seq(packet.rhf);
-
-               if (seq != rcd->seq_cnt) {
-                       last = RCV_PKT_DONE;
-                       goto bail;
-               }
-               hdrqtail = 0;
-       } else {
-               hdrqtail = get_rcvhdrtail(rcd);
-               if (packet.rhqoff == hdrqtail) {
-                       last = RCV_PKT_DONE;
-                       goto bail;
-               }
-               smp_rmb();  /* prevent speculative reads of dma'ed hdrq */
-
-               /*
-                * Control context can potentially receive an invalid
-                * rhf. Drop such packets.
-                */
-               if (rcd->ctxt == HFI1_CTRL_CTXT) {
-                       u32 seq = rhf_rcv_seq(packet.rhf);
-
-                       if (seq != rcd->seq_cnt)
-                               skip_pkt = 1;
-               }
-       }
-
-       prescan_rxq(rcd, &packet);
-
-       while (last == RCV_PKT_OK) {
-               if (unlikely(dd->do_drop &&
-                            atomic_xchg(&dd->drop_packet, DROP_PACKET_OFF) ==
-                            DROP_PACKET_ON)) {
-                       dd->do_drop = 0;
-
-                       /* On to the next packet */
-                       packet.rhqoff += packet.rsize;
-                       packet.rhf_addr = (__le32 *)rcd->rcvhdrq +
-                                         packet.rhqoff +
-                                         dd->rhf_offset;
-                       packet.rhf = rhf_to_cpu(packet.rhf_addr);
-
-               } else if (skip_pkt) {
-                       last = skip_rcv_packet(&packet, thread);
-                       skip_pkt = 0;
-               } else {
-                       /* Auto activate link on non-SC15 packet receive */
-                       if (unlikely(rcd->ppd->host_link_state ==
-                                    HLS_UP_ARMED) &&
-                           set_armed_to_active(rcd, packet, dd))
-                               goto bail;
-                       last = process_rcv_packet(&packet, thread);
-               }
-
-               if (!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
-                       u32 seq = rhf_rcv_seq(packet.rhf);
-
-                       if (++rcd->seq_cnt > 13)
-                               rcd->seq_cnt = 1;
-                       if (seq != rcd->seq_cnt)
-                               last = RCV_PKT_DONE;
-                       if (needset) {
-                               dd_dev_info(dd, "Switching to NO_DMA_RTAIL\n");
-                               set_all_nodma_rtail(dd);
-                               needset = 0;
-                       }
-               } else {
-                       if (packet.rhqoff == hdrqtail)
-                               last = RCV_PKT_DONE;
-                       /*
-                        * Control context can potentially receive an invalid
-                        * rhf. Drop such packets.
-                        */
-                       if (rcd->ctxt == HFI1_CTRL_CTXT) {
-                               u32 seq = rhf_rcv_seq(packet.rhf);
-
-                               if (++rcd->seq_cnt > 13)
-                                       rcd->seq_cnt = 1;
-                               if (!last && (seq != rcd->seq_cnt))
-                                       skip_pkt = 1;
-                       }
-
-                       if (needset) {
-                               dd_dev_info(dd,
-                                           "Switching to DMA_RTAIL\n");
-                               set_all_dma_rtail(dd);
-                               needset = 0;
-                       }
-               }
-
-               process_rcv_update(last, &packet);
-       }
-
-       process_rcv_qp_work(&packet);
-
-bail:
-       /*
-        * Always write head at end, and setup rcv interrupt, even
-        * if no packets were processed.
-        */
-       finish_packet(&packet);
-       return last;
-}
-
-/*
- * We may discover in the interrupt that the hardware link state has
- * changed from ARMED to ACTIVE (due to the arrival of a non-SC15 packet),
- * and we need to update the driver's notion of the link state.  We cannot
- * run set_link_state from interrupt context, so we queue this function on
- * a workqueue.
- *
- * We delay the regular interrupt processing until after the state changes
- * so that the link will be in the correct state by the time any application
- * we wake up attempts to send a reply to any message it received.
- * (Subsequent receive interrupts may possibly force the wakeup before we
- * update the link state.)
- *
- * The rcd is freed in hfi1_free_ctxtdata after hfi1_postinit_cleanup invokes
- * dd->f_cleanup(dd) to disable the interrupt handler and flush workqueues,
- * so we're safe from use-after-free of the rcd.
- */
-void receive_interrupt_work(struct work_struct *work)
-{
-       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
-                                                 linkstate_active_work);
-       struct hfi1_devdata *dd = ppd->dd;
-       int i;
-
-       /* Received non-SC15 packet implies neighbor_normal */
-       ppd->neighbor_normal = 1;
-       set_link_state(ppd, HLS_UP_ACTIVE);
-
-       /*
-        * Interrupt all kernel contexts that could have had an
-        * interrupt during auto activation.
-        */
-       for (i = HFI1_CTRL_CTXT; i < dd->first_user_ctxt; i++)
-               force_recv_intr(dd->rcd[i]);
-}
-
-/*
- * Convert a given MTU size to the on-wire MAD packet enumeration.
- * Return -1 if the size is invalid.
- */
-int mtu_to_enum(u32 mtu, int default_if_bad)
-{
-       switch (mtu) {
-       case     0: return OPA_MTU_0;
-       case   256: return OPA_MTU_256;
-       case   512: return OPA_MTU_512;
-       case  1024: return OPA_MTU_1024;
-       case  2048: return OPA_MTU_2048;
-       case  4096: return OPA_MTU_4096;
-       case  8192: return OPA_MTU_8192;
-       case 10240: return OPA_MTU_10240;
-       }
-       return default_if_bad;
-}
-
-u16 enum_to_mtu(int mtu)
-{
-       switch (mtu) {
-       case OPA_MTU_0:     return 0;
-       case OPA_MTU_256:   return 256;
-       case OPA_MTU_512:   return 512;
-       case OPA_MTU_1024:  return 1024;
-       case OPA_MTU_2048:  return 2048;
-       case OPA_MTU_4096:  return 4096;
-       case OPA_MTU_8192:  return 8192;
-       case OPA_MTU_10240: return 10240;
-       default: return 0xffff;
-       }
-}
-
-/*
- * set_mtu - set the MTU
- * @ppd: the per port data
- *
- * We can handle "any" incoming size, the issue here is whether we
- * need to restrict our outgoing size.  We do not deal with what happens
- * to programs that are already running when the size changes.
- */
-int set_mtu(struct hfi1_pportdata *ppd)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       int i, drain, ret = 0, is_up = 0;
-
-       ppd->ibmtu = 0;
-       for (i = 0; i < ppd->vls_supported; i++)
-               if (ppd->ibmtu < dd->vld[i].mtu)
-                       ppd->ibmtu = dd->vld[i].mtu;
-       ppd->ibmaxlen = ppd->ibmtu + lrh_max_header_bytes(ppd->dd);
-
-       mutex_lock(&ppd->hls_lock);
-       if (ppd->host_link_state == HLS_UP_INIT ||
-           ppd->host_link_state == HLS_UP_ARMED ||
-           ppd->host_link_state == HLS_UP_ACTIVE)
-               is_up = 1;
-
-       drain = !is_ax(dd) && is_up;
-
-       if (drain)
-               /*
-                * MTU is specified per-VL. To ensure that no packet gets
-                * stuck (due, e.g., to the MTU for the packet's VL being
-                * reduced), empty the per-VL FIFOs before adjusting MTU.
-                */
-               ret = stop_drain_data_vls(dd);
-
-       if (ret) {
-               dd_dev_err(dd, "%s: cannot stop/drain VLs - refusing to change per-VL MTUs\n",
-                          __func__);
-               goto err;
-       }
-
-       hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_MTU, 0);
-
-       if (drain)
-               open_fill_data_vls(dd); /* reopen all VLs */
-
-err:
-       mutex_unlock(&ppd->hls_lock);
-
-       return ret;
-}
-
-int hfi1_set_lid(struct hfi1_pportdata *ppd, u32 lid, u8 lmc)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-
-       ppd->lid = lid;
-       ppd->lmc = lmc;
-       hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LIDLMC, 0);
-
-       dd_dev_info(dd, "IB%u:%u got a lid: 0x%x\n", dd->unit, ppd->port, lid);
-
-       return 0;
-}
-
-void shutdown_led_override(struct hfi1_pportdata *ppd)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-
-       /*
-        * This pairs with the memory barrier in hfi1_start_led_override to
-        * ensure that we read the correct state of LED beaconing represented
-        * by led_override_timer_active
-        */
-       smp_rmb();
-       if (atomic_read(&ppd->led_override_timer_active)) {
-               del_timer_sync(&ppd->led_override_timer);
-               atomic_set(&ppd->led_override_timer_active, 0);
-               /* Ensure the atomic_set is visible to all CPUs */
-               smp_wmb();
-       }
-
-       /* Hand control of the LED to the DC for normal operation */
-       write_csr(dd, DCC_CFG_LED_CNTRL, 0);
-}
-
-static void run_led_override(unsigned long opaque)
-{
-       struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)opaque;
-       struct hfi1_devdata *dd = ppd->dd;
-       unsigned long timeout;
-       int phase_idx;
-
-       if (!(dd->flags & HFI1_INITTED))
-               return;
-
-       phase_idx = ppd->led_override_phase & 1;
-
-       setextled(dd, phase_idx);
-
-       timeout = ppd->led_override_vals[phase_idx];
-
-       /* Set up for next phase */
-       ppd->led_override_phase = !ppd->led_override_phase;
-
-       mod_timer(&ppd->led_override_timer, jiffies + timeout);
-}
-
-/*
- * To have the LED blink in a particular pattern, provide timeon and timeoff
- * in milliseconds.
- * To turn off custom blinking and return to normal operation, use
- * shutdown_led_override()
- */
-void hfi1_start_led_override(struct hfi1_pportdata *ppd, unsigned int timeon,
-                            unsigned int timeoff)
-{
-       if (!(ppd->dd->flags & HFI1_INITTED))
-               return;
-
-       /* Convert to jiffies for direct use in timer */
-       ppd->led_override_vals[0] = msecs_to_jiffies(timeoff);
-       ppd->led_override_vals[1] = msecs_to_jiffies(timeon);
-
-       /* Arbitrarily start from LED on phase */
-       ppd->led_override_phase = 1;
-
-       /*
-        * If the timer has not already been started, do so. Use a "quick"
-        * timeout so the handler will be called soon to look at our request.
-        */
-       if (!timer_pending(&ppd->led_override_timer)) {
-               setup_timer(&ppd->led_override_timer, run_led_override,
-                           (unsigned long)ppd);
-               ppd->led_override_timer.expires = jiffies + 1;
-               add_timer(&ppd->led_override_timer);
-               atomic_set(&ppd->led_override_timer_active, 1);
-               /* Ensure the atomic_set is visible to all CPUs */
-               smp_wmb();
-       }
-}
-
-/**
- * hfi1_reset_device - reset the chip if possible
- * @unit: the device to reset
- *
- * Whether or not reset is successful, we attempt to re-initialize the chip
- * (that is, much like a driver unload/reload).  We clear the INITTED flag
- * so that the various entry points will fail until we reinitialize.  For
- * now, we only allow this if no user contexts are open that use chip resources
- */
-int hfi1_reset_device(int unit)
-{
-       int ret, i;
-       struct hfi1_devdata *dd = hfi1_lookup(unit);
-       struct hfi1_pportdata *ppd;
-       unsigned long flags;
-       int pidx;
-
-       if (!dd) {
-               ret = -ENODEV;
-               goto bail;
-       }
-
-       dd_dev_info(dd, "Reset on unit %u requested\n", unit);
-
-       if (!dd->kregbase || !(dd->flags & HFI1_PRESENT)) {
-               dd_dev_info(dd,
-                           "Invalid unit number %u or not initialized or not present\n",
-                           unit);
-               ret = -ENXIO;
-               goto bail;
-       }
-
-       spin_lock_irqsave(&dd->uctxt_lock, flags);
-       if (dd->rcd)
-               for (i = dd->first_user_ctxt; i < dd->num_rcv_contexts; i++) {
-                       if (!dd->rcd[i] || !dd->rcd[i]->cnt)
-                               continue;
-                       spin_unlock_irqrestore(&dd->uctxt_lock, flags);
-                       ret = -EBUSY;
-                       goto bail;
-               }
-       spin_unlock_irqrestore(&dd->uctxt_lock, flags);
-
-       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-               ppd = dd->pport + pidx;
-
-               shutdown_led_override(ppd);
-       }
-       if (dd->flags & HFI1_HAS_SEND_DMA)
-               sdma_exit(dd);
-
-       hfi1_reset_cpu_counters(dd);
-
-       ret = hfi1_init(dd, 1);
-
-       if (ret)
-               dd_dev_err(dd,
-                          "Reinitialize unit %u after reset failed with %d\n",
-                          unit, ret);
-       else
-               dd_dev_info(dd, "Reinitialized unit %u after resetting\n",
-                           unit);
-
-bail:
-       return ret;
-}
-
-void handle_eflags(struct hfi1_packet *packet)
-{
-       struct hfi1_ctxtdata *rcd = packet->rcd;
-       u32 rte = rhf_rcv_type_err(packet->rhf);
-
-       rcv_hdrerr(rcd, rcd->ppd, packet);
-       if (rhf_err_flags(packet->rhf))
-               dd_dev_err(rcd->dd,
-                          "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n",
-                          rcd->ctxt, packet->rhf,
-                          packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "",
-                          packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "",
-                          packet->rhf & RHF_DC_ERR ? "dc " : "",
-                          packet->rhf & RHF_TID_ERR ? "tid " : "",
-                          packet->rhf & RHF_LEN_ERR ? "len " : "",
-                          packet->rhf & RHF_ECC_ERR ? "ecc " : "",
-                          packet->rhf & RHF_VCRC_ERR ? "vcrc " : "",
-                          packet->rhf & RHF_ICRC_ERR ? "icrc " : "",
-                          rte);
-}
-
-/*
- * The following functions are called by the interrupt handler. They are type
- * specific handlers for each packet type.
- */
-int process_receive_ib(struct hfi1_packet *packet)
-{
-       trace_hfi1_rcvhdr(packet->rcd->ppd->dd,
-                         packet->rcd->ctxt,
-                         rhf_err_flags(packet->rhf),
-                         RHF_RCV_TYPE_IB,
-                         packet->hlen,
-                         packet->tlen,
-                         packet->updegr,
-                         rhf_egr_index(packet->rhf));
-
-       if (unlikely(rhf_err_flags(packet->rhf))) {
-               handle_eflags(packet);
-               return RHF_RCV_CONTINUE;
-       }
-
-       hfi1_ib_rcv(packet);
-       return RHF_RCV_CONTINUE;
-}
-
-int process_receive_bypass(struct hfi1_packet *packet)
-{
-       if (unlikely(rhf_err_flags(packet->rhf)))
-               handle_eflags(packet);
-
-       dd_dev_err(packet->rcd->dd,
-                  "Bypass packets are not supported in normal operation. Dropping\n");
-       return RHF_RCV_CONTINUE;
-}
-
-int process_receive_error(struct hfi1_packet *packet)
-{
-       handle_eflags(packet);
-
-       if (unlikely(rhf_err_flags(packet->rhf)))
-               dd_dev_err(packet->rcd->dd,
-                          "Unhandled error packet received. Dropping.\n");
-
-       return RHF_RCV_CONTINUE;
-}
-
-int kdeth_process_expected(struct hfi1_packet *packet)
-{
-       if (unlikely(rhf_err_flags(packet->rhf)))
-               handle_eflags(packet);
-
-       dd_dev_err(packet->rcd->dd,
-                  "Unhandled expected packet received. Dropping.\n");
-       return RHF_RCV_CONTINUE;
-}
-
-int kdeth_process_eager(struct hfi1_packet *packet)
-{
-       if (unlikely(rhf_err_flags(packet->rhf)))
-               handle_eflags(packet);
-
-       dd_dev_err(packet->rcd->dd,
-                  "Unhandled eager packet received. Dropping.\n");
-       return RHF_RCV_CONTINUE;
-}
-
-int process_receive_invalid(struct hfi1_packet *packet)
-{
-       dd_dev_err(packet->rcd->dd, "Invalid packet type %d. Dropping\n",
-                  rhf_rcv_type(packet->rhf));
-       return RHF_RCV_CONTINUE;
-}
diff --git a/drivers/staging/rdma/hfi1/efivar.c b/drivers/staging/rdma/hfi1/efivar.c
deleted file mode 100644 (file)
index 106349f..0000000
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "efivar.h"
-
-/* GUID for HFI1 variables in EFI */
-#define HFI1_EFIVAR_GUID EFI_GUID(0xc50a953e, 0xa8b2, 0x42a6, \
-               0xbf, 0x89, 0xd3, 0x33, 0xa6, 0xe9, 0xe6, 0xd4)
-/* largest EFI data size we expect */
-#define EFI_DATA_SIZE 4096
-
-/*
- * Read the named EFI variable.  Return the size of the actual data in *size
- * and a kmalloc'ed buffer in *return_data.  The caller must free the
- * data.  It is guaranteed that *return_data will be NULL and *size = 0
- * if this routine fails.
- *
- * Return 0 on success, -errno on failure.
- */
-static int read_efi_var(const char *name, unsigned long *size,
-                       void **return_data)
-{
-       efi_status_t status;
-       efi_char16_t *uni_name;
-       efi_guid_t guid;
-       unsigned long temp_size;
-       void *temp_buffer;
-       void *data;
-       int i;
-       int ret;
-
-       /* set failure return values */
-       *size = 0;
-       *return_data = NULL;
-
-       if (!efi_enabled(EFI_RUNTIME_SERVICES))
-               return -EOPNOTSUPP;
-
-       uni_name = kcalloc(strlen(name) + 1, sizeof(efi_char16_t), GFP_KERNEL);
-       temp_buffer = kzalloc(EFI_DATA_SIZE, GFP_KERNEL);
-
-       if (!uni_name || !temp_buffer) {
-               ret = -ENOMEM;
-               goto fail;
-       }
-
-       /* input: the size of the buffer */
-       temp_size = EFI_DATA_SIZE;
-
-       /* convert ASCII to unicode - it is a 1:1 mapping */
-       for (i = 0; name[i]; i++)
-               uni_name[i] = name[i];
-
-       /* need a variable for our GUID */
-       guid = HFI1_EFIVAR_GUID;
-
-       /* call into EFI runtime services */
-       status = efi.get_variable(
-                       uni_name,
-                       &guid,
-                       NULL,
-                       &temp_size,
-                       temp_buffer);
-
-       /*
-        * It would be nice to call efi_status_to_err() here, but that
-        * is in the EFIVAR_FS code and may not be compiled in.
-        * However, even that is insufficient since it does not cover
-        * EFI_BUFFER_TOO_SMALL which could be an important return.
-        * For now, just split out succces or not found.
-        */
-       ret = status == EFI_SUCCESS   ? 0 :
-             status == EFI_NOT_FOUND ? -ENOENT :
-                                       -EINVAL;
-       if (ret)
-               goto fail;
-
-       /*
-        * We have successfully read the EFI variable into our
-        * temporary buffer.  Now allocate a correctly sized
-        * buffer.
-        */
-       data = kmemdup(temp_buffer, temp_size, GFP_KERNEL);
-       if (!data) {
-               ret = -ENOMEM;
-               goto fail;
-       }
-
-       *size = temp_size;
-       *return_data = data;
-
-fail:
-       kfree(uni_name);
-       kfree(temp_buffer);
-
-       return ret;
-}
-
-/*
- * Read an HFI1 EFI variable of the form:
- *     <PCIe address>-<kind>
- * Return an kalloc'ed array and size of the data.
- *
- * Returns 0 on success, -errno on failure.
- */
-int read_hfi1_efi_var(struct hfi1_devdata *dd, const char *kind,
-                     unsigned long *size, void **return_data)
-{
-       char name[64];
-
-       /* create a common prefix */
-       snprintf(name, sizeof(name), "%04x:%02x:%02x.%x-%s",
-                pci_domain_nr(dd->pcidev->bus),
-                dd->pcidev->bus->number,
-                PCI_SLOT(dd->pcidev->devfn),
-                PCI_FUNC(dd->pcidev->devfn),
-                kind);
-
-       return read_efi_var(name, size, return_data);
-}
diff --git a/drivers/staging/rdma/hfi1/efivar.h b/drivers/staging/rdma/hfi1/efivar.h
deleted file mode 100644 (file)
index 94e9e70..0000000
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#ifndef _HFI1_EFIVAR_H
-#define _HFI1_EFIVAR_H
-
-#include <linux/efi.h>
-
-#include "hfi.h"
-
-int read_hfi1_efi_var(struct hfi1_devdata *dd, const char *kind,
-                     unsigned long *size, void **return_data);
-
-#endif /* _HFI1_EFIVAR_H */
diff --git a/drivers/staging/rdma/hfi1/eprom.c b/drivers/staging/rdma/hfi1/eprom.c
deleted file mode 100644 (file)
index bd87715..0000000
+++ /dev/null
@@ -1,471 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#include <linux/delay.h>
-#include "hfi.h"
-#include "common.h"
-#include "eprom.h"
-
-/*
- * The EPROM is logically divided into three partitions:
- *     partition 0: the first 128K, visible from PCI ROM BAR
- *     partition 1: 4K config file (sector size)
- *     partition 2: the rest
- */
-#define P0_SIZE (128 * 1024)
-#define P1_SIZE   (4 * 1024)
-#define P1_START P0_SIZE
-#define P2_START (P0_SIZE + P1_SIZE)
-
-/* erase sizes supported by the controller */
-#define SIZE_4KB (4 * 1024)
-#define MASK_4KB (SIZE_4KB - 1)
-
-#define SIZE_32KB (32 * 1024)
-#define MASK_32KB (SIZE_32KB - 1)
-
-#define SIZE_64KB (64 * 1024)
-#define MASK_64KB (SIZE_64KB - 1)
-
-/* controller page size, in bytes */
-#define EP_PAGE_SIZE 256
-#define EEP_PAGE_MASK (EP_PAGE_SIZE - 1)
-
-/* controller commands */
-#define CMD_SHIFT 24
-#define CMD_NOP                            (0)
-#define CMD_PAGE_PROGRAM(addr)     ((0x02 << CMD_SHIFT) | addr)
-#define CMD_READ_DATA(addr)        ((0x03 << CMD_SHIFT) | addr)
-#define CMD_READ_SR1               ((0x05 << CMD_SHIFT))
-#define CMD_WRITE_ENABLE           ((0x06 << CMD_SHIFT))
-#define CMD_SECTOR_ERASE_4KB(addr)  ((0x20 << CMD_SHIFT) | addr)
-#define CMD_SECTOR_ERASE_32KB(addr) ((0x52 << CMD_SHIFT) | addr)
-#define CMD_CHIP_ERASE             ((0x60 << CMD_SHIFT))
-#define CMD_READ_MANUF_DEV_ID      ((0x90 << CMD_SHIFT))
-#define CMD_RELEASE_POWERDOWN_NOID  ((0xab << CMD_SHIFT))
-#define CMD_SECTOR_ERASE_64KB(addr) ((0xd8 << CMD_SHIFT) | addr)
-
-/* controller interface speeds */
-#define EP_SPEED_FULL 0x2      /* full speed */
-
-/* controller status register 1 bits */
-#define SR1_BUSY 0x1ull                /* the BUSY bit in SR1 */
-
-/* sleep length while waiting for controller */
-#define WAIT_SLEEP_US 100      /* must be larger than 5 (see usage) */
-#define COUNT_DELAY_SEC(n) ((n) * (1000000 / WAIT_SLEEP_US))
-
-/* GPIO pins */
-#define EPROM_WP_N BIT_ULL(14) /* EPROM write line */
-
-/*
- * How long to wait for the EPROM to become available, in ms.
- * The spec 32 Mb EPROM takes around 40s to erase then write.
- * Double it for safety.
- */
-#define EPROM_TIMEOUT 80000 /* ms */
-
-/*
- * Turn on external enable line that allows writing on the flash.
- */
-static void write_enable(struct hfi1_devdata *dd)
-{
-       /* raise signal */
-       write_csr(dd, ASIC_GPIO_OUT, read_csr(dd, ASIC_GPIO_OUT) | EPROM_WP_N);
-       /* raise enable */
-       write_csr(dd, ASIC_GPIO_OE, read_csr(dd, ASIC_GPIO_OE) | EPROM_WP_N);
-}
-
-/*
- * Turn off external enable line that allows writing on the flash.
- */
-static void write_disable(struct hfi1_devdata *dd)
-{
-       /* lower signal */
-       write_csr(dd, ASIC_GPIO_OUT, read_csr(dd, ASIC_GPIO_OUT) & ~EPROM_WP_N);
-       /* lower enable */
-       write_csr(dd, ASIC_GPIO_OE, read_csr(dd, ASIC_GPIO_OE) & ~EPROM_WP_N);
-}
-
-/*
- * Wait for the device to become not busy.  Must be called after all
- * write or erase operations.
- */
-static int wait_for_not_busy(struct hfi1_devdata *dd)
-{
-       unsigned long count = 0;
-       u64 reg;
-       int ret = 0;
-
-       /* starts page mode */
-       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_SR1);
-       while (1) {
-               udelay(WAIT_SLEEP_US);
-               usleep_range(WAIT_SLEEP_US - 5, WAIT_SLEEP_US + 5);
-               count++;
-               reg = read_csr(dd, ASIC_EEP_DATA);
-               if ((reg & SR1_BUSY) == 0)
-                       break;
-               /* 200s is the largest time for a 128Mb device */
-               if (count > COUNT_DELAY_SEC(200)) {
-                       dd_dev_err(dd, "waited too long for SPI FLASH busy to clear - failing\n");
-                       ret = -ETIMEDOUT;
-                       break; /* break, not goto - must stop page mode */
-               }
-       }
-
-       /* stop page mode with a NOP */
-       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_NOP);
-
-       return ret;
-}
-
-/*
- * Read the device ID from the SPI controller.
- */
-static u32 read_device_id(struct hfi1_devdata *dd)
-{
-       /* read the Manufacture Device ID */
-       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_MANUF_DEV_ID);
-       return (u32)read_csr(dd, ASIC_EEP_DATA);
-}
-
-/*
- * Erase the whole flash.
- */
-static int erase_chip(struct hfi1_devdata *dd)
-{
-       int ret;
-
-       write_enable(dd);
-
-       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_WRITE_ENABLE);
-       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_CHIP_ERASE);
-       ret = wait_for_not_busy(dd);
-
-       write_disable(dd);
-
-       return ret;
-}
-
-/*
- * Erase a range.
- */
-static int erase_range(struct hfi1_devdata *dd, u32 start, u32 len)
-{
-       u32 end = start + len;
-       int ret = 0;
-
-       if (end < start)
-               return -EINVAL;
-
-       /* check the end points for the minimum erase */
-       if ((start & MASK_4KB) || (end & MASK_4KB)) {
-               dd_dev_err(dd,
-                          "%s: non-aligned range (0x%x,0x%x) for a 4KB erase\n",
-                          __func__, start, end);
-               return -EINVAL;
-       }
-
-       write_enable(dd);
-
-       while (start < end) {
-               write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_WRITE_ENABLE);
-               /* check in order of largest to smallest */
-               if (((start & MASK_64KB) == 0) && (start + SIZE_64KB <= end)) {
-                       write_csr(dd, ASIC_EEP_ADDR_CMD,
-                                 CMD_SECTOR_ERASE_64KB(start));
-                       start += SIZE_64KB;
-               } else if (((start & MASK_32KB) == 0) &&
-                          (start + SIZE_32KB <= end)) {
-                       write_csr(dd, ASIC_EEP_ADDR_CMD,
-                                 CMD_SECTOR_ERASE_32KB(start));
-                       start += SIZE_32KB;
-               } else {        /* 4KB will work */
-                       write_csr(dd, ASIC_EEP_ADDR_CMD,
-                                 CMD_SECTOR_ERASE_4KB(start));
-                       start += SIZE_4KB;
-               }
-               ret = wait_for_not_busy(dd);
-               if (ret)
-                       goto done;
-       }
-
-done:
-       write_disable(dd);
-
-       return ret;
-}
-
-/*
- * Read a 256 byte (64 dword) EPROM page.
- * All callers have verified the offset is at a page boundary.
- */
-static void read_page(struct hfi1_devdata *dd, u32 offset, u32 *result)
-{
-       int i;
-
-       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_DATA(offset));
-       for (i = 0; i < EP_PAGE_SIZE / sizeof(u32); i++)
-               result[i] = (u32)read_csr(dd, ASIC_EEP_DATA);
-       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_NOP); /* close open page */
-}
-
-/*
- * Read length bytes starting at offset.  Copy to user address addr.
- */
-static int read_length(struct hfi1_devdata *dd, u32 start, u32 len, u64 addr)
-{
-       u32 offset;
-       u32 buffer[EP_PAGE_SIZE / sizeof(u32)];
-       int ret = 0;
-
-       /* reject anything not on an EPROM page boundary */
-       if ((start & EEP_PAGE_MASK) || (len & EEP_PAGE_MASK))
-               return -EINVAL;
-
-       for (offset = 0; offset < len; offset += EP_PAGE_SIZE) {
-               read_page(dd, start + offset, buffer);
-               if (copy_to_user((void __user *)(addr + offset),
-                                buffer, EP_PAGE_SIZE)) {
-                       ret = -EFAULT;
-                       goto done;
-               }
-       }
-
-done:
-       return ret;
-}
-
-/*
- * Write a 256 byte (64 dword) EPROM page.
- * All callers have verified the offset is at a page boundary.
- */
-static int write_page(struct hfi1_devdata *dd, u32 offset, u32 *data)
-{
-       int i;
-
-       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_WRITE_ENABLE);
-       write_csr(dd, ASIC_EEP_DATA, data[0]);
-       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_PAGE_PROGRAM(offset));
-       for (i = 1; i < EP_PAGE_SIZE / sizeof(u32); i++)
-               write_csr(dd, ASIC_EEP_DATA, data[i]);
-       /* will close the open page */
-       return wait_for_not_busy(dd);
-}
-
-/*
- * Write length bytes starting at offset.  Read from user address addr.
- */
-static int write_length(struct hfi1_devdata *dd, u32 start, u32 len, u64 addr)
-{
-       u32 offset;
-       u32 buffer[EP_PAGE_SIZE / sizeof(u32)];
-       int ret = 0;
-
-       /* reject anything not on an EPROM page boundary */
-       if ((start & EEP_PAGE_MASK) || (len & EEP_PAGE_MASK))
-               return -EINVAL;
-
-       write_enable(dd);
-
-       for (offset = 0; offset < len; offset += EP_PAGE_SIZE) {
-               if (copy_from_user(buffer, (void __user *)(addr + offset),
-                                  EP_PAGE_SIZE)) {
-                       ret = -EFAULT;
-                       goto done;
-               }
-               ret = write_page(dd, start + offset, buffer);
-               if (ret)
-                       goto done;
-       }
-
-done:
-       write_disable(dd);
-       return ret;
-}
-
-/* convert an range composite to a length, in bytes */
-static inline u32 extract_rlen(u32 composite)
-{
-       return (composite & 0xffff) * EP_PAGE_SIZE;
-}
-
-/* convert an range composite to a start, in bytes */
-static inline u32 extract_rstart(u32 composite)
-{
-       return (composite >> 16) * EP_PAGE_SIZE;
-}
-
-/*
- * Perform the given operation on the EPROM.  Called from user space.  The
- * user credentials have already been checked.
- *
- * Return 0 on success, -ERRNO on error
- */
-int handle_eprom_command(struct file *fp, const struct hfi1_cmd *cmd)
-{
-       struct hfi1_devdata *dd;
-       u32 dev_id;
-       u32 rlen;       /* range length */
-       u32 rstart;     /* range start */
-       int i_minor;
-       int ret = 0;
-
-       /*
-        * Map the device file to device data using the relative minor.
-        * The device file minor number is the unit number + 1.  0 is
-        * the generic device file - reject it.
-        */
-       i_minor = iminor(file_inode(fp)) - HFI1_USER_MINOR_BASE;
-       if (i_minor <= 0)
-               return -EINVAL;
-       dd = hfi1_lookup(i_minor - 1);
-       if (!dd) {
-               pr_err("%s: cannot find unit %d!\n", __func__, i_minor);
-               return -EINVAL;
-       }
-
-       /* some devices do not have an EPROM */
-       if (!dd->eprom_available)
-               return -EOPNOTSUPP;
-
-       ret = acquire_chip_resource(dd, CR_EPROM, EPROM_TIMEOUT);
-       if (ret) {
-               dd_dev_err(dd, "%s: unable to acquire EPROM resource\n",
-                          __func__);
-               goto done_asic;
-       }
-
-       dd_dev_info(dd, "%s: cmd: type %d, len 0x%x, addr 0x%016llx\n",
-                   __func__, cmd->type, cmd->len, cmd->addr);
-
-       switch (cmd->type) {
-       case HFI1_CMD_EP_INFO:
-               if (cmd->len != sizeof(u32)) {
-                       ret = -ERANGE;
-                       break;
-               }
-               dev_id = read_device_id(dd);
-               /* addr points to a u32 user buffer */
-               if (copy_to_user((void __user *)cmd->addr, &dev_id,
-                                sizeof(u32)))
-                       ret = -EFAULT;
-               break;
-
-       case HFI1_CMD_EP_ERASE_CHIP:
-               ret = erase_chip(dd);
-               break;
-
-       case HFI1_CMD_EP_ERASE_RANGE:
-               rlen = extract_rlen(cmd->len);
-               rstart = extract_rstart(cmd->len);
-               ret = erase_range(dd, rstart, rlen);
-               break;
-
-       case HFI1_CMD_EP_READ_RANGE:
-               rlen = extract_rlen(cmd->len);
-               rstart = extract_rstart(cmd->len);
-               ret = read_length(dd, rstart, rlen, cmd->addr);
-               break;
-
-       case HFI1_CMD_EP_WRITE_RANGE:
-               rlen = extract_rlen(cmd->len);
-               rstart = extract_rstart(cmd->len);
-               ret = write_length(dd, rstart, rlen, cmd->addr);
-               break;
-
-       default:
-               dd_dev_err(dd, "%s: unexpected command %d\n",
-                          __func__, cmd->type);
-               ret = -EINVAL;
-               break;
-       }
-
-       release_chip_resource(dd, CR_EPROM);
-done_asic:
-       return ret;
-}
-
-/*
- * Initialize the EPROM handler.
- */
-int eprom_init(struct hfi1_devdata *dd)
-{
-       int ret = 0;
-
-       /* only the discrete chip has an EPROM */
-       if (dd->pcidev->device != PCI_DEVICE_ID_INTEL0)
-               return 0;
-
-       /*
-        * It is OK if both HFIs reset the EPROM as long as they don't
-        * do it at the same time.
-        */
-       ret = acquire_chip_resource(dd, CR_EPROM, EPROM_TIMEOUT);
-       if (ret) {
-               dd_dev_err(dd,
-                          "%s: unable to acquire EPROM resource, no EPROM support\n",
-                          __func__);
-               goto done_asic;
-       }
-
-       /* reset EPROM to be sure it is in a good state */
-
-       /* set reset */
-       write_csr(dd, ASIC_EEP_CTL_STAT, ASIC_EEP_CTL_STAT_EP_RESET_SMASK);
-       /* clear reset, set speed */
-       write_csr(dd, ASIC_EEP_CTL_STAT,
-                 EP_SPEED_FULL << ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT);
-
-       /* wake the device with command "release powerdown NoID" */
-       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_RELEASE_POWERDOWN_NOID);
-
-       dd->eprom_available = true;
-       release_chip_resource(dd, CR_EPROM);
-done_asic:
-       return ret;
-}
diff --git a/drivers/staging/rdma/hfi1/eprom.h b/drivers/staging/rdma/hfi1/eprom.h
deleted file mode 100644 (file)
index d41f0b1..0000000
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-struct hfi1_cmd;
-struct hfi1_devdata;
-
-int eprom_init(struct hfi1_devdata *dd);
-int handle_eprom_command(struct file *fp, const struct hfi1_cmd *cmd);
diff --git a/drivers/staging/rdma/hfi1/file_ops.c b/drivers/staging/rdma/hfi1/file_ops.c
deleted file mode 100644 (file)
index c1c5bf8..0000000
+++ /dev/null
@@ -1,1773 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#include <linux/poll.h>
-#include <linux/cdev.h>
-#include <linux/vmalloc.h>
-#include <linux/io.h>
-
-#include <rdma/ib.h>
-
-#include "hfi.h"
-#include "pio.h"
-#include "device.h"
-#include "common.h"
-#include "trace.h"
-#include "user_sdma.h"
-#include "user_exp_rcv.h"
-#include "eprom.h"
-#include "aspm.h"
-#include "mmu_rb.h"
-
-/* Prefix the pr_*() messages emitted from this file with the driver name. */
-#undef pr_fmt
-#define pr_fmt(fmt) DRIVER_NAME ": " fmt
-
-/* Upper bound on each wait performed while handling HFI1_CMD_CTXT_RESET. */
-#define SEND_CTXT_HALT_TIMEOUT 1000 /* msecs */
-
-/*
- * File operation functions
- */
-static int hfi1_file_open(struct inode *, struct file *);
-static int hfi1_file_close(struct inode *, struct file *);
-static ssize_t hfi1_file_write(struct file *, const char __user *,
-                              size_t, loff_t *);
-static ssize_t hfi1_write_iter(struct kiocb *, struct iov_iter *);
-static unsigned int hfi1_poll(struct file *, struct poll_table_struct *);
-static int hfi1_file_mmap(struct file *, struct vm_area_struct *);
-
-static u64 kvirt_to_phys(void *);
-static int assign_ctxt(struct file *, struct hfi1_user_info *);
-static int init_subctxts(struct hfi1_ctxtdata *, const struct hfi1_user_info *);
-static int user_init(struct file *);
-static int get_ctxt_info(struct file *, void __user *, __u32);
-static int get_base_info(struct file *, void __user *, __u32);
-static int setup_ctxt(struct file *);
-static int setup_subctxt(struct hfi1_ctxtdata *);
-static int get_user_context(struct file *, struct hfi1_user_info *,
-                           int, unsigned);
-static int find_shared_ctxt(struct file *, const struct hfi1_user_info *);
-static int allocate_ctxt(struct file *, struct hfi1_devdata *,
-                        struct hfi1_user_info *);
-static unsigned int poll_urgent(struct file *, struct poll_table_struct *);
-static unsigned int poll_next(struct file *, struct poll_table_struct *);
-static int user_event_ack(struct hfi1_ctxtdata *, int, unsigned long);
-static int set_ctxt_pkey(struct hfi1_ctxtdata *, unsigned, u16);
-static int manage_rcvq(struct hfi1_ctxtdata *, unsigned, int);
-static int vma_fault(struct vm_area_struct *, struct vm_fault *);
-
-/*
- * File operations table wiring up the handlers defined in this file.
- * Seeking is delegated to noop_llseek.
- */
-static const struct file_operations hfi1_file_ops = {
-       .owner      = THIS_MODULE,
-       .open       = hfi1_file_open,
-       .release    = hfi1_file_close,
-       .write      = hfi1_file_write,
-       .write_iter = hfi1_write_iter,
-       .mmap       = hfi1_file_mmap,
-       .poll       = hfi1_poll,
-       .llseek     = noop_llseek,
-};
-
-/*
- * VM operations installed by hfi1_file_mmap() on the mappings it does not
- * populate up front; vma_fault() supplies each page when it is first
- * touched.  The table is never modified after initialization, hence const.
- */
-static const struct vm_operations_struct vm_ops = {
-       .fault = vma_fault,
-};
-
-/*
- * Types of memories mapped into user processes' space.
- *
- * The value travels in the TYPE field of an mmap token and selects the
- * matching case in hfi1_file_mmap().  Values are spelled out because they
- * are encoded into those tokens.
- */
-enum mmap_types {
-       PIO_BUFS         = 1,   /* PIO send buffers of the send context */
-       PIO_BUFS_SOP     = 2,   /* same, in the second half of PIO space */
-       PIO_CRED         = 3,   /* credit return page (no write access) */
-       RCV_HDRQ         = 4,   /* receive header queue */
-       RCV_EGRBUF       = 5,   /* eager receive buffers (no write access) */
-       UREGS            = 6,   /* page with the context's user registers */
-       EVENTS           = 7,   /* page holding the context's event flags */
-       STATUS           = 8,   /* device status page */
-       RTAIL            = 9,   /* receive header queue tail (no write access) */
-       SUBCTXT_UREGS    = 10,  /* shared-context user register area */
-       SUBCTXT_RCV_HDRQ = 11,  /* shared-context receive header queues */
-       SUBCTXT_EGRBUF   = 12,  /* shared-context eager buffers */
-       SDMA_COMP        = 13   /* user SDMA completion queue entries */
-};
-
-/*
- * Masks and offsets defining the mmap tokens
- *
- * A token is a 64-bit value laid out as follows:
- *   bits  0-11  OFFSET   offset of the address within its page
- *   bits 12-15  SUBCTXT  sub-context number
- *   bits 16-23  CTXT     context number
- *   bits 24-27  TYPE     one of enum mmap_types
- *   bits 32-63  MAGIC    must equal HFI1_MMAP_MAGIC
- * Bits 28-31 are not covered by any field.
- */
-#define HFI1_MMAP_OFFSET_MASK   0xfffULL
-#define HFI1_MMAP_OFFSET_SHIFT  0
-#define HFI1_MMAP_SUBCTXT_MASK  0xfULL
-#define HFI1_MMAP_SUBCTXT_SHIFT 12
-#define HFI1_MMAP_CTXT_MASK     0xffULL
-#define HFI1_MMAP_CTXT_SHIFT    16
-#define HFI1_MMAP_TYPE_MASK     0xfULL
-#define HFI1_MMAP_TYPE_SHIFT    24
-#define HFI1_MMAP_MAGIC_MASK    0xffffffffULL
-#define HFI1_MMAP_MAGIC_SHIFT   32
-
-/* Value expected in the MAGIC field; checked by is_valid_mmap(). */
-#define HFI1_MMAP_MAGIC         0xdabbad00
-
-/* Mask @val to the width of @field and shift it into token position. */
-#define HFI1_MMAP_TOKEN_SET(field, val)        \
-       (((val) & HFI1_MMAP_##field##_MASK) << HFI1_MMAP_##field##_SHIFT)
-/* Extract @field from @token (shift down, then mask). */
-#define HFI1_MMAP_TOKEN_GET(field, token) \
-       (((token) >> HFI1_MMAP_##field##_SHIFT) & HFI1_MMAP_##field##_MASK)
-/*
- * Build a complete token.  Only the in-page offset of @addr is stored;
- * the MAGIC field is always filled with HFI1_MMAP_MAGIC.
- */
-#define HFI1_MMAP_TOKEN(type, ctxt, subctxt, addr)   \
-       (HFI1_MMAP_TOKEN_SET(MAGIC, HFI1_MMAP_MAGIC) | \
-       HFI1_MMAP_TOKEN_SET(TYPE, type) | \
-       HFI1_MMAP_TOKEN_SET(CTXT, ctxt) | \
-       HFI1_MMAP_TOKEN_SET(SUBCTXT, subctxt) | \
-       HFI1_MMAP_TOKEN_SET(OFFSET, (offset_in_page(addr))))
-
-/* Thin wrapper that forwards its arguments to pr_info(). */
-#define dbg(fmt, ...)                          \
-       pr_info(fmt, ##__VA_ARGS__)
-
-/* Return non-zero when the MAGIC field of @token holds HFI1_MMAP_MAGIC. */
-static inline int is_valid_mmap(u64 token)
-{
-       u64 magic = HFI1_MMAP_TOKEN_GET(MAGIC, token);
-
-       return magic == HFI1_MMAP_MAGIC;
-}
-
-/*
- * open() handler: allocate the zeroed per-file state and hang it off
- * fp->private_data.  No context is attached here; that happens later in
- * assign_ctxt().
- *
- * Returns 0 on success or -ENOMEM if the allocation fails (in which case
- * fp->private_data is left NULL).
- */
-static int hfi1_file_open(struct inode *inode, struct file *fp)
-{
-       struct hfi1_filedata *fd = kzalloc(sizeof(*fd), GFP_KERNEL);
-
-       fp->private_data = fd;
-       if (!fd)
-               return -ENOMEM;
-
-       /* no cpu affinity by default */
-       fd->rec_cpu_num = -1;
-       return 0;
-}
-
-/*
- * write() handler: the command interface of the device file.
- *
- * The buffer at @data must begin with a struct hfi1_cmd.  cmd.type selects
- * the operation.  Depending on the command, cmd.addr is either a user
- * pointer to an argument structure that is copied in here, or a value /
- * user pointer that is handed to the command handler as-is.
- *
- * Returns the number of bytes consumed (sizeof(cmd) plus the size of any
- * argument structure copied in) when the handler result is non-negative,
- * otherwise a negative errno.  Errors produced directly in this function:
- *   -EACCES  ib_safe_file_access() rejected the caller
- *   -EINVAL  @count too small, unknown command, or a context is required
- *            but none has been assigned to this file yet
- *   -EFAULT  a copy from or to user space failed
- *   -EPERM   EPROM command without CAP_SYS_ADMIN, or SET_PKEY while the
- *            PKEY_CHECK capability is not set
- *   -ENOLCK / -ENODEV  see HFI1_CMD_CTXT_RESET below
- */
-static ssize_t hfi1_file_write(struct file *fp, const char __user *data,
-                              size_t count, loff_t *offset)
-{
-       const struct hfi1_cmd __user *ucmd;
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct hfi1_cmd cmd;
-       struct hfi1_user_info uinfo;
-       struct hfi1_tid_info tinfo;
-       unsigned long addr;
-       ssize_t consumed = 0, copy = 0, ret = 0;
-       void *dest = NULL;
-       __u64 user_val = 0;
-       int uctxt_required = 1;
-       int must_be_root = 0;
-
-       /* FIXME: This interface cannot continue out of staging */
-       if (WARN_ON_ONCE(!ib_safe_file_access(fp)))
-               return -EACCES;
-
-       if (count < sizeof(cmd)) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       ucmd = (const struct hfi1_cmd __user *)data;
-       if (copy_from_user(&cmd, ucmd, sizeof(cmd))) {
-               ret = -EFAULT;
-               goto bail;
-       }
-
-       consumed = sizeof(cmd);
-
-       /*
-        * First pass over the command type: decide what (if anything) must
-        * be copied in from cmd.addr, whether an assigned context is needed
-        * and whether the caller must be privileged.  Nothing is executed
-        * yet.
-        */
-       switch (cmd.type) {
-       case HFI1_CMD_ASSIGN_CTXT:
-               uctxt_required = 0;     /* assigned user context not required */
-               copy = sizeof(uinfo);
-               dest = &uinfo;
-               break;
-       case HFI1_CMD_SDMA_STATUS_UPD:
-       case HFI1_CMD_CREDIT_UPD:
-               copy = 0;
-               break;
-       case HFI1_CMD_TID_UPDATE:
-       case HFI1_CMD_TID_FREE:
-       case HFI1_CMD_TID_INVAL_READ:
-               copy = sizeof(tinfo);
-               dest = &tinfo;
-               break;
-       case HFI1_CMD_USER_INFO:
-       case HFI1_CMD_RECV_CTRL:
-       case HFI1_CMD_POLL_TYPE:
-       case HFI1_CMD_ACK_EVENT:
-       case HFI1_CMD_CTXT_INFO:
-       case HFI1_CMD_SET_PKEY:
-       case HFI1_CMD_CTXT_RESET:
-               /* cmd.addr is passed through to the handler, not copied from */
-               copy = 0;
-               user_val = cmd.addr;
-               break;
-       case HFI1_CMD_EP_INFO:
-       case HFI1_CMD_EP_ERASE_CHIP:
-       case HFI1_CMD_EP_ERASE_RANGE:
-       case HFI1_CMD_EP_READ_RANGE:
-       case HFI1_CMD_EP_WRITE_RANGE:
-               uctxt_required = 0;     /* assigned user context not required */
-               must_be_root = 1;       /* validate user */
-               copy = 0;
-               break;
-       default:
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       /* If the command comes with user data, copy it. */
-       if (copy) {
-               if (copy_from_user(dest, (void __user *)cmd.addr, copy)) {
-                       ret = -EFAULT;
-                       goto bail;
-               }
-               consumed += copy;
-       }
-
-       /*
-        * Make sure there is a uctxt when needed.
-        */
-       if (uctxt_required && !uctxt) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       /* only root can do these operations */
-       if (must_be_root && !capable(CAP_SYS_ADMIN)) {
-               ret = -EPERM;
-               goto bail;
-       }
-
-       /* Second pass: all checks passed, execute the command. */
-       switch (cmd.type) {
-       case HFI1_CMD_ASSIGN_CTXT:
-               /* three steps; the first failure aborts the sequence */
-               ret = assign_ctxt(fp, &uinfo);
-               if (ret < 0)
-                       goto bail;
-               ret = setup_ctxt(fp);
-               if (ret)
-                       goto bail;
-               ret = user_init(fp);
-               break;
-       case HFI1_CMD_CTXT_INFO:
-               ret = get_ctxt_info(fp, (void __user *)(unsigned long)
-                                   user_val, cmd.len);
-               break;
-       case HFI1_CMD_USER_INFO:
-               ret = get_base_info(fp, (void __user *)(unsigned long)
-                                   user_val, cmd.len);
-               break;
-       case HFI1_CMD_SDMA_STATUS_UPD:
-               /* accepted, but nothing is done for it here */
-               break;
-       case HFI1_CMD_CREDIT_UPD:
-               if (uctxt && uctxt->sc)
-                       sc_return_credits(uctxt->sc);
-               break;
-       case HFI1_CMD_TID_UPDATE:
-               ret = hfi1_user_exp_rcv_setup(fp, &tinfo);
-               if (!ret) {
-                       /*
-                        * Copy the number of tidlist entries we used
-                        * and the length of the buffer we registered.
-                        * These fields are adjacent in the structure so
-                        * we can copy them at the same time.
-                        */
-                       addr = (unsigned long)cmd.addr +
-                               offsetof(struct hfi1_tid_info, tidcnt);
-                       if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
-                                        sizeof(tinfo.tidcnt) +
-                                        sizeof(tinfo.length)))
-                               ret = -EFAULT;
-               }
-               break;
-       case HFI1_CMD_TID_INVAL_READ:
-               ret = hfi1_user_exp_rcv_invalid(fp, &tinfo);
-               if (ret)
-                       break;
-               /* report only tidcnt back to the caller's structure */
-               addr = (unsigned long)cmd.addr +
-                       offsetof(struct hfi1_tid_info, tidcnt);
-               if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
-                                sizeof(tinfo.tidcnt)))
-                       ret = -EFAULT;
-               break;
-       case HFI1_CMD_TID_FREE:
-               ret = hfi1_user_exp_rcv_clear(fp, &tinfo);
-               if (ret)
-                       break;
-               /* report only tidcnt back to the caller's structure */
-               addr = (unsigned long)cmd.addr +
-                       offsetof(struct hfi1_tid_info, tidcnt);
-               if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
-                                sizeof(tinfo.tidcnt)))
-                       ret = -EFAULT;
-               break;
-       case HFI1_CMD_RECV_CTRL:
-               ret = manage_rcvq(uctxt, fd->subctxt, (int)user_val);
-               break;
-       case HFI1_CMD_POLL_TYPE:
-               /* stored without validation; hfi1_poll() rejects unknown types */
-               uctxt->poll_type = (typeof(uctxt->poll_type))user_val;
-               break;
-       case HFI1_CMD_ACK_EVENT:
-               ret = user_event_ack(uctxt, fd->subctxt, user_val);
-               break;
-       case HFI1_CMD_SET_PKEY:
-               if (HFI1_CAP_IS_USET(PKEY_CHECK))
-                       ret = set_ctxt_pkey(uctxt, fd->subctxt, user_val);
-               else
-                       ret = -EPERM;
-               break;
-       case HFI1_CMD_CTXT_RESET: {
-               struct send_context *sc;
-               struct hfi1_devdata *dd;
-
-               if (!uctxt || !uctxt->dd || !uctxt->sc) {
-                       ret = -EINVAL;
-                       break;
-               }
-               /*
-                * There is no protection here. User level has to
-                * guarantee that no one will be writing to the send
-                * context while it is being re-initialized.
-                * If user level breaks that guarantee, it will break
-                * its own context and no one else's.
-                */
-               dd = uctxt->dd;
-               sc = uctxt->sc;
-               /*
-                * Wait until the interrupt handler has marked the
-                * context as halted or frozen. Report error if we time
-                * out.
-                */
-               /*
-                * The wait's own return value is not examined; the flag
-                * is simply re-tested afterwards.
-                */
-               wait_event_interruptible_timeout(
-                       sc->halt_wait, (sc->flags & SCF_HALTED),
-                       msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
-               if (!(sc->flags & SCF_HALTED)) {
-                       ret = -ENOLCK;
-                       break;
-               }
-               /*
-                * If the send context was halted due to a Freeze,
-                * wait until the device has been "unfrozen" before
-                * resetting the context.
-                */
-               if (sc->flags & SCF_FROZEN) {
-                       wait_event_interruptible_timeout(
-                               dd->event_queue,
-                               !(ACCESS_ONCE(dd->flags) & HFI1_FROZEN),
-                               msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
-                       if (dd->flags & HFI1_FROZEN) {
-                               ret = -ENOLCK;
-                               break;
-                       }
-                       if (dd->flags & HFI1_FORCED_FREEZE) {
-                               /*
-                                * Don't allow context reset if we are into
-                                * forced freeze
-                                */
-                               ret = -ENODEV;
-                               break;
-                       }
-                       sc_disable(sc);
-                       ret = sc_enable(sc);
-                       /* issued regardless of the sc_enable() result */
-                       hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB,
-                                    uctxt->ctxt);
-               } else {
-                       ret = sc_restart(sc);
-               }
-               if (!ret)
-                       sc_return_credits(sc);
-               break;
-       }
-       case HFI1_CMD_EP_INFO:
-       case HFI1_CMD_EP_ERASE_CHIP:
-       case HFI1_CMD_EP_ERASE_RANGE:
-       case HFI1_CMD_EP_READ_RANGE:
-       case HFI1_CMD_EP_WRITE_RANGE:
-               ret = handle_eprom_command(fp, &cmd);
-               break;
-       }
-
-       /* Success is reported to write() as the number of bytes consumed. */
-       if (ret >= 0)
-               ret = consumed;
-bail:
-       return ret;
-}
-
-/*
- * write_iter() handler: submit user SDMA requests.
- *
- * The iovec array in @from is handed to hfi1_user_sdma_process_request()
- * repeatedly; each call reports through its last argument how many iovec
- * entries it used, and the next call starts right after them.
- *
- * Returns the number of requests processed, or a negative errno:
- *   -EIO     the file has no SDMA packet queue or completion queue
- *   -EINVAL  @from is not iovec-backed or has no segments
- *   -ENOSPC  the packet queue already holds its maximum number of requests
- * or the first non-zero value returned by the request processing.
- */
-static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from)
-{
-       struct file *filp = kiocb->ki_filp;
-       struct hfi1_filedata *fd = filp->private_data;
-       struct hfi1_user_sdma_pkt_q *pq = fd->pq;
-       struct hfi1_user_sdma_comp_q *cq = fd->cq;
-       unsigned long remaining = from->nr_segs;
-       int consumed = 0, nreqs = 0;
-
-       if (!cq || !pq)
-               return -EIO;
-
-       if (!iter_is_iovec(from) || !remaining)
-               return -EINVAL;
-
-       hfi1_cdbg(SDMA, "SDMA request from %u:%u (%lu)",
-                 fd->uctxt->ctxt, fd->subctxt, remaining);
-
-       if (atomic_read(&pq->n_reqs) == pq->n_max_reqs)
-               return -ENOSPC;
-
-       for (; remaining; nreqs++) {
-               unsigned long count = 0;
-               int ret;
-
-               ret = hfi1_user_sdma_process_request(
-                       filp, (struct iovec *)(from->iov + consumed),
-                       remaining, &count);
-               if (ret)
-                       return ret;
-
-               /* step past the iovec entries this request used */
-               remaining -= count;
-               consumed += count;
-       }
-
-       return nreqs;
-}
-
-/*
- * mmap() handler.
- *
- * The byte offset given to mmap() (vm_pgoff << PAGE_SHIFT) is not a file
- * offset but a token in the HFI1_MMAP_TOKEN() format: a magic value plus
- * the context, sub-context and enum mmap_types type being requested.  The
- * token must name the caller's own context and sub-context, and the
- * mapping must be VM_SHARED.
- *
- * Each type selects an address (memaddr), the exact length the vma must
- * have (memlen) and adjustments to the vma flags.  The mapping is then
- * established in one of three ways:
- *   vmf    nothing is mapped now; memaddr is a kernel virtual address
- *          whose page number is stored in vm_pgoff for vma_fault()
- *   mapio  io_remap_pfn_range()
- *   else   remap_pfn_range()
- * RCV_EGRBUF is the exception: it maps its buffers itself and returns.
- *
- * Returns 0 on success or a negative errno:
- *   -EINVAL  bad token, no context assigned, mapping not shared, token
- *            names another context, unknown type, RTAIL without the
- *            DMA_RTAIL capability, or vma length != memlen
- *   -EPERM   writable mapping requested for PIO_CRED, RCV_EGRBUF or RTAIL
- *   -EFAULT  SDMA_COMP requested but the file has no completion queue
- * or the error returned by the remap call.
- */
-static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
-{
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct hfi1_devdata *dd;
-       unsigned long flags, pfn;
-       u64 token = vma->vm_pgoff << PAGE_SHIFT,
-               memaddr = 0;
-       u8 subctxt, mapio = 0, vmf = 0, type;
-       ssize_t memlen = 0;
-       int ret = 0;
-       u16 ctxt;
-
-       if (!is_valid_mmap(token) || !uctxt ||
-           !(vma->vm_flags & VM_SHARED)) {
-               ret = -EINVAL;
-               goto done;
-       }
-       dd = uctxt->dd;
-       ctxt = HFI1_MMAP_TOKEN_GET(CTXT, token);
-       subctxt = HFI1_MMAP_TOKEN_GET(SUBCTXT, token);
-       type = HFI1_MMAP_TOKEN_GET(TYPE, token);
-       /* a file may only map memory of the context it was assigned */
-       if (ctxt != uctxt->ctxt || subctxt != fd->subctxt) {
-               ret = -EINVAL;
-               goto done;
-       }
-
-       /* work on a copy; written back to the vma only after the size check */
-       flags = vma->vm_flags;
-
-       switch (type) {
-       case PIO_BUFS:
-       case PIO_BUFS_SOP:
-               memaddr = ((dd->physaddr + TXE_PIO_SEND) +
-                               /* chip pio base */
-                          (uctxt->sc->hw_context * BIT(16))) +
-                               /* 64K PIO space / ctxt */
-                       (type == PIO_BUFS_SOP ?
-                               (TXE_PIO_SIZE / 2) : 0); /* sop? */
-               /*
-                * Map only the amount allocated to the context, not the
-                * entire available context's PIO space.
-                */
-               memlen = PAGE_ALIGN(uctxt->sc->credits * PIO_BLOCK_SIZE);
-               flags &= ~VM_MAYREAD;
-               flags |= VM_DONTCOPY | VM_DONTEXPAND;
-               vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
-               mapio = 1;
-               break;
-       case PIO_CRED:
-               if (flags & VM_WRITE) {
-                       ret = -EPERM;
-                       goto done;
-               }
-               /*
-                * The credit return location for this context could be on the
-                * second or third page allocated for credit returns (if number
-                * of enabled contexts > 64 and 128 respectively).
-                */
-               memaddr = dd->cr_base[uctxt->numa_id].pa +
-                       (((u64)uctxt->sc->hw_free -
-                         (u64)dd->cr_base[uctxt->numa_id].va) & PAGE_MASK);
-               memlen = PAGE_SIZE;
-               flags &= ~VM_MAYWRITE;
-               flags |= VM_DONTCOPY | VM_DONTEXPAND;
-               /*
-                * The driver has already allocated memory for credit
-                * returns and programmed it into the chip. Has that
-                * memory been flagged as non-cached?
-                */
-               /* vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); */
-               mapio = 1;
-               break;
-       case RCV_HDRQ:
-               memaddr = uctxt->rcvhdrq_phys;
-               memlen = uctxt->rcvhdrq_size;
-               break;
-       case RCV_EGRBUF: {
-               unsigned long addr;
-               int i;
-               /*
-                * The RcvEgr buffer need to be handled differently
-                * as multiple non-contiguous pages need to be mapped
-                * into the user process.
-                */
-               memlen = uctxt->egrbufs.size;
-               if ((vma->vm_end - vma->vm_start) != memlen) {
-                       dd_dev_err(dd, "Eager buffer map size invalid (%lu != %lu)\n",
-                                  (vma->vm_end - vma->vm_start), memlen);
-                       ret = -EINVAL;
-                       goto done;
-               }
-               if (vma->vm_flags & VM_WRITE) {
-                       ret = -EPERM;
-                       goto done;
-               }
-               vma->vm_flags &= ~VM_MAYWRITE;
-               /* lay the buffers out back to back in the user mapping */
-               addr = vma->vm_start;
-               for (i = 0 ; i < uctxt->egrbufs.numbufs; i++) {
-                       ret = remap_pfn_range(
-                               vma, addr,
-                               uctxt->egrbufs.buffers[i].phys >> PAGE_SHIFT,
-                               uctxt->egrbufs.buffers[i].len,
-                               vma->vm_page_prot);
-                       if (ret < 0)
-                               goto done;
-                       addr += uctxt->egrbufs.buffers[i].len;
-               }
-               ret = 0;
-               goto done;
-       }
-       case UREGS:
-               /*
-                * Map only the page that contains this context's user
-                * registers.
-                */
-               memaddr = (unsigned long)
-                       (dd->physaddr + RXE_PER_CONTEXT_USER)
-                       + (uctxt->ctxt * RXE_PER_CONTEXT_SIZE);
-               /*
-                * TidFlow table is on the same page as the rest of the
-                * user registers.
-                */
-               memlen = PAGE_SIZE;
-               flags |= VM_DONTCOPY | VM_DONTEXPAND;
-               vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-               mapio = 1;
-               break;
-       case EVENTS:
-               /*
-                * Use the page where this context's flags are. User level
-                * knows where its own bitmap is within the page.
-                */
-               memaddr = (unsigned long)(dd->events +
-                                         ((uctxt->ctxt - dd->first_user_ctxt) *
-                                          HFI1_MAX_SHARED_CTXTS)) & PAGE_MASK;
-               memlen = PAGE_SIZE;
-               /*
-                * v3.7 removes VM_RESERVED but the effect is kept by
-                * using VM_IO.
-                */
-               flags |= VM_IO | VM_DONTEXPAND;
-               vmf = 1;
-               break;
-       case STATUS:
-               memaddr = kvirt_to_phys((void *)dd->status);
-               memlen = PAGE_SIZE;
-               flags |= VM_IO | VM_DONTEXPAND;
-               break;
-       case RTAIL:
-               if (!HFI1_CAP_IS_USET(DMA_RTAIL)) {
-                       /*
-                        * If the memory allocation failed, the context alloc
-                        * also would have failed, so we would never get here
-                        */
-                       ret = -EINVAL;
-                       goto done;
-               }
-               if (flags & VM_WRITE) {
-                       ret = -EPERM;
-                       goto done;
-               }
-               memaddr = uctxt->rcvhdrqtailaddr_phys;
-               memlen = PAGE_SIZE;
-               flags &= ~VM_MAYWRITE;
-               break;
-       case SUBCTXT_UREGS:
-               memaddr = (u64)uctxt->subctxt_uregbase;
-               memlen = PAGE_SIZE;
-               flags |= VM_IO | VM_DONTEXPAND;
-               vmf = 1;
-               break;
-       case SUBCTXT_RCV_HDRQ:
-               memaddr = (u64)uctxt->subctxt_rcvhdr_base;
-               memlen = uctxt->rcvhdrq_size * uctxt->subctxt_cnt;
-               flags |= VM_IO | VM_DONTEXPAND;
-               vmf = 1;
-               break;
-       case SUBCTXT_EGRBUF:
-               memaddr = (u64)uctxt->subctxt_rcvegrbuf;
-               memlen = uctxt->egrbufs.size * uctxt->subctxt_cnt;
-               flags |= VM_IO | VM_DONTEXPAND;
-               flags &= ~VM_MAYWRITE;
-               vmf = 1;
-               break;
-       case SDMA_COMP: {
-               struct hfi1_user_sdma_comp_q *cq = fd->cq;
-
-               if (!cq) {
-                       ret = -EFAULT;
-                       goto done;
-               }
-               memaddr = (u64)cq->comps;
-               memlen = PAGE_ALIGN(sizeof(*cq->comps) * cq->nentries);
-               flags |= VM_IO | VM_DONTEXPAND;
-               vmf = 1;
-               break;
-       }
-       default:
-               /*
-                * Does not jump to done: memlen is still 0 here, so the
-                * size check below rejects any non-empty vma with -EINVAL.
-                */
-               ret = -EINVAL;
-               break;
-       }
-
-       /* the vma must cover exactly the object being mapped */
-       if ((vma->vm_end - vma->vm_start) != memlen) {
-               hfi1_cdbg(PROC, "%u:%u Memory size mismatch %lu:%lu",
-                         uctxt->ctxt, fd->subctxt,
-                         (vma->vm_end - vma->vm_start), memlen);
-               ret = -EINVAL;
-               goto done;
-       }
-
-       vma->vm_flags = flags;
-       hfi1_cdbg(PROC,
-                 "%u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n",
-                   ctxt, subctxt, type, mapio, vmf, memaddr, memlen,
-                   vma->vm_end - vma->vm_start, vma->vm_flags);
-       pfn = (unsigned long)(memaddr >> PAGE_SHIFT);
-       if (vmf) {
-               /*
-                * Fault-driven mapping: stash the page number of the kernel
-                * virtual address in vm_pgoff; vma_fault() turns the fault
-                * offset back into an address and looks up the page.
-                */
-               vma->vm_pgoff = pfn;
-               vma->vm_ops = &vm_ops;
-               ret = 0;
-       } else if (mapio) {
-               ret = io_remap_pfn_range(vma, vma->vm_start, pfn, memlen,
-                                        vma->vm_page_prot);
-       } else {
-               ret = remap_pfn_range(vma, vma->vm_start, pfn, memlen,
-                                     vma->vm_page_prot);
-       }
-done:
-       return ret;
-}
-
-/*
- * Fault handler for the mappings that hfi1_file_mmap() leaves unpopulated.
- *
- * Local (non-chip) user memory is mapped page by page as user-level code
- * touches it.  hfi1_file_mmap() stored the page number of a kernel virtual
- * address in vm_pgoff, so the fault's page offset shifted by PAGE_SHIFT is
- * treated as a kernel virtual address and resolved with vmalloc_to_page().
- *
- * Returns 0 with a referenced page in vmf->page, or VM_FAULT_SIGBUS when
- * no page backs that address.
- */
-static int vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-       void *kaddr = (void *)(vmf->pgoff << PAGE_SHIFT);
-       struct page *pg = vmalloc_to_page(kaddr);
-
-       if (pg) {
-               /* take a reference on the page being handed out */
-               get_page(pg);
-               vmf->page = pg;
-               return 0;
-       }
-
-       return VM_FAULT_SIGBUS;
-}
-
-/*
- * poll() handler: dispatch on the poll type selected for the context.
- *
- * Returns the mask produced by poll_urgent() or poll_next(), or POLLERR
- * when the file has no context assigned or the poll type is not one of
- * HFI1_POLL_TYPE_URGENT / HFI1_POLL_TYPE_ANYRCV.
- */
-static unsigned int hfi1_poll(struct file *fp, struct poll_table_struct *pt)
-{
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-
-       if (!uctxt)
-               return POLLERR;
-
-       if (uctxt->poll_type == HFI1_POLL_TYPE_URGENT)
-               return poll_urgent(fp, pt);
-
-       if (uctxt->poll_type == HFI1_POLL_TYPE_ANYRCV)
-               return poll_next(fp, pt);
-
-       /* invalid poll type */
-       return POLLERR;
-}
-
-/*
- * release() handler: undo what open() and context assignment set up.
- *
- * If no context was ever assigned, only the per-file data is freed.
- * Otherwise the file's SDMA queues, CPU affinity and pending events are
- * released.  When other files still use the context (uctxt->cnt stays
- * non-zero after the decrement) only this sub-context's bookkeeping is
- * cleared; the last user additionally disables the receive and send
- * contexts and frees the context data.
- *
- * Always returns 0.
- */
-static int hfi1_file_close(struct inode *inode, struct file *fp)
-{
-       struct hfi1_filedata *fdata = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fdata->uctxt;
-       struct hfi1_devdata *dd;
-       unsigned long flags, *ev;
-
-       fp->private_data = NULL;
-
-       if (!uctxt)
-               goto done;
-
-       hfi1_cdbg(PROC, "freeing ctxt %u:%u", uctxt->ctxt, fdata->subctxt);
-       dd = uctxt->dd;
-       mutex_lock(&hfi1_mutex);
-
-       flush_wc();
-       /* drain user sdma queue */
-       hfi1_user_sdma_free_queues(fdata);
-
-       /* release the cpu */
-       hfi1_put_proc_affinity(dd, fdata->rec_cpu_num);
-
-       /*
-        * Clear any left over, unhandled events so the next process that
-        * gets this context doesn't get confused.
-        */
-       ev = dd->events + ((uctxt->ctxt - dd->first_user_ctxt) *
-                          HFI1_MAX_SHARED_CTXTS) + fdata->subctxt;
-       *ev = 0;
-
-       /* Not the last user: drop only this sub-context's bookkeeping. */
-       if (--uctxt->cnt) {
-               uctxt->active_slaves &= ~(1 << fdata->subctxt);
-               uctxt->subpid[fdata->subctxt] = 0;
-               mutex_unlock(&hfi1_mutex);
-               goto done;
-       }
-
-       /* Last user: tear the context down. */
-       spin_lock_irqsave(&dd->uctxt_lock, flags);
-       /*
-        * Disable receive context and interrupt available, reset all
-        * RcvCtxtCtrl bits to default values.
-        */
-       hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
-                    HFI1_RCVCTRL_TIDFLOW_DIS |
-                    HFI1_RCVCTRL_INTRAVAIL_DIS |
-                    HFI1_RCVCTRL_TAILUPD_DIS |
-                    HFI1_RCVCTRL_ONE_PKT_EGR_DIS |
-                    HFI1_RCVCTRL_NO_RHQ_DROP_DIS |
-                    HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt->ctxt);
-       /* Clear the context's J_KEY */
-       hfi1_clear_ctxt_jkey(dd, uctxt->ctxt);
-       /*
-        * Reset context integrity checks to default.
-        * (writes to CSRs probably belong in chip.c)
-        */
-       write_kctxt_csr(dd, uctxt->sc->hw_context, SEND_CTXT_CHECK_ENABLE,
-                       hfi1_pkt_default_send_ctxt_mask(dd, uctxt->sc->type));
-       sc_disable(uctxt->sc);
-       uctxt->pid = 0;
-       spin_unlock_irqrestore(&dd->uctxt_lock, flags);
-
-       /* remove the context from the device's table */
-       dd->rcd[uctxt->ctxt] = NULL;
-
-       hfi1_user_exp_rcv_free(fdata);
-       hfi1_clear_ctxt_pkey(dd, uctxt->ctxt);
-
-       uctxt->rcvwait_to = 0;
-       uctxt->piowait_to = 0;
-       uctxt->rcvnowait = 0;
-       uctxt->pionowait = 0;
-       uctxt->event_flags = 0;
-
-       hfi1_stats.sps_ctxts--;
-       /* once every user context is free again, call aspm_enable_all() */
-       if (++dd->freectxts == dd->num_user_contexts)
-               aspm_enable_all(dd);
-       mutex_unlock(&hfi1_mutex);
-       hfi1_free_ctxtdata(dd, uctxt);
-done:
-       kfree(fdata);
-       return 0;
-}
-
-/*
- * Convert a kernel *virtual* address to a physical address.
- * This is used for vmalloc'ed addresses, which are resolved through
- * vmalloc_to_page().
- *
- * Returns the physical address of the start of the page backing @addr
- * (the offset of @addr within that page is not added), or 0 when no page
- * is found.
- */
-static u64 kvirt_to_phys(void *addr)
-{
-       struct page *page;
-       u64 paddr = 0;
-
-       page = vmalloc_to_page(addr);
-       if (page) {
-               /*
-                * Widen before shifting: the page frame number is an
-                * unsigned long, so on a 32-bit build shifting it first
-                * would drop the high bits of a physical address above
-                * 4GB.
-                */
-               paddr = (u64)page_to_pfn(page) << PAGE_SHIFT;
-       }
-
-       return paddr;
-}
-
-/*
- * Attach a context to the open file described by @fp.
- *
- * The major number in the upper 16 bits of uinfo->userversion must match
- * HFI1_USER_SWMAJOR.  When uinfo->subctxt_cnt is non-zero an existing
- * shared context is looked for first; if one is found the file also gets
- * a CPU recommendation from hfi1_get_proc_affinity().  If sharing was not
- * requested, or no shared context was found, a context is obtained through
- * get_user_context().  The lookup and allocation run under hfi1_mutex.
- *
- * uinfo->hfi1_alg is honoured only when below HFI1_ALG_COUNT; otherwise
- * HFI1_ALG_ACROSS is used.
- *
- * Returns -ENODEV on a major version mismatch, a negative value from
- * find_shared_ctxt(), the positive value find_shared_ctxt() returned when
- * a shared context was found, or otherwise the result of
- * get_user_context().
- */
-static int assign_ctxt(struct file *fp, struct hfi1_user_info *uinfo)
-{
-       struct hfi1_filedata *fd = fp->private_data;
-       int i_minor, ret = 0;
-       unsigned swmajor, alg = HFI1_ALG_ACROSS;
-
-       swmajor = uinfo->userversion >> 16;
-       if (swmajor != HFI1_USER_SWMAJOR) {
-               ret = -ENODEV;
-               goto done;
-       }
-
-       if (uinfo->hfi1_alg < HFI1_ALG_COUNT)
-               alg = uinfo->hfi1_alg;
-
-       mutex_lock(&hfi1_mutex);
-       /* First, lets check if we need to setup a shared context? */
-       if (uinfo->subctxt_cnt) {
-               ret = find_shared_ctxt(fp, uinfo);
-               if (ret < 0)
-                       goto done_unlock;
-               if (ret)
-                       fd->rec_cpu_num = hfi1_get_proc_affinity(
-                               fd->uctxt->dd, fd->uctxt->numa_id);
-       }
-
-       /*
-        * We execute the following block if we couldn't find a
-        * shared context or if context sharing is not required.
-        */
-       if (!ret) {
-               /* minor numbers are offset by one relative to the unit number */
-               i_minor = iminor(file_inode(fp)) - HFI1_USER_MINOR_BASE;
-               ret = get_user_context(fp, uinfo, i_minor - 1, alg);
-       }
-done_unlock:
-       mutex_unlock(&hfi1_mutex);
-done:
-       return ret;
-}
-
-/*
- * Return true if the device is available for general use, i.e. the
- * driver link state of its port (dd->pport) is IB_PORT_ACTIVE.
- */
-static int usable_device(struct hfi1_devdata *dd)
-{
-       return driver_lstate(dd->pport) == IB_PORT_ACTIVE;
-}
-
-static int get_user_context(struct file *fp, struct hfi1_user_info *uinfo,
-                           int devno, unsigned alg)
-{
-       struct hfi1_devdata *dd = NULL;
-       int ret = 0, devmax, npresent, nup, dev;
-
-       devmax = hfi1_count_units(&npresent, &nup);
-       if (!npresent) {
-               ret = -ENXIO;
-               goto done;
-       }
-       if (!nup) {
-               ret = -ENETDOWN;
-               goto done;
-       }
-       if (devno >= 0) {
-               dd = hfi1_lookup(devno);
-               if (!dd)
-                       ret = -ENODEV;
-               else if (!dd->freectxts)
-                       ret = -EBUSY;
-       } else {
-               struct hfi1_devdata *pdd;
-
-               if (alg == HFI1_ALG_ACROSS) {
-                       unsigned free = 0U;
-
-                       for (dev = 0; dev < devmax; dev++) {
-                               pdd = hfi1_lookup(dev);
-                               if (!pdd)
-                                       continue;
-                               if (!usable_device(pdd))
-                                       continue;
-                               if (pdd->freectxts &&
-                                   pdd->freectxts > free) {
-                                       dd = pdd;
-                                       free = pdd->freectxts;
-                               }
-                       }
-               } else {
-                       for (dev = 0; dev < devmax; dev++) {
-                               pdd = hfi1_lookup(dev);
-                               if (!pdd)
-                                       continue;
-                               if (!usable_device(pdd))
-                                       continue;
-                               if (pdd->freectxts) {
-                                       dd = pdd;
-                                       break;
-                               }
-                       }
-               }
-               if (!dd)
-                       ret = -EBUSY;
-       }
-done:
-       return ret ? ret : allocate_ctxt(fp, dd, uinfo);
-}
-
-static int find_shared_ctxt(struct file *fp,
-                           const struct hfi1_user_info *uinfo)
-{
-       int devmax, ndev, i;
-       int ret = 0;
-       struct hfi1_filedata *fd = fp->private_data;
-
-       devmax = hfi1_count_units(NULL, NULL);
-
-       for (ndev = 0; ndev < devmax; ndev++) {
-               struct hfi1_devdata *dd = hfi1_lookup(ndev);
-
-               if (!(dd && (dd->flags & HFI1_PRESENT) && dd->kregbase))
-                       continue;
-               for (i = dd->first_user_ctxt; i < dd->num_rcv_contexts; i++) {
-                       struct hfi1_ctxtdata *uctxt = dd->rcd[i];
-
-                       /* Skip ctxts which are not yet open */
-                       if (!uctxt || !uctxt->cnt)
-                               continue;
-                       /* Skip ctxt if it doesn't match the requested one */
-                       if (memcmp(uctxt->uuid, uinfo->uuid,
-                                  sizeof(uctxt->uuid)) ||
-                           uctxt->jkey != generate_jkey(current_uid()) ||
-                           uctxt->subctxt_id != uinfo->subctxt_id ||
-                           uctxt->subctxt_cnt != uinfo->subctxt_cnt)
-                               continue;
-
-                       /* Verify the sharing process matches the master */
-                       if (uctxt->userversion != uinfo->userversion ||
-                           uctxt->cnt >= uctxt->subctxt_cnt) {
-                               ret = -EINVAL;
-                               goto done;
-                       }
-                       fd->uctxt = uctxt;
-                       fd->subctxt  = uctxt->cnt++;
-                       uctxt->subpid[fd->subctxt] = current->pid;
-                       uctxt->active_slaves |= 1 << fd->subctxt;
-                       ret = 1;
-                       goto done;
-               }
-       }
-
-done:
-       return ret;
-}
-
-static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd,
-                        struct hfi1_user_info *uinfo)
-{
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt;
-       unsigned ctxt;
-       int ret, numa;
-
-       if (dd->flags & HFI1_FROZEN) {
-               /*
-                * Pick an error that is unique from all other errors
-                * that are returned so the user process knows that
-                * it tried to allocate while the SPC was frozen.  It
-                * it should be able to retry with success in a short
-                * while.
-                */
-               return -EIO;
-       }
-
-       for (ctxt = dd->first_user_ctxt; ctxt < dd->num_rcv_contexts; ctxt++)
-               if (!dd->rcd[ctxt])
-                       break;
-
-       if (ctxt == dd->num_rcv_contexts)
-               return -EBUSY;
-
-       fd->rec_cpu_num = hfi1_get_proc_affinity(dd, -1);
-       if (fd->rec_cpu_num != -1)
-               numa = cpu_to_node(fd->rec_cpu_num);
-       else
-               numa = numa_node_id();
-       uctxt = hfi1_create_ctxtdata(dd->pport, ctxt, numa);
-       if (!uctxt) {
-               dd_dev_err(dd,
-                          "Unable to allocate ctxtdata memory, failing open\n");
-               return -ENOMEM;
-       }
-       hfi1_cdbg(PROC, "[%u:%u] pid %u assigned to CPU %d (NUMA %u)",
-                 uctxt->ctxt, fd->subctxt, current->pid, fd->rec_cpu_num,
-                 uctxt->numa_id);
-
-       /*
-        * Allocate and enable a PIO send context.
-        */
-       uctxt->sc = sc_alloc(dd, SC_USER, uctxt->rcvhdrqentsize,
-                            uctxt->dd->node);
-       if (!uctxt->sc)
-               return -ENOMEM;
-
-       hfi1_cdbg(PROC, "allocated send context %u(%u)\n", uctxt->sc->sw_index,
-                 uctxt->sc->hw_context);
-       ret = sc_enable(uctxt->sc);
-       if (ret)
-               return ret;
-       /*
-        * Setup shared context resources if the user-level has requested
-        * shared contexts and this is the 'master' process.
-        * This has to be done here so the rest of the sub-contexts find the
-        * proper master.
-        */
-       if (uinfo->subctxt_cnt && !fd->subctxt) {
-               ret = init_subctxts(uctxt, uinfo);
-               /*
-                * On error, we don't need to disable and de-allocate the
-                * send context because it will be done during file close
-                */
-               if (ret)
-                       return ret;
-       }
-       uctxt->userversion = uinfo->userversion;
-       uctxt->pid = current->pid;
-       uctxt->flags = HFI1_CAP_UGET(MASK);
-       init_waitqueue_head(&uctxt->wait);
-       strlcpy(uctxt->comm, current->comm, sizeof(uctxt->comm));
-       memcpy(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid));
-       uctxt->jkey = generate_jkey(current_uid());
-       INIT_LIST_HEAD(&uctxt->sdma_queues);
-       spin_lock_init(&uctxt->sdma_qlock);
-       hfi1_stats.sps_ctxts++;
-       /*
-        * Disable ASPM when there are open user/PSM contexts to avoid
-        * issues with ASPM L1 exit latency
-        */
-       if (dd->freectxts-- == dd->num_user_contexts)
-               aspm_disable_all(dd);
-       fd->uctxt = uctxt;
-
-       return 0;
-}
-
-static int init_subctxts(struct hfi1_ctxtdata *uctxt,
-                        const struct hfi1_user_info *uinfo)
-{
-       unsigned num_subctxts;
-
-       num_subctxts = uinfo->subctxt_cnt;
-       if (num_subctxts > HFI1_MAX_SHARED_CTXTS)
-               return -EINVAL;
-
-       uctxt->subctxt_cnt = uinfo->subctxt_cnt;
-       uctxt->subctxt_id = uinfo->subctxt_id;
-       uctxt->active_slaves = 1;
-       uctxt->redirect_seq_cnt = 1;
-       set_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags);
-
-       return 0;
-}
-
-static int setup_subctxt(struct hfi1_ctxtdata *uctxt)
-{
-       int ret = 0;
-       unsigned num_subctxts = uctxt->subctxt_cnt;
-
-       uctxt->subctxt_uregbase = vmalloc_user(PAGE_SIZE);
-       if (!uctxt->subctxt_uregbase) {
-               ret = -ENOMEM;
-               goto bail;
-       }
-       /* We can take the size of the RcvHdr Queue from the master */
-       uctxt->subctxt_rcvhdr_base = vmalloc_user(uctxt->rcvhdrq_size *
-                                                 num_subctxts);
-       if (!uctxt->subctxt_rcvhdr_base) {
-               ret = -ENOMEM;
-               goto bail_ureg;
-       }
-
-       uctxt->subctxt_rcvegrbuf = vmalloc_user(uctxt->egrbufs.size *
-                                               num_subctxts);
-       if (!uctxt->subctxt_rcvegrbuf) {
-               ret = -ENOMEM;
-               goto bail_rhdr;
-       }
-       goto bail;
-bail_rhdr:
-       vfree(uctxt->subctxt_rcvhdr_base);
-bail_ureg:
-       vfree(uctxt->subctxt_uregbase);
-       uctxt->subctxt_uregbase = NULL;
-bail:
-       return ret;
-}
-
-static int user_init(struct file *fp)
-{
-       unsigned int rcvctrl_ops = 0;
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-
-       /* make sure that the context has already been setup */
-       if (!test_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags))
-               return -EFAULT;
-
-       /* initialize poll variables... */
-       uctxt->urgent = 0;
-       uctxt->urgent_poll = 0;
-
-       /*
-        * Now enable the ctxt for receive.
-        * For chips that are set to DMA the tail register to memory
-        * when they change (and when the update bit transitions from
-        * 0 to 1.  So for those chips, we turn it off and then back on.
-        * This will (very briefly) affect any other open ctxts, but the
-        * duration is very short, and therefore isn't an issue.  We
-        * explicitly set the in-memory tail copy to 0 beforehand, so we
-        * don't have to wait to be sure the DMA update has happened
-        * (chip resets head/tail to 0 on transition to enable).
-        */
-       if (uctxt->rcvhdrtail_kvaddr)
-               clear_rcvhdrtail(uctxt);
-
-       /* Setup J_KEY before enabling the context */
-       hfi1_set_ctxt_jkey(uctxt->dd, uctxt->ctxt, uctxt->jkey);
-
-       rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB;
-       if (HFI1_CAP_KGET_MASK(uctxt->flags, HDRSUPP))
-               rcvctrl_ops |= HFI1_RCVCTRL_TIDFLOW_ENB;
-       /*
-        * Ignore the bit in the flags for now until proper
-        * support for multiple packet per rcv array entry is
-        * added.
-        */
-       if (!HFI1_CAP_KGET_MASK(uctxt->flags, MULTI_PKT_EGR))
-               rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
-       if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_EGR_FULL))
-               rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
-       if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_RHQ_FULL))
-               rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
-       /*
-        * The RcvCtxtCtrl.TailUpd bit has to be explicitly written.
-        * We can't rely on the correct value to be set from prior
-        * uses of the chip or ctxt. Therefore, add the rcvctrl op
-        * for both cases.
-        */
-       if (HFI1_CAP_KGET_MASK(uctxt->flags, DMA_RTAIL))
-               rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB;
-       else
-               rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_DIS;
-       hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt->ctxt);
-
-       /* Notify any waiting slaves */
-       if (uctxt->subctxt_cnt) {
-               clear_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags);
-               wake_up(&uctxt->wait);
-       }
-
-       return 0;
-}
-
-static int get_ctxt_info(struct file *fp, void __user *ubase, __u32 len)
-{
-       struct hfi1_ctxt_info cinfo;
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       int ret = 0;
-
-       memset(&cinfo, 0, sizeof(cinfo));
-       ret = hfi1_get_base_kinfo(uctxt, &cinfo);
-       if (ret < 0)
-               goto done;
-       cinfo.num_active = hfi1_count_active_units();
-       cinfo.unit = uctxt->dd->unit;
-       cinfo.ctxt = uctxt->ctxt;
-       cinfo.subctxt = fd->subctxt;
-       cinfo.rcvtids = roundup(uctxt->egrbufs.alloced,
-                               uctxt->dd->rcv_entries.group_size) +
-               uctxt->expected_count;
-       cinfo.credits = uctxt->sc->credits;
-       cinfo.numa_node = uctxt->numa_id;
-       cinfo.rec_cpu = fd->rec_cpu_num;
-       cinfo.send_ctxt = uctxt->sc->hw_context;
-
-       cinfo.egrtids = uctxt->egrbufs.alloced;
-       cinfo.rcvhdrq_cnt = uctxt->rcvhdrq_cnt;
-       cinfo.rcvhdrq_entsize = uctxt->rcvhdrqentsize << 2;
-       cinfo.sdma_ring_size = fd->cq->nentries;
-       cinfo.rcvegr_size = uctxt->egrbufs.rcvtid_size;
-
-       trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, fd->subctxt, cinfo);
-       if (copy_to_user(ubase, &cinfo, sizeof(cinfo)))
-               ret = -EFAULT;
-done:
-       return ret;
-}
-
-static int setup_ctxt(struct file *fp)
-{
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct hfi1_devdata *dd = uctxt->dd;
-       int ret = 0;
-
-       /*
-        * Context should be set up only once, including allocation and
-        * programming of eager buffers. This is done if context sharing
-        * is not requested or by the master process.
-        */
-       if (!uctxt->subctxt_cnt || !fd->subctxt) {
-               ret = hfi1_init_ctxt(uctxt->sc);
-               if (ret)
-                       goto done;
-
-               /* Now allocate the RcvHdr queue and eager buffers. */
-               ret = hfi1_create_rcvhdrq(dd, uctxt);
-               if (ret)
-                       goto done;
-               ret = hfi1_setup_eagerbufs(uctxt);
-               if (ret)
-                       goto done;
-               if (uctxt->subctxt_cnt && !fd->subctxt) {
-                       ret = setup_subctxt(uctxt);
-                       if (ret)
-                               goto done;
-               }
-       } else {
-               ret = wait_event_interruptible(uctxt->wait, !test_bit(
-                                              HFI1_CTXT_MASTER_UNINIT,
-                                              &uctxt->event_flags));
-               if (ret)
-                       goto done;
-       }
-
-       ret = hfi1_user_sdma_alloc_queues(uctxt, fp);
-       if (ret)
-               goto done;
-       /*
-        * Expected receive has to be setup for all processes (including
-        * shared contexts). However, it has to be done after the master
-        * context has been fully configured as it depends on the
-        * eager/expected split of the RcvArray entries.
-        * Setting it up here ensures that the subcontexts will be waiting
-        * (due to the above wait_event_interruptible() until the master
-        * is setup.
-        */
-       ret = hfi1_user_exp_rcv_init(fp);
-       if (ret)
-               goto done;
-
-       set_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags);
-done:
-       return ret;
-}
-
-static int get_base_info(struct file *fp, void __user *ubase, __u32 len)
-{
-       struct hfi1_base_info binfo;
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct hfi1_devdata *dd = uctxt->dd;
-       ssize_t sz;
-       unsigned offset;
-       int ret = 0;
-
-       trace_hfi1_uctxtdata(uctxt->dd, uctxt);
-
-       memset(&binfo, 0, sizeof(binfo));
-       binfo.hw_version = dd->revision;
-       binfo.sw_version = HFI1_KERN_SWVERSION;
-       binfo.bthqp = kdeth_qp;
-       binfo.jkey = uctxt->jkey;
-       /*
-        * If more than 64 contexts are enabled the allocated credit
-        * return will span two or three contiguous pages. Since we only
-        * map the page containing the context's credit return address,
-        * we need to calculate the offset in the proper page.
-        */
-       offset = ((u64)uctxt->sc->hw_free -
-                 (u64)dd->cr_base[uctxt->numa_id].va) % PAGE_SIZE;
-       binfo.sc_credits_addr = HFI1_MMAP_TOKEN(PIO_CRED, uctxt->ctxt,
-                                               fd->subctxt, offset);
-       binfo.pio_bufbase = HFI1_MMAP_TOKEN(PIO_BUFS, uctxt->ctxt,
-                                           fd->subctxt,
-                                           uctxt->sc->base_addr);
-       binfo.pio_bufbase_sop = HFI1_MMAP_TOKEN(PIO_BUFS_SOP,
-                                               uctxt->ctxt,
-                                               fd->subctxt,
-                                               uctxt->sc->base_addr);
-       binfo.rcvhdr_bufbase = HFI1_MMAP_TOKEN(RCV_HDRQ, uctxt->ctxt,
-                                              fd->subctxt,
-                                              uctxt->rcvhdrq);
-       binfo.rcvegr_bufbase = HFI1_MMAP_TOKEN(RCV_EGRBUF, uctxt->ctxt,
-                                              fd->subctxt,
-                                              uctxt->egrbufs.rcvtids[0].phys);
-       binfo.sdma_comp_bufbase = HFI1_MMAP_TOKEN(SDMA_COMP, uctxt->ctxt,
-                                                fd->subctxt, 0);
-       /*
-        * user regs are at
-        * (RXE_PER_CONTEXT_USER + (ctxt * RXE_PER_CONTEXT_SIZE))
-        */
-       binfo.user_regbase = HFI1_MMAP_TOKEN(UREGS, uctxt->ctxt,
-                                           fd->subctxt, 0);
-       offset = offset_in_page((((uctxt->ctxt - dd->first_user_ctxt) *
-                   HFI1_MAX_SHARED_CTXTS) + fd->subctxt) *
-                 sizeof(*dd->events));
-       binfo.events_bufbase = HFI1_MMAP_TOKEN(EVENTS, uctxt->ctxt,
-                                             fd->subctxt,
-                                             offset);
-       binfo.status_bufbase = HFI1_MMAP_TOKEN(STATUS, uctxt->ctxt,
-                                             fd->subctxt,
-                                             dd->status);
-       if (HFI1_CAP_IS_USET(DMA_RTAIL))
-               binfo.rcvhdrtail_base = HFI1_MMAP_TOKEN(RTAIL, uctxt->ctxt,
-                                                      fd->subctxt, 0);
-       if (uctxt->subctxt_cnt) {
-               binfo.subctxt_uregbase = HFI1_MMAP_TOKEN(SUBCTXT_UREGS,
-                                                       uctxt->ctxt,
-                                                       fd->subctxt, 0);
-               binfo.subctxt_rcvhdrbuf = HFI1_MMAP_TOKEN(SUBCTXT_RCV_HDRQ,
-                                                        uctxt->ctxt,
-                                                        fd->subctxt, 0);
-               binfo.subctxt_rcvegrbuf = HFI1_MMAP_TOKEN(SUBCTXT_EGRBUF,
-                                                        uctxt->ctxt,
-                                                        fd->subctxt, 0);
-       }
-       sz = (len < sizeof(binfo)) ? len : sizeof(binfo);
-       if (copy_to_user(ubase, &binfo, sz))
-               ret = -EFAULT;
-       return ret;
-}
-
-static unsigned int poll_urgent(struct file *fp,
-                               struct poll_table_struct *pt)
-{
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct hfi1_devdata *dd = uctxt->dd;
-       unsigned pollflag;
-
-       poll_wait(fp, &uctxt->wait, pt);
-
-       spin_lock_irq(&dd->uctxt_lock);
-       if (uctxt->urgent != uctxt->urgent_poll) {
-               pollflag = POLLIN | POLLRDNORM;
-               uctxt->urgent_poll = uctxt->urgent;
-       } else {
-               pollflag = 0;
-               set_bit(HFI1_CTXT_WAITING_URG, &uctxt->event_flags);
-       }
-       spin_unlock_irq(&dd->uctxt_lock);
-
-       return pollflag;
-}
-
-static unsigned int poll_next(struct file *fp,
-                             struct poll_table_struct *pt)
-{
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct hfi1_devdata *dd = uctxt->dd;
-       unsigned pollflag;
-
-       poll_wait(fp, &uctxt->wait, pt);
-
-       spin_lock_irq(&dd->uctxt_lock);
-       if (hdrqempty(uctxt)) {
-               set_bit(HFI1_CTXT_WAITING_RCV, &uctxt->event_flags);
-               hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_ENB, uctxt->ctxt);
-               pollflag = 0;
-       } else {
-               pollflag = POLLIN | POLLRDNORM;
-       }
-       spin_unlock_irq(&dd->uctxt_lock);
-
-       return pollflag;
-}
-
-/*
- * Find all user contexts in use, and set the specified bit in their
- * event mask.
- * See also find_ctxt() for a similar use, that is specific to send buffers.
- */
-int hfi1_set_uevent_bits(struct hfi1_pportdata *ppd, const int evtbit)
-{
-       struct hfi1_ctxtdata *uctxt;
-       struct hfi1_devdata *dd = ppd->dd;
-       unsigned ctxt;
-       int ret = 0;
-       unsigned long flags;
-
-       if (!dd->events) {
-               ret = -EINVAL;
-               goto done;
-       }
-
-       spin_lock_irqsave(&dd->uctxt_lock, flags);
-       for (ctxt = dd->first_user_ctxt; ctxt < dd->num_rcv_contexts;
-            ctxt++) {
-               uctxt = dd->rcd[ctxt];
-               if (uctxt) {
-                       unsigned long *evs = dd->events +
-                               (uctxt->ctxt - dd->first_user_ctxt) *
-                               HFI1_MAX_SHARED_CTXTS;
-                       int i;
-                       /*
-                        * subctxt_cnt is 0 if not shared, so do base
-                        * separately, first, then remaining subctxt, if any
-                        */
-                       set_bit(evtbit, evs);
-                       for (i = 1; i < uctxt->subctxt_cnt; i++)
-                               set_bit(evtbit, evs + i);
-               }
-       }
-       spin_unlock_irqrestore(&dd->uctxt_lock, flags);
-done:
-       return ret;
-}
-
-/**
- * manage_rcvq - manage a context's receive queue
- * @uctxt: the context
- * @subctxt: the sub-context
- * @start_stop: action to carry out
- *
- * start_stop == 0 disables receive on the context, for use in queue
- * overflow conditions.  start_stop==1 re-enables, to be used to
- * re-init the software copy of the head register
- */
-static int manage_rcvq(struct hfi1_ctxtdata *uctxt, unsigned subctxt,
-                      int start_stop)
-{
-       struct hfi1_devdata *dd = uctxt->dd;
-       unsigned int rcvctrl_op;
-
-       if (subctxt)
-               goto bail;
-       /* atomically clear receive enable ctxt. */
-       if (start_stop) {
-               /*
-                * On enable, force in-memory copy of the tail register to
-                * 0, so that protocol code doesn't have to worry about
-                * whether or not the chip has yet updated the in-memory
-                * copy or not on return from the system call. The chip
-                * always resets it's tail register back to 0 on a
-                * transition from disabled to enabled.
-                */
-               if (uctxt->rcvhdrtail_kvaddr)
-                       clear_rcvhdrtail(uctxt);
-               rcvctrl_op = HFI1_RCVCTRL_CTXT_ENB;
-       } else {
-               rcvctrl_op = HFI1_RCVCTRL_CTXT_DIS;
-       }
-       hfi1_rcvctrl(dd, rcvctrl_op, uctxt->ctxt);
-       /* always; new head should be equal to new tail; see above */
-bail:
-       return 0;
-}
-
-/*
- * clear the event notifier events for this context.
- * User process then performs actions appropriate to bit having been
- * set, if desired, and checks again in future.
- */
-static int user_event_ack(struct hfi1_ctxtdata *uctxt, int subctxt,
-                         unsigned long events)
-{
-       int i;
-       struct hfi1_devdata *dd = uctxt->dd;
-       unsigned long *evs;
-
-       if (!dd->events)
-               return 0;
-
-       evs = dd->events + ((uctxt->ctxt - dd->first_user_ctxt) *
-                           HFI1_MAX_SHARED_CTXTS) + subctxt;
-
-       for (i = 0; i <= _HFI1_MAX_EVENT_BIT; i++) {
-               if (!test_bit(i, &events))
-                       continue;
-               clear_bit(i, evs);
-       }
-       return 0;
-}
-
-static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, unsigned subctxt,
-                        u16 pkey)
-{
-       int ret = -ENOENT, i, intable = 0;
-       struct hfi1_pportdata *ppd = uctxt->ppd;
-       struct hfi1_devdata *dd = uctxt->dd;
-
-       if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY) {
-               ret = -EINVAL;
-               goto done;
-       }
-
-       for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++)
-               if (pkey == ppd->pkeys[i]) {
-                       intable = 1;
-                       break;
-               }
-
-       if (intable)
-               ret = hfi1_set_ctxt_pkey(dd, uctxt->ctxt, pkey);
-done:
-       return ret;
-}
-
-static int ui_open(struct inode *inode, struct file *filp)
-{
-       struct hfi1_devdata *dd;
-
-       dd = container_of(inode->i_cdev, struct hfi1_devdata, ui_cdev);
-       filp->private_data = dd; /* for other methods */
-       return 0;
-}
-
-static int ui_release(struct inode *inode, struct file *filp)
-{
-       /* nothing to do */
-       return 0;
-}
-
-static loff_t ui_lseek(struct file *filp, loff_t offset, int whence)
-{
-       struct hfi1_devdata *dd = filp->private_data;
-
-       return fixed_size_llseek(filp, offset, whence,
-               (dd->kregend - dd->kregbase) + DC8051_DATA_MEM_SIZE);
-}
-
-/* NOTE: assumes unsigned long is 8 bytes */
-static ssize_t ui_read(struct file *filp, char __user *buf, size_t count,
-                      loff_t *f_pos)
-{
-       struct hfi1_devdata *dd = filp->private_data;
-       void __iomem *base = dd->kregbase;
-       unsigned long total, csr_off,
-               barlen = (dd->kregend - dd->kregbase);
-       u64 data;
-
-       /* only read 8 byte quantities */
-       if ((count % 8) != 0)
-               return -EINVAL;
-       /* offset must be 8-byte aligned */
-       if ((*f_pos % 8) != 0)
-               return -EINVAL;
-       /* destination buffer must be 8-byte aligned */
-       if ((unsigned long)buf % 8 != 0)
-               return -EINVAL;
-       /* must be in range */
-       if (*f_pos + count > (barlen + DC8051_DATA_MEM_SIZE))
-               return -EINVAL;
-       /* only set the base if we are not starting past the BAR */
-       if (*f_pos < barlen)
-               base += *f_pos;
-       csr_off = *f_pos;
-       for (total = 0; total < count; total += 8, csr_off += 8) {
-               /* accessing LCB CSRs requires more checks */
-               if (is_lcb_offset(csr_off)) {
-                       if (read_lcb_csr(dd, csr_off, (u64 *)&data))
-                               break; /* failed */
-               }
-               /*
-                * Cannot read ASIC GPIO/QSFP* clear and force CSRs without a
-                * false parity error.  Avoid the whole issue by not reading
-                * them.  These registers are defined as having a read value
-                * of 0.
-                */
-               else if (csr_off == ASIC_GPIO_CLEAR ||
-                        csr_off == ASIC_GPIO_FORCE ||
-                        csr_off == ASIC_QSFP1_CLEAR ||
-                        csr_off == ASIC_QSFP1_FORCE ||
-                        csr_off == ASIC_QSFP2_CLEAR ||
-                        csr_off == ASIC_QSFP2_FORCE)
-                       data = 0;
-               else if (csr_off >= barlen) {
-                       /*
-                        * read_8051_data can read more than just 8 bytes at
-                        * a time. However, folding this into the loop and
-                        * handling the reads in 8 byte increments allows us
-                        * to smoothly transition from chip memory to 8051
-                        * memory.
-                        */
-                       if (read_8051_data(dd,
-                                          (u32)(csr_off - barlen),
-                                          sizeof(data), &data))
-                               break; /* failed */
-               } else
-                       data = readq(base + total);
-               if (put_user(data, (unsigned long __user *)(buf + total)))
-                       break;
-       }
-       *f_pos += total;
-       return total;
-}
-
-/* NOTE: assumes unsigned long is 8 bytes */
-static ssize_t ui_write(struct file *filp, const char __user *buf,
-                       size_t count, loff_t *f_pos)
-{
-       struct hfi1_devdata *dd = filp->private_data;
-       void __iomem *base;
-       unsigned long total, data, csr_off;
-       int in_lcb;
-
-       /* only write 8 byte quantities */
-       if ((count % 8) != 0)
-               return -EINVAL;
-       /* offset must be 8-byte aligned */
-       if ((*f_pos % 8) != 0)
-               return -EINVAL;
-       /* source buffer must be 8-byte aligned */
-       if ((unsigned long)buf % 8 != 0)
-               return -EINVAL;
-       /* must be in range */
-       if (*f_pos + count > dd->kregend - dd->kregbase)
-               return -EINVAL;
-
-       base = (void __iomem *)dd->kregbase + *f_pos;
-       csr_off = *f_pos;
-       in_lcb = 0;
-       for (total = 0; total < count; total += 8, csr_off += 8) {
-               if (get_user(data, (unsigned long __user *)(buf + total)))
-                       break;
-               /* accessing LCB CSRs requires a special procedure */
-               if (is_lcb_offset(csr_off)) {
-                       if (!in_lcb) {
-                               int ret = acquire_lcb_access(dd, 1);
-
-                               if (ret)
-                                       break;
-                               in_lcb = 1;
-                       }
-               } else {
-                       if (in_lcb) {
-                               release_lcb_access(dd, 1);
-                               in_lcb = 0;
-                       }
-               }
-               writeq(data, base + total);
-       }
-       if (in_lcb)
-               release_lcb_access(dd, 1);
-       *f_pos += total;
-       return total;
-}
-
-static const struct file_operations ui_file_ops = {
-       .owner = THIS_MODULE,
-       .llseek = ui_lseek,
-       .read = ui_read,
-       .write = ui_write,
-       .open = ui_open,
-       .release = ui_release,
-};
-
-#define UI_OFFSET 192  /* device minor offset for UI devices */
-static int create_ui = 1;
-
-static struct cdev wildcard_cdev;
-static struct device *wildcard_device;
-
-static atomic_t user_count = ATOMIC_INIT(0);
-
-static void user_remove(struct hfi1_devdata *dd)
-{
-       if (atomic_dec_return(&user_count) == 0)
-               hfi1_cdev_cleanup(&wildcard_cdev, &wildcard_device);
-
-       hfi1_cdev_cleanup(&dd->user_cdev, &dd->user_device);
-       hfi1_cdev_cleanup(&dd->ui_cdev, &dd->ui_device);
-}
-
-static int user_add(struct hfi1_devdata *dd)
-{
-       char name[10];
-       int ret;
-
-       if (atomic_inc_return(&user_count) == 1) {
-               ret = hfi1_cdev_init(0, class_name(), &hfi1_file_ops,
-                                    &wildcard_cdev, &wildcard_device,
-                                    true);
-               if (ret)
-                       goto done;
-       }
-
-       snprintf(name, sizeof(name), "%s_%d", class_name(), dd->unit);
-       ret = hfi1_cdev_init(dd->unit + 1, name, &hfi1_file_ops,
-                            &dd->user_cdev, &dd->user_device,
-                            true);
-       if (ret)
-               goto done;
-
-       if (create_ui) {
-               snprintf(name, sizeof(name),
-                        "%s_ui%d", class_name(), dd->unit);
-               ret = hfi1_cdev_init(dd->unit + UI_OFFSET, name, &ui_file_ops,
-                                    &dd->ui_cdev, &dd->ui_device,
-                                    false);
-               if (ret)
-                       goto done;
-       }
-
-       return 0;
-done:
-       user_remove(dd);
-       return ret;
-}
-
-/*
- * Create per-unit files in /dev
- */
-int hfi1_device_create(struct hfi1_devdata *dd)
-{
-       int r, ret;
-
-       r = user_add(dd);
-       ret = hfi1_diag_add(dd);
-       if (r && !ret)
-               ret = r;
-       return ret;
-}
-
-/*
- * Remove per-unit files in /dev
- * void, core kernel returns no errors for this stuff
- */
-void hfi1_device_remove(struct hfi1_devdata *dd)
-{
-       user_remove(dd);
-       hfi1_diag_remove(dd);
-}
diff --git a/drivers/staging/rdma/hfi1/firmware.c b/drivers/staging/rdma/hfi1/firmware.c
deleted file mode 100644 (file)
index ed680fd..0000000
+++ /dev/null
@@ -1,2056 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/firmware.h>
-#include <linux/mutex.h>
-#include <linux/module.h>
-#include <linux/delay.h>
-#include <linux/crc32.h>
-
-#include "hfi.h"
-#include "trace.h"
-
-/*
- * Make it easy to toggle firmware file name and if it gets loaded by
- * editing the following. This may be something we do while in development
- * but not necessarily something a user would ever need to use.
- */
-#define DEFAULT_FW_8051_NAME_FPGA "hfi_dc8051.bin"
-#define DEFAULT_FW_8051_NAME_ASIC "hfi1_dc8051.fw"
-#define DEFAULT_FW_FABRIC_NAME "hfi1_fabric.fw"
-#define DEFAULT_FW_SBUS_NAME "hfi1_sbus.fw"
-#define DEFAULT_FW_PCIE_NAME "hfi1_pcie.fw"
-#define DEFAULT_PLATFORM_CONFIG_NAME "hfi1_platform.dat"
-#define ALT_FW_8051_NAME_ASIC "hfi1_dc8051_d.fw"
-#define ALT_FW_FABRIC_NAME "hfi1_fabric_d.fw"
-#define ALT_FW_SBUS_NAME "hfi1_sbus_d.fw"
-#define ALT_FW_PCIE_NAME "hfi1_pcie_d.fw"
-
-static uint fw_8051_load = 1;
-static uint fw_fabric_serdes_load = 1;
-static uint fw_pcie_serdes_load = 1;
-static uint fw_sbus_load = 1;
-
-/*
- * Access required in platform.c
- * Maintains state of whether the platform config was fetched via the
- * fallback option
- */
-uint platform_config_load;
-
-/* Firmware file names get set in hfi1_firmware_init() based on the above */
-static char *fw_8051_name;
-static char *fw_fabric_serdes_name;
-static char *fw_sbus_name;
-static char *fw_pcie_serdes_name;
-static char *platform_config_name;
-
-#define SBUS_MAX_POLL_COUNT 100
-#define SBUS_COUNTER(reg, name) \
-       (((reg) >> ASIC_STS_SBUS_COUNTERS_##name##_CNT_SHIFT) & \
-        ASIC_STS_SBUS_COUNTERS_##name##_CNT_MASK)
-
-/*
- * Firmware security header.
- */
-struct css_header {
-       u32 module_type;
-       u32 header_len;
-       u32 header_version;
-       u32 module_id;
-       u32 module_vendor;
-       u32 date;               /* BCD yyyymmdd */
-       u32 size;               /* in DWORDs */
-       u32 key_size;           /* in DWORDs */
-       u32 modulus_size;       /* in DWORDs */
-       u32 exponent_size;      /* in DWORDs */
-       u32 reserved[22];
-};
-
-/* expected field values */
-#define CSS_MODULE_TYPE           0x00000006
-#define CSS_HEADER_LEN    0x000000a1
-#define CSS_HEADER_VERSION 0x00010000
-#define CSS_MODULE_VENDOR  0x00008086
-
-#define KEY_SIZE      256
-#define MU_SIZE                8
-#define EXPONENT_SIZE  4
-
-/* the file itself */
-struct firmware_file {
-       struct css_header css_header;
-       u8 modulus[KEY_SIZE];
-       u8 exponent[EXPONENT_SIZE];
-       u8 signature[KEY_SIZE];
-       u8 firmware[];
-};
-
-struct augmented_firmware_file {
-       struct css_header css_header;
-       u8 modulus[KEY_SIZE];
-       u8 exponent[EXPONENT_SIZE];
-       u8 signature[KEY_SIZE];
-       u8 r2[KEY_SIZE];
-       u8 mu[MU_SIZE];
-       u8 firmware[];
-};
-
-/* augmented file size difference */
-#define AUGMENT_SIZE (sizeof(struct augmented_firmware_file) - \
-                                               sizeof(struct firmware_file))
-
-struct firmware_details {
-       /* Linux core piece */
-       const struct firmware *fw;
-
-       struct css_header *css_header;
-       u8 *firmware_ptr;               /* pointer to binary data */
-       u32 firmware_len;               /* length in bytes */
-       u8 *modulus;                    /* pointer to the modulus */
-       u8 *exponent;                   /* pointer to the exponent */
-       u8 *signature;                  /* pointer to the signature */
-       u8 *r2;                         /* pointer to r2 */
-       u8 *mu;                         /* pointer to mu */
-       struct augmented_firmware_file dummy_header;
-};
-
-/*
- * The mutex protects fw_state, fw_err, and all of the firmware_details
- * variables.
- */
-static DEFINE_MUTEX(fw_mutex);
-enum fw_state {
-       FW_EMPTY,
-       FW_TRY,
-       FW_FINAL,
-       FW_ERR
-};
-
-static enum fw_state fw_state = FW_EMPTY;
-static int fw_err;
-static struct firmware_details fw_8051;
-static struct firmware_details fw_fabric;
-static struct firmware_details fw_pcie;
-static struct firmware_details fw_sbus;
-static const struct firmware *platform_config;
-
-/* flags for turn_off_spicos() */
-#define SPICO_SBUS   0x1
-#define SPICO_FABRIC 0x2
-#define ENABLE_SPICO_SMASK 0x1
-
-/* security block commands */
-#define RSA_CMD_INIT  0x1
-#define RSA_CMD_START 0x2
-
-/* security block status */
-#define RSA_STATUS_IDLE   0x0
-#define RSA_STATUS_ACTIVE 0x1
-#define RSA_STATUS_DONE   0x2
-#define RSA_STATUS_FAILED 0x3
-
-/* RSA engine timeout, in ms */
-#define RSA_ENGINE_TIMEOUT 100 /* ms */
-
-/* hardware mutex timeout, in ms */
-#define HM_TIMEOUT 10 /* ms */
-
-/* 8051 memory access timeout, in us */
-#define DC8051_ACCESS_TIMEOUT 100 /* us */
-
-/* the number of fabric SerDes on the SBus */
-#define NUM_FABRIC_SERDES 4
-
-/* SBus fabric SerDes addresses, one set per HFI */
-static const u8 fabric_serdes_addrs[2][NUM_FABRIC_SERDES] = {
-       { 0x01, 0x02, 0x03, 0x04 },
-       { 0x28, 0x29, 0x2a, 0x2b }
-};
-
-/* SBus PCIe SerDes addresses, one set per HFI */
-static const u8 pcie_serdes_addrs[2][NUM_PCIE_SERDES] = {
-       { 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16,
-         0x18, 0x1a, 0x1c, 0x1e, 0x20, 0x22, 0x24, 0x26 },
-       { 0x2f, 0x31, 0x33, 0x35, 0x37, 0x39, 0x3b, 0x3d,
-         0x3f, 0x41, 0x43, 0x45, 0x47, 0x49, 0x4b, 0x4d }
-};
-
-/* SBus PCIe PCS addresses, one set per HFI */
-const u8 pcie_pcs_addrs[2][NUM_PCIE_SERDES] = {
-       { 0x09, 0x0b, 0x0d, 0x0f, 0x11, 0x13, 0x15, 0x17,
-         0x19, 0x1b, 0x1d, 0x1f, 0x21, 0x23, 0x25, 0x27 },
-       { 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e,
-         0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e }
-};
-
-/* SBus fabric SerDes broadcast addresses, one per HFI */
-static const u8 fabric_serdes_broadcast[2] = { 0xe4, 0xe5 };
-static const u8 all_fabric_serdes_broadcast = 0xe1;
-
-/* SBus PCIe SerDes broadcast addresses, one per HFI */
-const u8 pcie_serdes_broadcast[2] = { 0xe2, 0xe3 };
-static const u8 all_pcie_serdes_broadcast = 0xe0;
-
-/* forwards */
-static void dispose_one_firmware(struct firmware_details *fdet);
-static int load_fabric_serdes_firmware(struct hfi1_devdata *dd,
-                                      struct firmware_details *fdet);
-
-/*
- * Read a single 64-bit value from 8051 data memory.
- *
- * Expects:
- * o caller to have already set up data read, no auto increment
- * o caller to turn off read enable when finished
- *
- * The address argument is a byte offset.  Bits 0:2 in the address are
- * ignored - i.e. the hardware will always do aligned 8-byte reads as if
- * the lower bits are zero.
- *
- * Return 0 on success, -ENXIO on a read error (timeout).
- */
-static int __read_8051_data(struct hfi1_devdata *dd, u32 addr, u64 *result)
-{
-       u64 reg;
-       int count;
-
-       /* start the read at the given address */
-       reg = ((addr & DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK)
-                       << DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT)
-               | DC_DC8051_CFG_RAM_ACCESS_CTRL_READ_ENA_SMASK;
-       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, reg);
-
-       /* wait until ACCESS_COMPLETED is set */
-       count = 0;
-       while ((read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_STATUS)
-                   & DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK)
-                   == 0) {
-               count++;
-               if (count > DC8051_ACCESS_TIMEOUT) {
-                       dd_dev_err(dd, "timeout reading 8051 data\n");
-                       return -ENXIO;
-               }
-               ndelay(10);
-       }
-
-       /* gather the data */
-       *result = read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_RD_DATA);
-
-       return 0;
-}
-
-/*
- * Read 8051 data starting at addr, for len bytes.  Will read in 8-byte chunks.
- * Return 0 on success, -errno on error.
- */
-int read_8051_data(struct hfi1_devdata *dd, u32 addr, u32 len, u64 *result)
-{
-       unsigned long flags;
-       u32 done;
-       int ret = 0;
-
-       spin_lock_irqsave(&dd->dc8051_memlock, flags);
-
-       /* data read set-up, no auto-increment */
-       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, 0);
-
-       for (done = 0; done < len; addr += 8, done += 8, result++) {
-               ret = __read_8051_data(dd, addr, result);
-               if (ret)
-                       break;
-       }
-
-       /* turn off read enable */
-       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, 0);
-
-       spin_unlock_irqrestore(&dd->dc8051_memlock, flags);
-
-       return ret;
-}
-
-/*
- * Write data or code to the 8051 code or data RAM.
- */
-static int write_8051(struct hfi1_devdata *dd, int code, u32 start,
-                     const u8 *data, u32 len)
-{
-       u64 reg;
-       u32 offset;
-       int aligned, count;
-
-       /* check alignment */
-       aligned = ((unsigned long)data & 0x7) == 0;
-
-       /* write set-up */
-       reg = (code ? DC_DC8051_CFG_RAM_ACCESS_SETUP_RAM_SEL_SMASK : 0ull)
-               | DC_DC8051_CFG_RAM_ACCESS_SETUP_AUTO_INCR_ADDR_SMASK;
-       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, reg);
-
-       reg = ((start & DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK)
-                       << DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT)
-               | DC_DC8051_CFG_RAM_ACCESS_CTRL_WRITE_ENA_SMASK;
-       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, reg);
-
-       /* write */
-       for (offset = 0; offset < len; offset += 8) {
-               int bytes = len - offset;
-
-               if (bytes < 8) {
-                       reg = 0;
-                       memcpy(&reg, &data[offset], bytes);
-               } else if (aligned) {
-                       reg = *(u64 *)&data[offset];
-               } else {
-                       memcpy(&reg, &data[offset], 8);
-               }
-               write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_WR_DATA, reg);
-
-               /* wait until ACCESS_COMPLETED is set */
-               count = 0;
-               while ((read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_STATUS)
-                   & DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK)
-                   == 0) {
-                       count++;
-                       if (count > DC8051_ACCESS_TIMEOUT) {
-                               dd_dev_err(dd, "timeout writing 8051 data\n");
-                               return -ENXIO;
-                       }
-                       udelay(1);
-               }
-       }
-
-       /* turn off write access, auto increment (also sets to data access) */
-       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, 0);
-       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, 0);
-
-       return 0;
-}
-
-/* return 0 if values match, non-zero and complain otherwise */
-static int invalid_header(struct hfi1_devdata *dd, const char *what,
-                         u32 actual, u32 expected)
-{
-       if (actual == expected)
-               return 0;
-
-       dd_dev_err(dd,
-                  "invalid firmware header field %s: expected 0x%x, actual 0x%x\n",
-                  what, expected, actual);
-       return 1;
-}
-
-/*
- * Verify that the static fields in the CSS header match.
- */
-static int verify_css_header(struct hfi1_devdata *dd, struct css_header *css)
-{
-       /* verify CSS header fields (most sizes are in DW, so add /4) */
-       if (invalid_header(dd, "module_type", css->module_type,
-                          CSS_MODULE_TYPE) ||
-           invalid_header(dd, "header_len", css->header_len,
-                          (sizeof(struct firmware_file) / 4)) ||
-           invalid_header(dd, "header_version", css->header_version,
-                          CSS_HEADER_VERSION) ||
-           invalid_header(dd, "module_vendor", css->module_vendor,
-                          CSS_MODULE_VENDOR) ||
-           invalid_header(dd, "key_size", css->key_size, KEY_SIZE / 4) ||
-           invalid_header(dd, "modulus_size", css->modulus_size,
-                          KEY_SIZE / 4) ||
-           invalid_header(dd, "exponent_size", css->exponent_size,
-                          EXPONENT_SIZE / 4)) {
-               return -EINVAL;
-       }
-       return 0;
-}
-
-/*
- * Make sure there are at least some bytes after the prefix.
- */
-static int payload_check(struct hfi1_devdata *dd, const char *name,
-                        long file_size, long prefix_size)
-{
-       /* make sure we have some payload */
-       if (prefix_size >= file_size) {
-               dd_dev_err(dd,
-                          "firmware \"%s\", size %ld, must be larger than %ld bytes\n",
-                          name, file_size, prefix_size);
-               return -EINVAL;
-       }
-
-       return 0;
-}
-
-/*
- * Request the firmware from the system.  Extract the pieces and fill in
- * fdet.  If successful, the caller will need to call dispose_one_firmware().
- * Returns 0 on success, -ERRNO on error.
- */
-static int obtain_one_firmware(struct hfi1_devdata *dd, const char *name,
-                              struct firmware_details *fdet)
-{
-       struct css_header *css;
-       int ret;
-
-       memset(fdet, 0, sizeof(*fdet));
-
-       ret = request_firmware(&fdet->fw, name, &dd->pcidev->dev);
-       if (ret) {
-               dd_dev_warn(dd, "cannot find firmware \"%s\", err %d\n",
-                           name, ret);
-               return ret;
-       }
-
-       /* verify the firmware */
-       if (fdet->fw->size < sizeof(struct css_header)) {
-               dd_dev_err(dd, "firmware \"%s\" is too small\n", name);
-               ret = -EINVAL;
-               goto done;
-       }
-       css = (struct css_header *)fdet->fw->data;
-
-       hfi1_cdbg(FIRMWARE, "Firmware %s details:", name);
-       hfi1_cdbg(FIRMWARE, "file size: 0x%lx bytes", fdet->fw->size);
-       hfi1_cdbg(FIRMWARE, "CSS structure:");
-       hfi1_cdbg(FIRMWARE, "  module_type    0x%x", css->module_type);
-       hfi1_cdbg(FIRMWARE, "  header_len     0x%03x (0x%03x bytes)",
-                 css->header_len, 4 * css->header_len);
-       hfi1_cdbg(FIRMWARE, "  header_version 0x%x", css->header_version);
-       hfi1_cdbg(FIRMWARE, "  module_id      0x%x", css->module_id);
-       hfi1_cdbg(FIRMWARE, "  module_vendor  0x%x", css->module_vendor);
-       hfi1_cdbg(FIRMWARE, "  date           0x%x", css->date);
-       hfi1_cdbg(FIRMWARE, "  size           0x%03x (0x%03x bytes)",
-                 css->size, 4 * css->size);
-       hfi1_cdbg(FIRMWARE, "  key_size       0x%03x (0x%03x bytes)",
-                 css->key_size, 4 * css->key_size);
-       hfi1_cdbg(FIRMWARE, "  modulus_size   0x%03x (0x%03x bytes)",
-                 css->modulus_size, 4 * css->modulus_size);
-       hfi1_cdbg(FIRMWARE, "  exponent_size  0x%03x (0x%03x bytes)",
-                 css->exponent_size, 4 * css->exponent_size);
-       hfi1_cdbg(FIRMWARE, "firmware size: 0x%lx bytes",
-                 fdet->fw->size - sizeof(struct firmware_file));
-
-       /*
-        * If the file does not have a valid CSS header, fail.
-        * Otherwise, check the CSS size field for an expected size.
-        * The augmented file has r2 and mu inserted after the header
-        * was generated, so there will be a known difference between
-        * the CSS header size and the actual file size.  Use this
-        * difference to identify an augmented file.
-        *
-        * Note: css->size is in DWORDs, multiply by 4 to get bytes.
-        */
-       ret = verify_css_header(dd, css);
-       if (ret) {
-               dd_dev_info(dd, "Invalid CSS header for \"%s\"\n", name);
-       } else if ((css->size * 4) == fdet->fw->size) {
-               /* non-augmented firmware file */
-               struct firmware_file *ff = (struct firmware_file *)
-                                                       fdet->fw->data;
-
-               /* make sure there are bytes in the payload */
-               ret = payload_check(dd, name, fdet->fw->size,
-                                   sizeof(struct firmware_file));
-               if (ret == 0) {
-                       fdet->css_header = css;
-                       fdet->modulus = ff->modulus;
-                       fdet->exponent = ff->exponent;
-                       fdet->signature = ff->signature;
-                       fdet->r2 = fdet->dummy_header.r2; /* use dummy space */
-                       fdet->mu = fdet->dummy_header.mu; /* use dummy space */
-                       fdet->firmware_ptr = ff->firmware;
-                       fdet->firmware_len = fdet->fw->size -
-                                               sizeof(struct firmware_file);
-                       /*
-                        * Header does not include r2 and mu - generate here.
-                        * For now, fail.
-                        */
-                       dd_dev_err(dd, "driver is unable to validate firmware without r2 and mu (not in firmware file)\n");
-                       ret = -EINVAL;
-               }
-       } else if ((css->size * 4) + AUGMENT_SIZE == fdet->fw->size) {
-               /* augmented firmware file */
-               struct augmented_firmware_file *aff =
-                       (struct augmented_firmware_file *)fdet->fw->data;
-
-               /* make sure there are bytes in the payload */
-               ret = payload_check(dd, name, fdet->fw->size,
-                                   sizeof(struct augmented_firmware_file));
-               if (ret == 0) {
-                       fdet->css_header = css;
-                       fdet->modulus = aff->modulus;
-                       fdet->exponent = aff->exponent;
-                       fdet->signature = aff->signature;
-                       fdet->r2 = aff->r2;
-                       fdet->mu = aff->mu;
-                       fdet->firmware_ptr = aff->firmware;
-                       fdet->firmware_len = fdet->fw->size -
-                                       sizeof(struct augmented_firmware_file);
-               }
-       } else {
-               /* css->size check failed */
-               dd_dev_err(dd,
-                          "invalid firmware header field size: expected 0x%lx or 0x%lx, actual 0x%x\n",
-                          fdet->fw->size / 4,
-                          (fdet->fw->size - AUGMENT_SIZE) / 4,
-                          css->size);
-
-               ret = -EINVAL;
-       }
-
-done:
-       /* if returning an error, clean up after ourselves */
-       if (ret)
-               dispose_one_firmware(fdet);
-       return ret;
-}
-
-static void dispose_one_firmware(struct firmware_details *fdet)
-{
-       release_firmware(fdet->fw);
-       /* erase all previous information */
-       memset(fdet, 0, sizeof(*fdet));
-}
-
-/*
- * Obtain the 4 firmwares from the OS.  All must be obtained at once or not
- * at all.  If called with the firmware state in FW_TRY, use alternate names.
- * On exit, this routine will have set the firmware state to one of FW_TRY,
- * FW_FINAL, or FW_ERR.
- *
- * Must be holding fw_mutex.
- */
-static void __obtain_firmware(struct hfi1_devdata *dd)
-{
-       int err = 0;
-
-       if (fw_state == FW_FINAL)       /* nothing more to obtain */
-               return;
-       if (fw_state == FW_ERR)         /* already in error */
-               return;
-
-       /* fw_state is FW_EMPTY or FW_TRY */
-retry:
-       if (fw_state == FW_TRY) {
-               /*
-                * We tried the original and it failed.  Move to the
-                * alternate.
-                */
-               dd_dev_warn(dd, "using alternate firmware names\n");
-               /*
-                * Let others run.  Some systems, when missing firmware, does
-                * something that holds for 30 seconds.  If we do that twice
-                * in a row it triggers task blocked warning.
-                */
-               cond_resched();
-               if (fw_8051_load)
-                       dispose_one_firmware(&fw_8051);
-               if (fw_fabric_serdes_load)
-                       dispose_one_firmware(&fw_fabric);
-               if (fw_sbus_load)
-                       dispose_one_firmware(&fw_sbus);
-               if (fw_pcie_serdes_load)
-                       dispose_one_firmware(&fw_pcie);
-               fw_8051_name = ALT_FW_8051_NAME_ASIC;
-               fw_fabric_serdes_name = ALT_FW_FABRIC_NAME;
-               fw_sbus_name = ALT_FW_SBUS_NAME;
-               fw_pcie_serdes_name = ALT_FW_PCIE_NAME;
-       }
-
-       if (fw_sbus_load) {
-               err = obtain_one_firmware(dd, fw_sbus_name, &fw_sbus);
-               if (err)
-                       goto done;
-       }
-
-       if (fw_pcie_serdes_load) {
-               err = obtain_one_firmware(dd, fw_pcie_serdes_name, &fw_pcie);
-               if (err)
-                       goto done;
-       }
-
-       if (fw_fabric_serdes_load) {
-               err = obtain_one_firmware(dd, fw_fabric_serdes_name,
-                                         &fw_fabric);
-               if (err)
-                       goto done;
-       }
-
-       if (fw_8051_load) {
-               err = obtain_one_firmware(dd, fw_8051_name, &fw_8051);
-               if (err)
-                       goto done;
-       }
-
-done:
-       if (err) {
-               /* oops, had problems obtaining a firmware */
-               if (fw_state == FW_EMPTY && dd->icode == ICODE_RTL_SILICON) {
-                       /* retry with alternate (RTL only) */
-                       fw_state = FW_TRY;
-                       goto retry;
-               }
-               dd_dev_err(dd, "unable to obtain working firmware\n");
-               fw_state = FW_ERR;
-               fw_err = -ENOENT;
-       } else {
-               /* success */
-               if (fw_state == FW_EMPTY &&
-                   dd->icode != ICODE_FUNCTIONAL_SIMULATOR)
-                       fw_state = FW_TRY;      /* may retry later */
-               else
-                       fw_state = FW_FINAL;    /* cannot try again */
-       }
-}
-
-/*
- * Called by all HFIs when loading their firmware - i.e. device probe time.
- * The first one will do the actual firmware load.  Use a mutex to resolve
- * any possible race condition.
- *
- * The call to this routine cannot be moved to driver load because the kernel
- * call request_firmware() requires a device which is only available after
- * the first device probe.
- */
-static int obtain_firmware(struct hfi1_devdata *dd)
-{
-       unsigned long timeout;
-       int err = 0;
-
-       mutex_lock(&fw_mutex);
-
-       /* 40s delay due to long delay on missing firmware on some systems */
-       timeout = jiffies + msecs_to_jiffies(40000);
-       while (fw_state == FW_TRY) {
-               /*
-                * Another device is trying the firmware.  Wait until it
-                * decides what works (or not).
-                */
-               if (time_after(jiffies, timeout)) {
-                       /* waited too long */
-                       dd_dev_err(dd, "Timeout waiting for firmware try");
-                       fw_state = FW_ERR;
-                       fw_err = -ETIMEDOUT;
-                       break;
-               }
-               mutex_unlock(&fw_mutex);
-               msleep(20);     /* arbitrary delay */
-               mutex_lock(&fw_mutex);
-       }
-       /* not in FW_TRY state */
-
-       if (fw_state == FW_FINAL) {
-               if (platform_config) {
-                       dd->platform_config.data = platform_config->data;
-                       dd->platform_config.size = platform_config->size;
-               }
-               goto done;      /* already acquired */
-       } else if (fw_state == FW_ERR) {
-               goto done;      /* already tried and failed */
-       }
-       /* fw_state is FW_EMPTY */
-
-       /* set fw_state to FW_TRY, FW_FINAL, or FW_ERR, and fw_err */
-       __obtain_firmware(dd);
-
-       if (platform_config_load) {
-               platform_config = NULL;
-               err = request_firmware(&platform_config, platform_config_name,
-                                      &dd->pcidev->dev);
-               if (err) {
-                       platform_config = NULL;
-                       goto done;
-               }
-               dd->platform_config.data = platform_config->data;
-               dd->platform_config.size = platform_config->size;
-       }
-
-done:
-       mutex_unlock(&fw_mutex);
-
-       return fw_err;
-}
-
-/*
- * Called when the driver unloads.  The timing is asymmetric with its
- * counterpart, obtain_firmware().  If called at device remove time,
- * then it is conceivable that another device could probe while the
- * firmware is being disposed.  The mutexes can be moved to do that
- * safely, but then the firmware would be requested from the OS multiple
- * times.
- *
- * No mutex is needed as the driver is unloading and there cannot be any
- * other callers.
- */
-void dispose_firmware(void)
-{
-       dispose_one_firmware(&fw_8051);
-       dispose_one_firmware(&fw_fabric);
-       dispose_one_firmware(&fw_pcie);
-       dispose_one_firmware(&fw_sbus);
-
-       release_firmware(platform_config);
-       platform_config = NULL;
-
-       /* retain the error state, otherwise revert to empty */
-       if (fw_state != FW_ERR)
-               fw_state = FW_EMPTY;
-}
-
-/*
- * Called with the result of a firmware download.
- *
- * Return 1 to retry loading the firmware, 0 to stop.
- */
-static int retry_firmware(struct hfi1_devdata *dd, int load_result)
-{
-       int retry;
-
-       mutex_lock(&fw_mutex);
-
-       if (load_result == 0) {
-               /*
-                * The load succeeded, so expect all others to do the same.
-                * Do not retry again.
-                */
-               if (fw_state == FW_TRY)
-                       fw_state = FW_FINAL;
-               retry = 0;      /* do NOT retry */
-       } else if (fw_state == FW_TRY) {
-               /* load failed, obtain alternate firmware */
-               __obtain_firmware(dd);
-               retry = (fw_state == FW_FINAL);
-       } else {
-               /* else in FW_FINAL or FW_ERR, no retry in either case */
-               retry = 0;
-       }
-
-       mutex_unlock(&fw_mutex);
-       return retry;
-}
-
-/*
- * Write a block of data to a given array CSR.  All calls will be in
- * multiples of 8 bytes.
- */
-static void write_rsa_data(struct hfi1_devdata *dd, int what,
-                          const u8 *data, int nbytes)
-{
-       int qw_size = nbytes / 8;
-       int i;
-
-       if (((unsigned long)data & 0x7) == 0) {
-               /* aligned */
-               u64 *ptr = (u64 *)data;
-
-               for (i = 0; i < qw_size; i++, ptr++)
-                       write_csr(dd, what + (8 * i), *ptr);
-       } else {
-               /* not aligned */
-               for (i = 0; i < qw_size; i++, data += 8) {
-                       u64 value;
-
-                       memcpy(&value, data, 8);
-                       write_csr(dd, what + (8 * i), value);
-               }
-       }
-}
-
-/*
- * Write a block of data to a given CSR as a stream of writes.  All calls will
- * be in multiples of 8 bytes.
- */
-static void write_streamed_rsa_data(struct hfi1_devdata *dd, int what,
-                                   const u8 *data, int nbytes)
-{
-       u64 *ptr = (u64 *)data;
-       int qw_size = nbytes / 8;
-
-       for (; qw_size > 0; qw_size--, ptr++)
-               write_csr(dd, what, *ptr);
-}
-
-/*
- * Download the signature and start the RSA mechanism.  Wait for
- * RSA_ENGINE_TIMEOUT before giving up.
- */
-static int run_rsa(struct hfi1_devdata *dd, const char *who,
-                  const u8 *signature)
-{
-       unsigned long timeout;
-       u64 reg;
-       u32 status;
-       int ret = 0;
-
-       /* write the signature */
-       write_rsa_data(dd, MISC_CFG_RSA_SIGNATURE, signature, KEY_SIZE);
-
-       /* initialize RSA */
-       write_csr(dd, MISC_CFG_RSA_CMD, RSA_CMD_INIT);
-
-       /*
-        * Make sure the engine is idle and insert a delay between the two
-        * writes to MISC_CFG_RSA_CMD.
-        */
-       status = (read_csr(dd, MISC_CFG_FW_CTRL)
-                          & MISC_CFG_FW_CTRL_RSA_STATUS_SMASK)
-                            >> MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT;
-       if (status != RSA_STATUS_IDLE) {
-               dd_dev_err(dd, "%s security engine not idle - giving up\n",
-                          who);
-               return -EBUSY;
-       }
-
-       /* start RSA */
-       write_csr(dd, MISC_CFG_RSA_CMD, RSA_CMD_START);
-
-       /*
-        * Look for the result.
-        *
-        * The RSA engine is hooked up to two MISC errors.  The driver
-        * masks these errors as they do not respond to the standard
-        * error "clear down" mechanism.  Look for these errors here and
-        * clear them when possible.  This routine will exit with the
-        * errors of the current run still set.
-        *
-        * MISC_FW_AUTH_FAILED_ERR
-        *      Firmware authorization failed.  This can be cleared by
-        *      re-initializing the RSA engine, then clearing the status bit.
-        *      Do not re-init the RSA angine immediately after a successful
-        *      run - this will reset the current authorization.
-        *
-        * MISC_KEY_MISMATCH_ERR
-        *      Key does not match.  The only way to clear this is to load
-        *      a matching key then clear the status bit.  If this error
-        *      is raised, it will persist outside of this routine until a
-        *      matching key is loaded.
-        */
-       timeout = msecs_to_jiffies(RSA_ENGINE_TIMEOUT) + jiffies;
-       while (1) {
-               status = (read_csr(dd, MISC_CFG_FW_CTRL)
-                          & MISC_CFG_FW_CTRL_RSA_STATUS_SMASK)
-                            >> MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT;
-
-               if (status == RSA_STATUS_IDLE) {
-                       /* should not happen */
-                       dd_dev_err(dd, "%s firmware security bad idle state\n",
-                                  who);
-                       ret = -EINVAL;
-                       break;
-               } else if (status == RSA_STATUS_DONE) {
-                       /* finished successfully */
-                       break;
-               } else if (status == RSA_STATUS_FAILED) {
-                       /* finished unsuccessfully */
-                       ret = -EINVAL;
-                       break;
-               }
-               /* else still active */
-
-               if (time_after(jiffies, timeout)) {
-                       /*
-                        * Timed out while active.  We can't reset the engine
-                        * if it is stuck active, but run through the
-                        * error code to see what error bits are set.
-                        */
-                       dd_dev_err(dd, "%s firmware security time out\n", who);
-                       ret = -ETIMEDOUT;
-                       break;
-               }
-
-               msleep(20);
-       }
-
-       /*
-        * Arrive here on success or failure.  Clear all RSA engine
-        * errors.  All current errors will stick - the RSA logic is keeping
-        * error high.  All previous errors will clear - the RSA logic
-        * is not keeping the error high.
-        */
-       write_csr(dd, MISC_ERR_CLEAR,
-                 MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK |
-                 MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK);
-       /*
-        * All that is left are the current errors.  Print warnings on
-        * authorization failure details, if any.  Firmware authorization
-        * can be retried, so these are only warnings.
-        */
-       reg = read_csr(dd, MISC_ERR_STATUS);
-       if (ret) {
-               if (reg & MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK)
-                       dd_dev_warn(dd, "%s firmware authorization failed\n",
-                                   who);
-               if (reg & MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK)
-                       dd_dev_warn(dd, "%s firmware key mismatch\n", who);
-       }
-
-       return ret;
-}
-
-/*
- * Program the security variables for one firmware image: the modulus, r2
- * and mu values from fdet are written to their MISC_CFG_RSA_* registers,
- * then the CSS header is streamed to MISC_CFG_SHA_PRELOAD.
- */
-static void load_security_variables(struct hfi1_devdata *dd,
-                                   struct firmware_details *fdet)
-{
-       u8 *css_bytes = (u8 *)fdet->css_header;
-
-       /* a. modulus */
-       write_rsa_data(dd, MISC_CFG_RSA_MODULUS, fdet->modulus, KEY_SIZE);
-       /* b. r2 */
-       write_rsa_data(dd, MISC_CFG_RSA_R2, fdet->r2, KEY_SIZE);
-       /* c. mu */
-       write_rsa_data(dd, MISC_CFG_RSA_MU, fdet->mu, MU_SIZE);
-       /* d. CSS header, written with the streamed variant */
-       write_streamed_rsa_data(dd, MISC_CFG_SHA_PRELOAD, css_bytes,
-                               sizeof(struct css_header));
-}
-
-/* Read DC_DC8051_STS_CUR_STATE and return only its firmware field. */
-static inline u32 get_firmware_state(struct hfi1_devdata *dd)
-{
-       u64 cur_state;
-
-       cur_state = read_csr(dd, DC_DC8051_STS_CUR_STATE);
-       cur_state >>= DC_DC8051_STS_CUR_STATE_FIRMWARE_SHIFT;
-       cur_state &= DC_DC8051_STS_CUR_STATE_FIRMWARE_MASK;
-
-       return cur_state;
-}
-
-/*
- * Poll the 8051 firmware state until it reports ready (0xa0) or mstimeout
- * milliseconds have elapsed.  The functional simulator is treated as
- * always ready.
- *
- * Return 0 when ready, -ETIMEDOUT otherwise.
- */
-int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout)
-{
-       unsigned long deadline;
-
-       /* the simulator's fake 8051 never needs waiting on */
-       if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
-               return 0;
-
-       deadline = jiffies + msecs_to_jiffies(mstimeout);
-       while (get_firmware_state(dd) != 0xa0) {
-               /* not ready yet - give up once the deadline has passed */
-               if (time_after(jiffies, deadline))
-                       return -ETIMEDOUT;
-               usleep_range(1950, 2050); /* roughly 2ms between polls */
-       }
-
-       return 0;
-}
-
-/*
- * Load the 8051 firmware.
- *
- * Holds the 8051 in reset, loads the security variables and the firmware
- * image described by fdet, runs the RSA check on the image signature,
- * releases the 8051 and waits for it to report ready.  On success the
- * firmware version is read back, logged and cached in dd->dc8051_ver.
- *
- * Return 0 on success, the error from write_8051() or run_rsa() if either
- * fails, or -ETIMEDOUT if the firmware does not become ready within
- * TIMEOUT_8051_START.
- */
-static int load_8051_firmware(struct hfi1_devdata *dd,
-                             struct firmware_details *fdet)
-{
-       u64 reg;
-       int ret;
-       u8 ver_a, ver_b;
-
-       /*
-        * DC Reset sequence
-        * Load DC 8051 firmware
-        */
-       /*
-        * DC reset step 1: Reset DC8051
-        * (assert every reset bit: core, CRAM, DRAM, IRAM and SFR)
-        */
-       reg = DC_DC8051_CFG_RST_M8051W_SMASK
-               | DC_DC8051_CFG_RST_CRAM_SMASK
-               | DC_DC8051_CFG_RST_DRAM_SMASK
-               | DC_DC8051_CFG_RST_IRAM_SMASK
-               | DC_DC8051_CFG_RST_SFR_SMASK;
-       write_csr(dd, DC_DC8051_CFG_RST, reg);
-
-       /*
-        * DC reset step 2 (optional): Load 8051 data memory with link
-        * configuration
-        * (nothing is done for this step here)
-        */
-
-       /*
-        * DC reset step 3: Load DC8051 firmware
-        */
-       /* release all but the core reset */
-       reg = DC_DC8051_CFG_RST_M8051W_SMASK;
-       write_csr(dd, DC_DC8051_CFG_RST, reg);
-
-       /* Firmware load step 1 */
-       load_security_variables(dd, fdet);
-
-       /*
-        * Firmware load step 2.  Clear MISC_CFG_FW_CTRL.FW_8051_LOADED
-        */
-       write_csr(dd, MISC_CFG_FW_CTRL, 0);
-
-       /* Firmware load steps 3-5 */
-       ret = write_8051(dd, 1/*code*/, 0, fdet->firmware_ptr,
-                        fdet->firmware_len);
-       if (ret)
-               return ret;
-
-       /*
-        * DC reset step 4. Host starts the DC8051 firmware
-        */
-       /*
-        * Firmware load step 6.  Set MISC_CFG_FW_CTRL.FW_8051_LOADED
-        */
-       write_csr(dd, MISC_CFG_FW_CTRL, MISC_CFG_FW_CTRL_FW_8051_LOADED_SMASK);
-
-       /* Firmware load steps 7-10 */
-       ret = run_rsa(dd, "8051", fdet->signature);
-       if (ret)
-               return ret;
-
-       /* clear all reset bits, releasing the 8051 */
-       write_csr(dd, DC_DC8051_CFG_RST, 0ull);
-
-       /*
-        * DC reset step 5. Wait for firmware to be ready to accept host
-        * requests.
-        */
-       ret = wait_fm_ready(dd, TIMEOUT_8051_START);
-       if (ret) { /* timed out */
-               dd_dev_err(dd, "8051 start timeout, current state 0x%x\n",
-                          get_firmware_state(dd));
-               return -ETIMEDOUT;
-       }
-
-       /* firmware is up: report its version and remember it in dd */
-       read_misc_status(dd, &ver_a, &ver_b);
-       dd_dev_info(dd, "8051 firmware version %d.%d\n",
-                   (int)ver_b, (int)ver_a);
-       dd->dc8051_ver = dc8051_ver(ver_b, ver_a);
-
-       return 0;
-}
-
-/*
- * Compose an SBus request from its four fields and write it to
- * ASIC_CFG_SBUS_REQUEST.
- *
- * Each argument is placed at its *_SHIFT position; no masking is applied
- * because the argument types are sized exactly for their fields.
- */
-void sbus_request(struct hfi1_devdata *dd,
-                 u8 receiver_addr, u8 data_addr, u8 command, u32 data_in)
-{
-       u64 request;
-
-       request = (u64)receiver_addr <<
-                       ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT;
-       request |= (u64)data_addr << ASIC_CFG_SBUS_REQUEST_DATA_ADDR_SHIFT;
-       request |= (u64)command << ASIC_CFG_SBUS_REQUEST_COMMAND_SHIFT;
-       request |= (u64)data_in << ASIC_CFG_SBUS_REQUEST_DATA_IN_SHIFT;
-
-       write_csr(dd, ASIC_CFG_SBUS_REQUEST, request);
-}
-
-/*
- * Disable the SBus and/or fabric serdes spicos, as selected by flags
- * (SPICO_SBUS, SPICO_FABRIC).  Does nothing unless is_ax(dd) is true.
- *
- * Caller requirements:
- * + SBus fast mode must be turned on.
- * + The fabric serdes broadcast must already be set up.
- * + The 8051 must not be loaded yet - MISC_CFG_FW_CTRL is written here
- *   on that assumption.
- */
-static void turn_off_spicos(struct hfi1_devdata *dd, int flags)
-{
-       const char *sbus_tag, *fabric_tag;
-
-       /* only needed on A0 */
-       if (!is_ax(dd))
-               return;
-
-       sbus_tag = (flags & SPICO_SBUS) ? " SBus" : "";
-       fabric_tag = (flags & SPICO_FABRIC) ? " fabric" : "";
-       dd_dev_info(dd, "Turning off spicos:%s%s\n", sbus_tag, fabric_tag);
-
-       /* bracket the SBus writes with ENABLE_SPICO set in FW_CTRL */
-       write_csr(dd, MISC_CFG_FW_CTRL, ENABLE_SPICO_SMASK);
-
-       if (flags & SPICO_SBUS) {
-               /* SBus spico off */
-               sbus_request(dd, SBUS_MASTER_BROADCAST, 0x01,
-                            WRITE_SBUS_RECEIVER, 0x00000040);
-       }
-
-       if (flags & SPICO_FABRIC) {
-               /* fabric serdes spicos off */
-               sbus_request(dd, fabric_serdes_broadcast[dd->hfi1_id],
-                            0x07, WRITE_SBUS_RECEIVER, 0x00000000);
-       }
-
-       write_csr(dd, MISC_CFG_FW_CTRL, 0);
-}
-
-/*
- * Reset all of the fabric serdes for this HFI in preparation to take the
- * link to Polling.
- *
- * To do a reset, we need to write to the serdes registers.  Unfortunately,
- * the fabric serdes download to the other HFI on the ASIC will have turned
- * off the firmware validation on this HFI.  This means we can't write to the
- * registers to reset the serdes.  Work around this by performing a complete
- * re-download and validation of the fabric serdes firmware.  This, as a
- * by-product, will reset the serdes.  NOTE: the re-download requires that
- * the 8051 be in the Offline state.  I.e. not actively trying to use the
- * serdes.  This routine is called at the point where the link is Offline and
- * is getting ready to go to Polling.
- *
- * Does nothing when fw_fabric_serdes_load is clear.  The CR_SBUS chip
- * resource is held, and SBus fast mode is enabled, for the duration of the
- * reset; if the resource cannot be acquired an error is logged and the
- * reset is skipped.
- */
-void fabric_serdes_reset(struct hfi1_devdata *dd)
-{
-       int ret;
-
-       if (!fw_fabric_serdes_load)
-               return;
-
-       ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT);
-       if (ret) {
-               dd_dev_err(dd,
-                          "Cannot acquire SBus resource to reset fabric SerDes - perhaps you should reboot\n");
-               return;
-       }
-       set_sbus_fast_mode(dd);
-
-       if (is_ax(dd)) {
-               /* A0 serdes do not work with a re-download */
-               u8 ra = fabric_serdes_broadcast[dd->hfi1_id];
-
-               /* place SerDes in reset and disable SPICO */
-               sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000011);
-               /* wait 100 refclk cycles @ 156.25MHz => 640ns */
-               udelay(1);
-               /* remove SerDes reset */
-               sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000010);
-               /* turn SPICO enable on */
-               sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000002);
-       } else {
-               turn_off_spicos(dd, SPICO_FABRIC);
-               /*
-                * NOTE(review): turn_off_spicos() returns immediately when
-                * !is_ax(dd), so the call above appears to have no effect
-                * in this branch - confirm before relying on it.
-                *
-                * No need for firmware retry - what to download has already
-                * been decided.
-                * No need to pay attention to the load return - the only
-                * failure is a validation failure, which has already been
-                * checked by the initial download.
-                */
-               (void)load_fabric_serdes_firmware(dd, &fw_fabric);
-       }
-
-       clear_sbus_fast_mode(dd);
-       release_chip_resource(dd, CR_SBUS);
-}
-
-/*
- * Issue a single SBus request with fast mode cleared and wait for it to
- * complete.
- *
- * The request is written, EXECUTE is set, and the result register is polled
- * (1us apart, up to SBUS_MAX_POLL_COUNT times) for DONE and RCV_DATA_VALID.
- * EXECUTE is then cleared and DONE is polled until it drops.
- *
- * Return 0 on success, -ETIMEDOUT if the request does not complete, or
- * -ETIME if DONE does not clear after EXECUTE is cleared.
- *
- * Access to the SBus in this routine should probably be serialized
- */
-int sbus_request_slow(struct hfi1_devdata *dd,
-                     u8 receiver_addr, u8 data_addr, u8 command, u32 data_in)
-{
-       u64 reg, count = 0;
-
-       /* make sure fast mode is clear */
-       clear_sbus_fast_mode(dd);
-
-       sbus_request(dd, receiver_addr, data_addr, command, data_in);
-       write_csr(dd, ASIC_CFG_SBUS_EXECUTE,
-                 ASIC_CFG_SBUS_EXECUTE_EXECUTE_SMASK);
-       /* Wait for both DONE and RCV_DATA_VALID to go high */
-       reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
-       while (!((reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) &&
-                (reg & ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK))) {
-               if (count++ >= SBUS_MAX_POLL_COUNT) {
-                       u64 counts = read_csr(dd, ASIC_STS_SBUS_COUNTERS);
-                       /*
-                        * If the loop has timed out, we are OK if DONE bit
-                        * is set and RCV_DATA_VALID and EXECUTE counters
-                        * are the same. If not, we cannot proceed.
-                        */
-                       if ((reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) &&
-                           (SBUS_COUNTER(counts, RCV_DATA_VALID) ==
-                            SBUS_COUNTER(counts, EXECUTE)))
-                               break;
-                       return -ETIMEDOUT;
-               }
-               udelay(1);
-               reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
-       }
-       /* restart the poll budget for the second wait */
-       count = 0;
-       write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0);
-       /* Wait for DONE to clear after EXECUTE is cleared */
-       reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
-       while (reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) {
-               if (count++ >= SBUS_MAX_POLL_COUNT)
-                       return -ETIME;
-               udelay(1);
-               reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
-       }
-       return 0;
-}
-
-/*
- * Download the fabric SerDes firmware image in fdet over the SBus, using
- * this HFI's entry in fabric_serdes_broadcast[] as the receiver address,
- * then run the RSA step on the image signature.
- *
- * Return 0 on success or the error returned by run_rsa().
- */
-static int load_fabric_serdes_firmware(struct hfi1_devdata *dd,
-                                      struct firmware_details *fdet)
-{
-       const u8 ra = fabric_serdes_broadcast[dd->hfi1_id]; /* receiver addr */
-       int offset, ret;
-       u32 word;
-
-       dd_dev_info(dd, "Downloading fabric firmware\n");
-
-       /* step 1: load security variables */
-       load_security_variables(dd, fdet);
-
-       /* step 2: place SerDes in reset and disable SPICO */
-       sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000011);
-       /* wait 100 refclk cycles @ 156.25MHz => 640ns */
-       udelay(1);
-
-       /* step 3: remove SerDes reset */
-       sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000010);
-
-       /* step 4: assert IMEM override */
-       sbus_request(dd, ra, 0x00, WRITE_SBUS_RECEIVER, 0x40000000);
-
-       /* step 5: download SerDes machine code, one 32-bit word at a time */
-       offset = 0;
-       while (offset < fdet->firmware_len) {
-               word = *(u32 *)&fdet->firmware_ptr[offset];
-               sbus_request(dd, ra, 0x0a, WRITE_SBUS_RECEIVER, word);
-               offset += 4;
-       }
-
-       /* step 6: IMEM override off */
-       sbus_request(dd, ra, 0x00, WRITE_SBUS_RECEIVER, 0x00000000);
-
-       /* step 7: turn ECC on */
-       sbus_request(dd, ra, 0x0b, WRITE_SBUS_RECEIVER, 0x000c0000);
-
-       /* steps 8-11: run the RSA engine */
-       ret = run_rsa(dd, "fabric serdes", fdet->signature);
-       if (ret)
-               return ret;
-
-       /* step 12: turn SPICO enable on */
-       sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000002);
-
-       /* step 13: enable core hardware interrupts */
-       sbus_request(dd, ra, 0x08, WRITE_SBUS_RECEIVER, 0x00000000);
-
-       return 0;
-}
-
-/*
- * Download the SBus Master firmware image in fdet to the
- * SBUS_MASTER_BROADCAST receiver, then run the RSA step on the image
- * signature.
- *
- * Return 0 on success or the error returned by run_rsa().
- */
-static int load_sbus_firmware(struct hfi1_devdata *dd,
-                             struct firmware_details *fdet)
-{
-       const u8 ra = SBUS_MASTER_BROADCAST; /* receiver address */
-       int offset, ret;
-       u32 word;
-
-       dd_dev_info(dd, "Downloading SBus firmware\n");
-
-       /* step 1: load security variables */
-       load_security_variables(dd, fdet);
-
-       /* step 2: place SPICO into reset and enable off */
-       sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x000000c0);
-
-       /* step 3: remove reset, enable off, IMEM_CNTRL_EN on */
-       sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000240);
-
-       /* step 4: set starting IMEM address for burst download */
-       sbus_request(dd, ra, 0x03, WRITE_SBUS_RECEIVER, 0x80000000);
-
-       /* step 5: download the SBus Master machine code, word by word */
-       offset = 0;
-       while (offset < fdet->firmware_len) {
-               word = *(u32 *)&fdet->firmware_ptr[offset];
-               sbus_request(dd, ra, 0x14, WRITE_SBUS_RECEIVER, word);
-               offset += 4;
-       }
-
-       /* step 6: set IMEM_CNTL_EN off */
-       sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000040);
-
-       /* step 7: turn ECC on */
-       sbus_request(dd, ra, 0x16, WRITE_SBUS_RECEIVER, 0x000c0000);
-
-       /* steps 8-11: run the RSA engine */
-       ret = run_rsa(dd, "SBus", fdet->signature);
-       if (ret)
-               return ret;
-
-       /* step 12: set SPICO_ENABLE on */
-       sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000140);
-
-       return 0;
-}
-
-/*
- * Download the PCIe SerDes firmware image in fdet into the SBus Master
- * XDMEM via the SBUS_MASTER_BROADCAST receiver, then run the RSA step on
- * the image signature.
- *
- * Return the result of run_rsa().
- */
-static int load_pcie_serdes_firmware(struct hfi1_devdata *dd,
-                                    struct firmware_details *fdet)
-{
-       const u8 ra = SBUS_MASTER_BROADCAST; /* receiver address */
-       int offset;
-       u32 word;
-
-       dd_dev_info(dd, "Downloading PCIe firmware\n");
-
-       /* step 1: load security variables */
-       load_security_variables(dd, fdet);
-
-       /* step 2: assert single step (halts the SBus Master spico) */
-       sbus_request(dd, ra, 0x05, WRITE_SBUS_RECEIVER, 0x00000001);
-
-       /* step 3: enable XDMEM access */
-       sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000d40);
-
-       /*
-        * step 4: load firmware into SBus Master XDMEM
-        *
-        * The dmem address, write_en, and wdata are all pre-packed in the
-        * image, so each 32-bit word is simply picked up and written.
-        */
-       offset = 0;
-       while (offset < fdet->firmware_len) {
-               word = *(u32 *)&fdet->firmware_ptr[offset];
-               sbus_request(dd, ra, 0x04, WRITE_SBUS_RECEIVER, word);
-               offset += 4;
-       }
-
-       /* step 5: disable XDMEM access */
-       sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000140);
-
-       /* step 6: allow SBus Spico to run */
-       sbus_request(dd, ra, 0x05, WRITE_SBUS_RECEIVER, 0x00000000);
-
-       /*
-        * steps 7-11: run RSA, if it succeeds, firmware is available to
-        * be swapped
-        */
-       return run_rsa(dd, "PCIe serdes", fdet->signature);
-}
-
-/*
- * Write the broadcast groups bg1 and bg2 to register 0xfd of every SBus
- * device in addrs[0..count-1], walking the list from last to first.
- *
- * Register 0xfd layout:
- *      bits    what
- *      -----   ---------------------------------
- *        0     IGNORE_BROADCAST  (default 0)
- *      11:4    BROADCAST_GROUP_1 (default 0xff)
- *      23:16   BROADCAST_GROUP_2 (default 0xff)
- *
- * Only the two group fields are set; everything else is left at its
- * default.  The register is deliberately not read-modify-written, per
- * instruction from the manufacturer.
- */
-static void set_serdes_broadcast(struct hfi1_devdata *dd, u8 bg1, u8 bg2,
-                                const u8 *addrs, int count)
-{
-       const u32 groups = ((u32)bg1 << 4) | ((u32)bg2 << 16);
-       int idx;
-
-       for (idx = count - 1; idx >= 0; idx--)
-               sbus_request(dd, addrs[idx], 0xfd, WRITE_SBUS_RECEIVER,
-                            groups);
-}
-
-/*
- * Try to take the hardware mutex in ASIC_CFG_MUTEX for this HFI.
- *
- * The HFI's mask (1 << hfi1_id) is written and read back every 20ms for
- * up to HM_TIMEOUT milliseconds; the mutex is held once the read-back
- * equals the mask.  If the first round times out, the mutex is forcibly
- * cleared and one more round is attempted.
- *
- * Return 0 on success, -EBUSY if both rounds time out.
- */
-int acquire_hw_mutex(struct hfi1_devdata *dd)
-{
-       unsigned long deadline;
-       u8 mask = 1 << dd->hfi1_id;
-       u8 holder;
-       int attempt;
-
-       for (attempt = 0; attempt < 2; attempt++) {
-               deadline = msecs_to_jiffies(HM_TIMEOUT) + jiffies;
-               for (;;) {
-                       write_csr(dd, ASIC_CFG_MUTEX, mask);
-                       holder = (u8)read_csr(dd, ASIC_CFG_MUTEX);
-                       if (holder == mask)
-                               return 0; /* we own it */
-                       if (time_after(jiffies, deadline))
-                               break;
-                       msleep(20);
-               }
-
-               /* this round timed out */
-               dd_dev_err(dd,
-                          "Unable to acquire hardware mutex, mutex mask %u, my mask %u (%s)\n",
-                          (u32)holder, (u32)mask,
-                          (attempt == 0) ? "retrying" : "giving up");
-
-               /* break the mutex before the second and last round */
-               if (attempt == 0)
-                       write_csr(dd, ASIC_CFG_MUTEX, 0);
-       }
-
-       return -EBUSY;
-}
-
-/* Release the hardware mutex by writing 0 to ASIC_CFG_MUTEX. */
-void release_hw_mutex(struct hfi1_devdata *dd)
-{
-       write_csr(dd, ASIC_CFG_MUTEX, 0);
-}
-
-/*
- * Return the given resource bit(s) as a 64-bit mask for the given HFI:
- * unshifted for HFI 0, shifted left by CR_DYN_SHIFT for any other id.
- */
-static inline u64 resource_mask(u32 hfi1_id, u32 resource)
-{
-       u64 mask = resource;
-
-       if (hfi1_id)
-               mask <<= CR_DYN_SHIFT;
-
-       return mask;
-}
-
-/*
- * Log the standard "hardware mutex stuck" error, prefixed with the name
- * of the function (func) that failed to acquire the mutex.
- */
-static void fail_mutex_acquire_message(struct hfi1_devdata *dd,
-                                      const char *func)
-{
-       dd_dev_err(dd,
-                  "%s: hardware mutex stuck - suggest rebooting the machine\n",
-                  func);
-}
-
-/*
- * Make one attempt to claim a chip resource in ASIC_CFG_SCRATCH.
- *
- * The driver-level asic_resource_mutex and then the hardware mutex are
- * taken around a read-check-write of the scratch register.  The claim
- * succeeds only if none of the conflicting bits are already set.
- *
- * Return 0 on success, -EBUSY if resource busy, -EIO if mutex acquire failed.
- */
-static int __acquire_chip_resource(struct hfi1_devdata *dd, u32 resource)
-{
-       /* non-dynamic resources are not split between HFIs */
-       u64 conflict_bits = resource;
-       u64 claim_bit = resource;
-       u64 scratch;
-       int ret;
-
-       if (resource & CR_DYN_MASK) {
-               /* a dynamic resource is in use if either HFI has set the bit */
-               u32 checked = resource;
-
-               /* discrete devices must serialize across both chains */
-               if (dd->pcidev->device == PCI_DEVICE_ID_INTEL0 &&
-                   (resource & (CR_I2C1 | CR_I2C2)))
-                       checked = CR_I2C1 | CR_I2C2;
-
-               conflict_bits = resource_mask(0, checked) |
-                               resource_mask(1, checked);
-               claim_bit = resource_mask(dd->hfi1_id, resource);
-       }
-
-       /* lock against other callers within the driver wanting a resource */
-       mutex_lock(&dd->asic_data->asic_resource_mutex);
-
-       if (acquire_hw_mutex(dd)) {
-               fail_mutex_acquire_message(dd, __func__);
-               ret = -EIO;
-       } else {
-               scratch = read_csr(dd, ASIC_CFG_SCRATCH);
-               if (scratch & conflict_bits) {
-                       ret = -EBUSY;
-               } else {
-                       write_csr(dd, ASIC_CFG_SCRATCH, scratch | claim_bit);
-                       /* force write to be visible to other HFI on another OS */
-                       (void)read_csr(dd, ASIC_CFG_SCRATCH);
-                       ret = 0;
-               }
-               release_hw_mutex(dd);
-       }
-
-       mutex_unlock(&dd->asic_data->asic_resource_mutex);
-       return ret;
-}
-
-/*
- * Claim a chip resource, retrying while it is busy for up to mswait
- * milliseconds.
- *
- * Return 0 on success, -EBUSY if the resource is still busy once the wait
- * has expired, -EIO if mutex acquire failed.
- */
-int acquire_chip_resource(struct hfi1_devdata *dd, u32 resource, u32 mswait)
-{
-       unsigned long deadline = jiffies + msecs_to_jiffies(mswait);
-       int ret;
-
-       for (;;) {
-               ret = __acquire_chip_resource(dd, resource);
-               /* only a busy resource with time remaining is retried */
-               if (ret == -EBUSY && !time_after_eq(jiffies, deadline)) {
-                       usleep_range(80, 120);  /* arbitrary delay */
-                       continue;
-               }
-               return ret;
-       }
-}
-
-/*
- * Give back a dynamic chip resource previously claimed by this HFI.
- *
- * Non-dynamic resources are rejected with an error message.  A warning is
- * logged if this HFI's bit was not set in ASIC_CFG_SCRATCH; an error is
- * logged and nothing is changed if the hardware mutex cannot be taken.
- */
-void release_chip_resource(struct hfi1_devdata *dd, u32 resource)
-{
-       u64 scratch, my_bit;
-
-       /* only dynamic resources should ever be cleared */
-       if (!(resource & CR_DYN_MASK)) {
-               dd_dev_err(dd, "%s: invalid resource 0x%x\n", __func__,
-                          resource);
-               return;
-       }
-       my_bit = resource_mask(dd->hfi1_id, resource);
-
-       /* lock against other callers within the driver wanting a resource */
-       mutex_lock(&dd->asic_data->asic_resource_mutex);
-
-       if (acquire_hw_mutex(dd)) {
-               fail_mutex_acquire_message(dd, __func__);
-       } else {
-               scratch = read_csr(dd, ASIC_CFG_SCRATCH);
-               if (scratch & my_bit) {
-                       write_csr(dd, ASIC_CFG_SCRATCH, scratch & ~my_bit);
-                       /* force write to be visible to other HFI on another OS */
-                       (void)read_csr(dd, ASIC_CFG_SCRATCH);
-               } else {
-                       dd_dev_warn(dd,
-                                   "%s: id %d, resource 0x%x: bit not set\n",
-                                   __func__, dd->hfi1_id, resource);
-               }
-               release_hw_mutex(dd);
-       }
-
-       mutex_unlock(&dd->asic_data->asic_resource_mutex);
-}
-
-/*
- * Report whether the given resource is currently marked in
- * ASIC_CFG_SCRATCH (for a dynamic resource, this HFI's copy of the bit).
- *
- * Return true if set.  Otherwise return false, after printing a warning
- * naming func when func is non-NULL.
- */
-bool check_chip_resource(struct hfi1_devdata *dd, u32 resource,
-                        const char *func)
-{
-       u64 wanted;
-
-       wanted = (resource & CR_DYN_MASK) ?
-                       resource_mask(dd->hfi1_id, resource) : resource;
-
-       if (read_csr(dd, ASIC_CFG_SCRATCH) & wanted)
-               return true;
-
-       if (func)
-               dd_dev_warn(dd,
-                           "%s: id %d, resource 0x%x, not acquired!\n",
-                           func, dd->hfi1_id, resource);
-       return false;
-}
-
-/*
- * Drop every dynamic resource bit held by this HFI in ASIC_CFG_SCRATCH.
- * func names the caller for the error message printed when the hardware
- * mutex cannot be taken, in which case nothing is cleared.
- */
-static void clear_chip_resources(struct hfi1_devdata *dd, const char *func)
-{
-       u64 scratch;
-
-       /* lock against other callers within the driver wanting a resource */
-       mutex_lock(&dd->asic_data->asic_resource_mutex);
-
-       if (acquire_hw_mutex(dd)) {
-               fail_mutex_acquire_message(dd, func);
-       } else {
-               /* clear all dynamic access bits for this HFI */
-               scratch = read_csr(dd, ASIC_CFG_SCRATCH);
-               scratch &= ~resource_mask(dd->hfi1_id, CR_DYN_MASK);
-               write_csr(dd, ASIC_CFG_SCRATCH, scratch);
-               /* force write to be visible to other HFI on another OS */
-               (void)read_csr(dd, ASIC_CFG_SCRATCH);
-
-               release_hw_mutex(dd);
-       }
-
-       mutex_unlock(&dd->asic_data->asic_resource_mutex);
-}
-
-/*
- * Clear this HFI's dynamic chip resource bits via clear_chip_resources();
- * this function's name is passed along for use in any error message.
- */
-void init_chip_resources(struct hfi1_devdata *dd)
-{
-       /* clear any holds left by us */
-       clear_chip_resources(dd, __func__);
-}
-
-/*
- * Clear this HFI's dynamic chip resource bits via clear_chip_resources();
- * this function's name is passed along for use in any error message.
- */
-void finish_chip_resources(struct hfi1_devdata *dd)
-{
-       /* clear any holds left by us */
-       clear_chip_resources(dd, __func__);
-}
-
-/* Write only the FAST_MODE bit to ASIC_CFG_SBUS_EXECUTE. */
-void set_sbus_fast_mode(struct hfi1_devdata *dd)
-{
-       write_csr(dd, ASIC_CFG_SBUS_EXECUTE,
-                 ASIC_CFG_SBUS_EXECUTE_FAST_MODE_SMASK);
-}
-
-/*
- * Leave SBus fast mode.
- *
- * Polls ASIC_STS_SBUS_COUNTERS (1us apart, at most SBUS_MAX_POLL_COUNT
- * times) until the EXECUTE and RCV_DATA_VALID counters agree, then writes
- * 0 to ASIC_CFG_SBUS_EXECUTE whether or not they ever matched.
- */
-void clear_sbus_fast_mode(struct hfi1_devdata *dd)
-{
-       u64 counters, polls;
-
-       counters = read_csr(dd, ASIC_STS_SBUS_COUNTERS);
-       for (polls = 0;
-            SBUS_COUNTER(counters, EXECUTE) !=
-            SBUS_COUNTER(counters, RCV_DATA_VALID);
-            polls++) {
-               if (polls >= SBUS_MAX_POLL_COUNT)
-                       break;
-               udelay(1);
-               counters = read_csr(dd, ASIC_STS_SBUS_COUNTERS);
-       }
-
-       write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0);
-}
-
-/*
- * Download the firmware images selected by the fw_*_load switches: first
- * the fabric SerDes image (under the CR_SBUS resource, in SBus fast mode),
- * then the 8051 image.  Each download is repeated for as long as
- * retry_firmware() asks for another attempt.
- *
- * Return 0 on success or the first error encountered.
- */
-int load_firmware(struct hfi1_devdata *dd)
-{
-       int ret;
-
-       if (fw_fabric_serdes_load) {
-               ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT);
-               if (ret)
-                       return ret;
-
-               set_sbus_fast_mode(dd);
-
-               set_serdes_broadcast(dd, all_fabric_serdes_broadcast,
-                                    fabric_serdes_broadcast[dd->hfi1_id],
-                                    fabric_serdes_addrs[dd->hfi1_id],
-                                    NUM_FABRIC_SERDES);
-               turn_off_spicos(dd, SPICO_FABRIC);
-
-               do
-                       ret = load_fabric_serdes_firmware(dd, &fw_fabric);
-               while (retry_firmware(dd, ret));
-
-               /* always undo fast mode and the SBus claim before bailing */
-               clear_sbus_fast_mode(dd);
-               release_chip_resource(dd, CR_SBUS);
-               if (ret)
-                       return ret;
-       }
-
-       if (!fw_8051_load)
-               return 0;
-
-       do
-               ret = load_8051_firmware(dd, &fw_8051);
-       while (retry_firmware(dd, ret));
-
-       return ret;
-}
-
-/*
- * Settle the firmware load switches and file names for this device type,
- * then fetch the firmware.
- *
- * Loads that the device type (dd->icode) cannot use are switched off, and
- * any firmware or platform config name that is still unset gets its
- * default.
- *
- * Return the result of obtain_firmware().
- */
-int hfi1_firmware_init(struct hfi1_devdata *dd)
-{
-       const bool rtl_silicon = (dd->icode == ICODE_RTL_SILICON);
-
-       /* only RTL can use these */
-       if (!rtl_silicon) {
-               fw_fabric_serdes_load = 0;
-               fw_pcie_serdes_load = 0;
-               fw_sbus_load = 0;
-       }
-
-       /* no 8051 or QSFP on simulator */
-       if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
-               fw_8051_load = 0;
-               platform_config_load = 0;
-       }
-
-       /* the default 8051 image differs between ASIC and everything else */
-       if (!fw_8051_name)
-               fw_8051_name = rtl_silicon ? DEFAULT_FW_8051_NAME_ASIC :
-                                            DEFAULT_FW_8051_NAME_FPGA;
-
-       if (!fw_fabric_serdes_name)
-               fw_fabric_serdes_name = DEFAULT_FW_FABRIC_NAME;
-
-       if (!fw_sbus_name)
-               fw_sbus_name = DEFAULT_FW_SBUS_NAME;
-
-       if (!fw_pcie_serdes_name)
-               fw_pcie_serdes_name = DEFAULT_FW_PCIE_NAME;
-
-       if (!platform_config_name)
-               platform_config_name = DEFAULT_PLATFORM_CONFIG_NAME;
-
-       return obtain_firmware(dd);
-}
-
-/*
- * This function is a helper function for parse_platform_config(...) and
- * does not check for validity of the platform configuration cache
- * (because we know it is invalid as we are building up the cache).
- * As such, this should not be called from anywhere other than
- * parse_platform_config
- *
- * The start (in bits) and length (in bits) of the version field are taken
- * from the SYSTEM_TABLE_META_VERSION entry of the system table metadata.
- * The version is then read from a single byte of system_table at that
- * start offset and masked to the field length.
- *
- * Return 0 if the version is at least 5, -EINVAL if system_table is NULL
- * or the version is older.
- */
-static int check_meta_version(struct hfi1_devdata *dd, u32 *system_table)
-{
-       u32 meta_ver, meta_ver_meta, ver_start, ver_len, mask;
-       struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
-
-       if (!system_table)
-               return -EINVAL;
-
-       meta_ver_meta =
-       *(pcfgcache->config_tables[PLATFORM_CONFIG_SYSTEM_TABLE].table_metadata
-       + SYSTEM_TABLE_META_VERSION);
-
-       mask = ((1 << METADATA_TABLE_FIELD_START_LEN_BITS) - 1);
-       ver_start = meta_ver_meta & mask;
-
-       meta_ver_meta >>= METADATA_TABLE_FIELD_LEN_SHIFT;
-
-       mask = ((1 << METADATA_TABLE_FIELD_LEN_LEN_BITS) - 1);
-       ver_len = meta_ver_meta & mask;
-
-       /* the metadata gives the start in bits; convert to a byte offset */
-       ver_start /= 8;
-
-       /*
-        * ver_len comes from the config file and is not range checked.
-        * Only one byte is read below, so any length of 8 bits or more
-        * selects the whole byte; capping the mask here also avoids an
-        * undefined shift when ver_len reaches the width of int.
-        */
-       mask = (ver_len >= 8) ? 0xff : ((1U << ver_len) - 1);
-       meta_ver = *((u8 *)system_table + ver_start) & mask;
-
-       if (meta_ver < 5) {
-               dd_dev_info(
-                       dd, "%s:Please update platform config\n", __func__);
-               return -EINVAL;
-       }
-       return 0;
-}
-
-int parse_platform_config(struct hfi1_devdata *dd)
-{
-       struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
-       u32 *ptr = NULL;
-       u32 header1 = 0, header2 = 0, magic_num = 0, crc = 0, file_length = 0;
-       u32 record_idx = 0, table_type = 0, table_length_dwords = 0;
-       int ret = -EINVAL; /* assume failure */
-
-       if (!dd->platform_config.data) {
-               dd_dev_info(dd, "%s: Missing config file\n", __func__);
-               goto bail;
-       }
-       ptr = (u32 *)dd->platform_config.data;
-
-       magic_num = *ptr;
-       ptr++;
-       if (magic_num != PLATFORM_CONFIG_MAGIC_NUM) {
-               dd_dev_info(dd, "%s: Bad config file\n", __func__);
-               goto bail;
-       }
-
-       /* Field is file size in DWORDs */
-       file_length = (*ptr) * 4;
-       ptr++;
-
-       if (file_length > dd->platform_config.size) {
-               dd_dev_info(dd, "%s:File claims to be larger than read size\n",
-                           __func__);
-               goto bail;
-       } else if (file_length < dd->platform_config.size) {
-               dd_dev_info(dd,
-                           "%s:File claims to be smaller than read size, continuing\n",
-                           __func__);
-       }
-       /* exactly equal, perfection */
-
-       /*
-        * In both cases where we proceed, using the self-reported file length
-        * is the safer option
-        */
-       while (ptr < (u32 *)(dd->platform_config.data + file_length)) {
-               header1 = *ptr;
-               header2 = *(ptr + 1);
-               if (header1 != ~header2) {
-                       dd_dev_info(dd, "%s: Failed validation at offset %ld\n",
-                                   __func__, (ptr - (u32 *)
-                                              dd->platform_config.data));
-                       goto bail;
-               }
-
-               record_idx = *ptr &
-                       ((1 << PLATFORM_CONFIG_HEADER_RECORD_IDX_LEN_BITS) - 1);
-
-               table_length_dwords = (*ptr >>
-                               PLATFORM_CONFIG_HEADER_TABLE_LENGTH_SHIFT) &
-                     ((1 << PLATFORM_CONFIG_HEADER_TABLE_LENGTH_LEN_BITS) - 1);
-
-               table_type = (*ptr >> PLATFORM_CONFIG_HEADER_TABLE_TYPE_SHIFT) &
-                       ((1 << PLATFORM_CONFIG_HEADER_TABLE_TYPE_LEN_BITS) - 1);
-
-               /* Done with this set of headers */
-               ptr += 2;
-
-               if (record_idx) {
-                       /* data table */
-                       switch (table_type) {
-                       case PLATFORM_CONFIG_SYSTEM_TABLE:
-                               pcfgcache->config_tables[table_type].num_table =
-                                                                       1;
-                               ret = check_meta_version(dd, ptr);
-                               if (ret)
-                                       goto bail;
-                               break;
-                       case PLATFORM_CONFIG_PORT_TABLE:
-                               pcfgcache->config_tables[table_type].num_table =
-                                                                       2;
-                               break;
-                       case PLATFORM_CONFIG_RX_PRESET_TABLE:
-                               /* fall through */
-                       case PLATFORM_CONFIG_TX_PRESET_TABLE:
-                               /* fall through */
-                       case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
-                               /* fall through */
-                       case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
-                               pcfgcache->config_tables[table_type].num_table =
-                                                       table_length_dwords;
-                               break;
-                       default:
-                               dd_dev_info(dd,
-                                           "%s: Unknown data table %d, offset %ld\n",
-                                           __func__, table_type,
-                                           (ptr - (u32 *)
-                                            dd->platform_config.data));
-                               goto bail; /* We don't trust this file now */
-                       }
-                       pcfgcache->config_tables[table_type].table = ptr;
-               } else {
-                       /* metadata table */
-                       switch (table_type) {
-                       case PLATFORM_CONFIG_SYSTEM_TABLE:
-                               /* fall through */
-                       case PLATFORM_CONFIG_PORT_TABLE:
-                               /* fall through */
-                       case PLATFORM_CONFIG_RX_PRESET_TABLE:
-                               /* fall through */
-                       case PLATFORM_CONFIG_TX_PRESET_TABLE:
-                               /* fall through */
-                       case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
-                               /* fall through */
-                       case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
-                               break;
-                       default:
-                               dd_dev_info(dd,
-                                           "%s: Unknown meta table %d, offset %ld\n",
-                                           __func__, table_type,
-                                           (ptr -
-                                            (u32 *)dd->platform_config.data));
-                               goto bail; /* We don't trust this file now */
-                       }
-                       pcfgcache->config_tables[table_type].table_metadata =
-                                                                       ptr;
-               }
-
-               /* Calculate and check table crc */
-               crc = crc32_le(~(u32)0, (unsigned char const *)ptr,
-                              (table_length_dwords * 4));
-               crc ^= ~(u32)0;
-
-               /* Jump the table */
-               ptr += table_length_dwords;
-               if (crc != *ptr) {
-                       dd_dev_info(dd, "%s: Failed CRC check at offset %ld\n",
-                                   __func__, (ptr -
-                                              (u32 *)
-                                              dd->platform_config.data));
-                       goto bail;
-               }
-               /* Jump the CRC DWORD */
-               ptr++;
-       }
-
-       pcfgcache->cache_valid = 1;
-       return 0;
-bail:
-       memset(pcfgcache, 0, sizeof(struct platform_config_cache));
-       return ret;
-}
-
-static int get_platform_fw_field_metadata(struct hfi1_devdata *dd, int table,
-                                         int field, u32 *field_len_bits,
-                                         u32 *field_start_bits)
-{
-       struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
-       u32 *src_ptr = NULL;
-
-       if (!pcfgcache->cache_valid)
-               return -EINVAL;
-
-       switch (table) {
-       case PLATFORM_CONFIG_SYSTEM_TABLE:
-               /* fall through */
-       case PLATFORM_CONFIG_PORT_TABLE:
-               /* fall through */
-       case PLATFORM_CONFIG_RX_PRESET_TABLE:
-               /* fall through */
-       case PLATFORM_CONFIG_TX_PRESET_TABLE:
-               /* fall through */
-       case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
-               /* fall through */
-       case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
-               if (field && field < platform_config_table_limits[table])
-                       src_ptr =
-                       pcfgcache->config_tables[table].table_metadata + field;
-               break;
-       default:
-               dd_dev_info(dd, "%s: Unknown table\n", __func__);
-               break;
-       }
-
-       if (!src_ptr)
-               return -EINVAL;
-
-       if (field_start_bits)
-               *field_start_bits = *src_ptr &
-                     ((1 << METADATA_TABLE_FIELD_START_LEN_BITS) - 1);
-
-       if (field_len_bits)
-               *field_len_bits = (*src_ptr >> METADATA_TABLE_FIELD_LEN_SHIFT)
-                      & ((1 << METADATA_TABLE_FIELD_LEN_LEN_BITS) - 1);
-
-       return 0;
-}
-
-/* This is the central interface to getting data out of the platform config
- * file. It depends on parse_platform_config() having populated the
- * platform_config_cache in hfi1_devdata, and checks the cache_valid member to
- * validate the sanity of the cache.
- *
- * The non-obvious parameters:
- * @table_index: Acts as a look up key into which instance of the tables the
- * relevant field is fetched from.
- *
- * This applies to the data tables that have multiple instances. The port table
- * is an exception to this rule as each HFI only has one port and thus the
- * relevant table can be distinguished by hfi_id.
- *
- * @data: pointer to memory that will be populated with the field requested.
- * @len: length of memory pointed by @data in bytes.
- */
-int get_platform_config_field(struct hfi1_devdata *dd,
-                             enum platform_config_table_type_encoding
-                             table_type, int table_index, int field_index,
-                             u32 *data, u32 len)
-{
-       int ret = 0, wlen = 0, seek = 0;
-       u32 field_len_bits = 0, field_start_bits = 0, *src_ptr = NULL;
-       struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
-
-       if (data)
-               memset(data, 0, len);
-       else
-               return -EINVAL;
-
-       ret = get_platform_fw_field_metadata(dd, table_type, field_index,
-                                            &field_len_bits,
-                                            &field_start_bits);
-       if (ret)
-               return -EINVAL;
-
-       /* Convert length to bits */
-       len *= 8;
-
-       /* Our metadata function checked cache_valid and field_index for us */
-       switch (table_type) {
-       case PLATFORM_CONFIG_SYSTEM_TABLE:
-               src_ptr = pcfgcache->config_tables[table_type].table;
-
-               if (field_index != SYSTEM_TABLE_QSFP_POWER_CLASS_MAX) {
-                       if (len < field_len_bits)
-                               return -EINVAL;
-
-                       seek = field_start_bits / 8;
-                       wlen = field_len_bits / 8;
-
-                       src_ptr = (u32 *)((u8 *)src_ptr + seek);
-
-                       /*
-                        * We expect the field to be byte aligned and whole byte
-                        * lengths if we are here
-                        */
-                       memcpy(data, src_ptr, wlen);
-                       return 0;
-               }
-               break;
-       case PLATFORM_CONFIG_PORT_TABLE:
-               /* Port table is 4 DWORDS */
-               src_ptr = dd->hfi1_id ?
-                       pcfgcache->config_tables[table_type].table + 4 :
-                       pcfgcache->config_tables[table_type].table;
-               break;
-       case PLATFORM_CONFIG_RX_PRESET_TABLE:
-               /* fall through */
-       case PLATFORM_CONFIG_TX_PRESET_TABLE:
-               /* fall through */
-       case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
-               /* fall through */
-       case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
-               src_ptr = pcfgcache->config_tables[table_type].table;
-
-               if (table_index <
-                       pcfgcache->config_tables[table_type].num_table)
-                       src_ptr += table_index;
-               else
-                       src_ptr = NULL;
-               break;
-       default:
-               dd_dev_info(dd, "%s: Unknown table\n", __func__);
-               break;
-       }
-
-       if (!src_ptr || len < field_len_bits)
-               return -EINVAL;
-
-       src_ptr += (field_start_bits / 32);
-       *data = (*src_ptr >> (field_start_bits % 32)) &
-                       ((1 << field_len_bits) - 1);
-
-       return 0;
-}
-
-/*
- * Download the firmware needed for the Gen3 PCIe SerDes.  An update
- * to the SBus firmware is needed before updating the PCIe firmware.
- *
- * Note: caller must be holding the SBus resource.
- */
-int load_pcie_firmware(struct hfi1_devdata *dd)
-{
-       int ret = 0;
-
-       /* both firmware loads below use the SBus */
-       set_sbus_fast_mode(dd);
-
-       if (fw_sbus_load) {
-               turn_off_spicos(dd, SPICO_SBUS);
-               do {
-                       ret = load_sbus_firmware(dd, &fw_sbus);
-               } while (retry_firmware(dd, ret));
-               if (ret)
-                       goto done;
-       }
-
-       if (fw_pcie_serdes_load) {
-               dd_dev_info(dd, "Setting PCIe SerDes broadcast\n");
-               set_serdes_broadcast(dd, all_pcie_serdes_broadcast,
-                                    pcie_serdes_broadcast[dd->hfi1_id],
-                                    pcie_serdes_addrs[dd->hfi1_id],
-                                    NUM_PCIE_SERDES);
-               do {
-                       ret = load_pcie_serdes_firmware(dd, &fw_pcie);
-               } while (retry_firmware(dd, ret));
-               if (ret)
-                       goto done;
-       }
-
-done:
-       clear_sbus_fast_mode(dd);
-
-       return ret;
-}
-
-/*
- * Read the GUID from the hardware, store it in dd.
- */
-void read_guid(struct hfi1_devdata *dd)
-{
-       /* Take the DC out of reset to get a valid GUID value */
-       write_csr(dd, CCE_DC_CTRL, 0);
-       (void)read_csr(dd, CCE_DC_CTRL);
-
-       dd->base_guid = read_csr(dd, DC_DC8051_CFG_LOCAL_GUID);
-       dd_dev_info(dd, "GUID %llx",
-                   (unsigned long long)dd->base_guid);
-}
diff --git a/drivers/staging/rdma/hfi1/hfi.h b/drivers/staging/rdma/hfi1/hfi.h
deleted file mode 100644 (file)
index 7b78d56..0000000
+++ /dev/null
@@ -1,1949 +0,0 @@
-#ifndef _HFI1_KERNEL_H
-#define _HFI1_KERNEL_H
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/interrupt.h>
-#include <linux/pci.h>
-#include <linux/dma-mapping.h>
-#include <linux/mutex.h>
-#include <linux/list.h>
-#include <linux/scatterlist.h>
-#include <linux/slab.h>
-#include <linux/io.h>
-#include <linux/fs.h>
-#include <linux/completion.h>
-#include <linux/kref.h>
-#include <linux/sched.h>
-#include <linux/cdev.h>
-#include <linux/delay.h>
-#include <linux/kthread.h>
-#include <rdma/rdma_vt.h>
-
-#include "chip_registers.h"
-#include "common.h"
-#include "verbs.h"
-#include "pio.h"
-#include "chip.h"
-#include "mad.h"
-#include "qsfp.h"
-#include "platform.h"
-#include "affinity.h"
-
-/* bumped 1 from s/w major version of TrueScale */
-#define HFI1_CHIP_VERS_MAJ 3U
-
-/* don't care about this except printing */
-#define HFI1_CHIP_VERS_MIN 0U
-
-/* The Organization Unique Identifier (Mfg code), and its position in GUID */
-#define HFI1_OUI 0x001175
-#define HFI1_OUI_LSB 40
-
-#define DROP_PACKET_OFF                0
-#define DROP_PACKET_ON         1
-
-extern unsigned long hfi1_cap_mask;
-#define HFI1_CAP_KGET_MASK(mask, cap) ((mask) & HFI1_CAP_##cap)
-#define HFI1_CAP_UGET_MASK(mask, cap) \
-       (((mask) >> HFI1_CAP_USER_SHIFT) & HFI1_CAP_##cap)
-#define HFI1_CAP_KGET(cap) (HFI1_CAP_KGET_MASK(hfi1_cap_mask, cap))
-#define HFI1_CAP_UGET(cap) (HFI1_CAP_UGET_MASK(hfi1_cap_mask, cap))
-#define HFI1_CAP_IS_KSET(cap) (!!HFI1_CAP_KGET(cap))
-#define HFI1_CAP_IS_USET(cap) (!!HFI1_CAP_UGET(cap))
-#define HFI1_MISC_GET() ((hfi1_cap_mask >> HFI1_CAP_MISC_SHIFT) & \
-                       HFI1_CAP_MISC_MASK)
-/* Offline Disabled Reason is 4-bits */
-#define HFI1_ODR_MASK(rsn) ((rsn) & OPA_PI_MASK_OFFLINE_REASON)
-
-/*
- * Control context is always 0 and handles the error packets.
- * It also handles the VL15 and multicast packets.
- */
-#define HFI1_CTRL_CTXT    0
-
-/*
- * Driver context will store software counters for each of the events
- * associated with these status registers
- */
-#define NUM_CCE_ERR_STATUS_COUNTERS 41
-#define NUM_RCV_ERR_STATUS_COUNTERS 64
-#define NUM_MISC_ERR_STATUS_COUNTERS 13
-#define NUM_SEND_PIO_ERR_STATUS_COUNTERS 36
-#define NUM_SEND_DMA_ERR_STATUS_COUNTERS 4
-#define NUM_SEND_EGRESS_ERR_STATUS_COUNTERS 64
-#define NUM_SEND_ERR_STATUS_COUNTERS 3
-#define NUM_SEND_CTXT_ERR_STATUS_COUNTERS 5
-#define NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS 24
-
-/*
- * per driver stats, either not device nor port-specific, or
- * summed over all of the devices and ports.
- * They are described by name via ipathfs filesystem, so layout
- * and number of elements can change without breaking compatibility.
- * If members are added or deleted hfi1_statnames[] in debugfs.c must
- * change to match.
- */
-struct hfi1_ib_stats {
-       __u64 sps_ints; /* number of interrupts handled */
-       __u64 sps_errints; /* number of error interrupts */
-       __u64 sps_txerrs; /* tx-related packet errors */
-       __u64 sps_rcverrs; /* non-crc rcv packet errors */
-       __u64 sps_hwerrs; /* hardware errors reported (parity, etc.) */
-       __u64 sps_nopiobufs; /* no pio bufs avail from kernel */
-       __u64 sps_ctxts; /* number of contexts currently open */
-       __u64 sps_lenerrs; /* number of kernel packets where RHF != LRH len */
-       __u64 sps_buffull;
-       __u64 sps_hdrfull;
-};
-
-extern struct hfi1_ib_stats hfi1_stats;
-extern const struct pci_error_handlers hfi1_pci_err_handler;
-
-/*
- * First-cut criterion for "device is active" is
- * two thousand dwords combined Tx, Rx traffic per
- * 5-second interval. SMA packets are 64 dwords,
- * and occur "a few per second", presumably each way.
- */
-#define HFI1_TRAFFIC_ACTIVE_THRESHOLD (2000)
-
-/*
- * Below contains all data related to a single context (formerly called port).
- */
-
-#ifdef CONFIG_DEBUG_FS
-struct hfi1_opcode_stats_perctx;
-#endif
-
-struct ctxt_eager_bufs {
-       ssize_t size;            /* total size of eager buffers */
-       u32 count;               /* size of buffers array */
-       u32 numbufs;             /* number of buffers allocated */
-       u32 alloced;             /* number of rcvarray entries used */
-       u32 rcvtid_size;         /* size of each eager rcv tid */
-       u32 threshold;           /* head update threshold */
-       struct eager_buffer {
-               void *addr;
-               dma_addr_t phys;
-               ssize_t len;
-       } *buffers;
-       struct {
-               void *addr;
-               dma_addr_t phys;
-       } *rcvtids;
-};
-
-struct exp_tid_set {
-       struct list_head list;
-       u32 count;
-};
-
-struct hfi1_ctxtdata {
-       /* shadow the ctxt's RcvCtrl register */
-       u64 rcvctrl;
-       /* rcvhdrq base, needs mmap before useful */
-       void *rcvhdrq;
-       /* kernel virtual address where hdrqtail is updated */
-       volatile __le64 *rcvhdrtail_kvaddr;
-       /*
-        * Shared page for kernel to signal user processes that send buffers
-        * need disarming.  The process should call HFI1_CMD_DISARM_BUFS
-        * or HFI1_CMD_ACK_EVENT with IPATH_EVENT_DISARM_BUFS set.
-        */
-       unsigned long *user_event_mask;
-       /* when waiting for rcv or pioavail */
-       wait_queue_head_t wait;
-       /* rcvhdrq size (for freeing) */
-       size_t rcvhdrq_size;
-       /* number of rcvhdrq entries */
-       u16 rcvhdrq_cnt;
-       /* size of each of the rcvhdrq entries */
-       u16 rcvhdrqentsize;
-       /* mmap of hdrq, must fit in 44 bits */
-       dma_addr_t rcvhdrq_phys;
-       dma_addr_t rcvhdrqtailaddr_phys;
-       struct ctxt_eager_bufs egrbufs;
-       /* this receive context's assigned PIO ACK send context */
-       struct send_context *sc;
-
-       /* dynamic receive available interrupt timeout */
-       u32 rcvavail_timeout;
-       /*
-        * number of opens (including slave sub-contexts) on this instance
-        * (ignoring forks, dup, etc. for now)
-        */
-       int cnt;
-       /*
-        * how much space to leave at start of eager TID entries for
-        * protocol use, on each TID
-        */
-       /* instead of calculating it */
-       unsigned ctxt;
-       /* non-zero if ctxt is being shared. */
-       u16 subctxt_cnt;
-       /* non-zero if ctxt is being shared. */
-       u16 subctxt_id;
-       u8 uuid[16];
-       /* job key */
-       u16 jkey;
-       /* number of RcvArray groups for this context. */
-       u32 rcv_array_groups;
-       /* index of first eager TID entry. */
-       u32 eager_base;
-       /* number of expected TID entries */
-       u32 expected_count;
-       /* index of first expected TID entry. */
-       u32 expected_base;
-
-       struct exp_tid_set tid_group_list;
-       struct exp_tid_set tid_used_list;
-       struct exp_tid_set tid_full_list;
-
-       /* lock protecting all Expected TID data */
-       struct mutex exp_lock;
-       /* number of pio bufs for this ctxt (all procs, if shared) */
-       u32 piocnt;
-       /* first pio buffer for this ctxt */
-       u32 pio_base;
-       /* chip offset of PIO buffers for this ctxt */
-       u32 piobufs;
-       /* per-context configuration flags */
-       u32 flags;
-       /* per-context event flags for fileops/intr communication */
-       unsigned long event_flags;
-       /* WAIT_RCV that timed out, no interrupt */
-       u32 rcvwait_to;
-       /* WAIT_PIO that timed out, no interrupt */
-       u32 piowait_to;
-       /* WAIT_RCV already happened, no wait */
-       u32 rcvnowait;
-       /* WAIT_PIO already happened, no wait */
-       u32 pionowait;
-       /* total number of polled urgent packets */
-       u32 urgent;
-       /* saved total number of polled urgent packets for poll edge trigger */
-       u32 urgent_poll;
-       /* pid of process using this ctxt */
-       pid_t pid;
-       pid_t subpid[HFI1_MAX_SHARED_CTXTS];
-       /* same size as task_struct .comm[], command that opened context */
-       char comm[TASK_COMM_LEN];
-       /* so file ops can get at unit */
-       struct hfi1_devdata *dd;
-       /* so functions that need physical port can get it easily */
-       struct hfi1_pportdata *ppd;
-       /* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */
-       void *subctxt_uregbase;
-       /* An array of pages for the eager receive buffers * N */
-       void *subctxt_rcvegrbuf;
-       /* An array of pages for the eager header queue entries * N */
-       void *subctxt_rcvhdr_base;
-       /* The version of the library which opened this ctxt */
-       u32 userversion;
-       /* Bitmask of active slaves */
-       u32 active_slaves;
-       /* Type of packets or conditions we want to poll for */
-       u16 poll_type;
-       /* receive packet sequence counter */
-       u8 seq_cnt;
-       u8 redirect_seq_cnt;
-       /* ctxt rcvhdrq head offset */
-       u32 head;
-       u32 pkt_count;
-       /* QPs waiting for context processing */
-       struct list_head qp_wait_list;
-       /* interrupt handling */
-       u64 imask;      /* clear interrupt mask */
-       int ireg;       /* clear interrupt register */
-       unsigned numa_id; /* numa node of this context */
-       /* verbs stats per CTX */
-       struct hfi1_opcode_stats_perctx *opstats;
-       /*
-        * This is the kernel thread that will keep making
-        * progress on the user sdma requests behind the scenes.
-        * There is one per context (shared contexts use the master's).
-        */
-       struct task_struct *progress;
-       struct list_head sdma_queues;
-       /* protect sdma queues */
-       spinlock_t sdma_qlock;
-
-       /* Is ASPM interrupt supported for this context */
-       bool aspm_intr_supported;
-       /* ASPM state (enabled/disabled) for this context */
-       bool aspm_enabled;
-       /* Timer for re-enabling ASPM if interrupt activity quietens down */
-       struct timer_list aspm_timer;
-       /* Lock to serialize between intr, timer intr and user threads */
-       spinlock_t aspm_lock;
-       /* Is ASPM processing enabled for this context (in intr context) */
-       bool aspm_intr_enable;
-       /* Last interrupt timestamp */
-       ktime_t aspm_ts_last_intr;
-       /* Last timestamp at which we scheduled a timer for this context */
-       ktime_t aspm_ts_timer_sched;
-
-       /*
-        * The interrupt handler for a particular receive context can vary
-        * throughout it's lifetime. This is not a lock protected data member so
-        * it must be updated atomically and the prev and new value must always
-        * be valid. Worst case is we process an extra interrupt and up to 64
-        * packets with the wrong interrupt handler.
-        */
-       int (*do_interrupt)(struct hfi1_ctxtdata *rcd, int threaded);
-};
-
-/*
- * Represents a single packet at a high level. Put commonly computed things in
- * here so we do not have to keep doing them over and over. The rule of thumb is
- * if something is used one time to derive some value, store that something in
- * here. If it is used multiple times, then store the result of that derivation
- * in here.
- */
-struct hfi1_packet {
-       void *ebuf;
-       void *hdr;
-       struct hfi1_ctxtdata *rcd;
-       __le32 *rhf_addr;
-       struct rvt_qp *qp;
-       struct hfi1_other_headers *ohdr;
-       u64 rhf;
-       u32 maxcnt;
-       u32 rhqoff;
-       u32 hdrqtail;
-       int numpkt;
-       u16 tlen;
-       u16 hlen;
-       s16 etail;
-       u16 rsize;
-       u8 updegr;
-       u8 rcv_flags;
-       u8 etype;
-};
-
-static inline bool has_sc4_bit(struct hfi1_packet *p)
-{
-       return !!rhf_dc_info(p->rhf);
-}
-
-/*
- * Private data for snoop/capture support.
- */
-struct hfi1_snoop_data {
-       int mode_flag;
-       struct cdev cdev;
-       struct device *class_dev;
-       /* protect snoop data */
-       spinlock_t snoop_lock;
-       struct list_head queue;
-       wait_queue_head_t waitq;
-       void *filter_value;
-       int (*filter_callback)(void *hdr, void *data, void *value);
-       u64 dcc_cfg; /* saved value of DCC Cfg register */
-};
-
-/* snoop mode_flag values */
-#define HFI1_PORT_SNOOP_MODE     1U
-#define HFI1_PORT_CAPTURE_MODE   2U
-
-struct rvt_sge_state;
-
-/*
- * Get/Set IB link-level config parameters for f_get/set_ib_cfg()
- * Mostly for MADs that set or query link parameters, also ipath
- * config interfaces
- */
-#define HFI1_IB_CFG_LIDLMC 0 /* LID (LS16b) and Mask (MS16b) */
-#define HFI1_IB_CFG_LWID_DG_ENB 1 /* allowed Link-width downgrade */
-#define HFI1_IB_CFG_LWID_ENB 2 /* allowed Link-width */
-#define HFI1_IB_CFG_LWID 3 /* currently active Link-width */
-#define HFI1_IB_CFG_SPD_ENB 4 /* allowed Link speeds */
-#define HFI1_IB_CFG_SPD 5 /* current Link spd */
-#define HFI1_IB_CFG_RXPOL_ENB 6 /* Auto-RX-polarity enable */
-#define HFI1_IB_CFG_LREV_ENB 7 /* Auto-Lane-reversal enable */
-#define HFI1_IB_CFG_LINKLATENCY 8 /* Link Latency (IB1.2 only) */
-#define HFI1_IB_CFG_HRTBT 9 /* IB heartbeat off/enable/auto; DDR/QDR only */
-#define HFI1_IB_CFG_OP_VLS 10 /* operational VLs */
-#define HFI1_IB_CFG_VL_HIGH_CAP 11 /* num of VL high priority weights */
-#define HFI1_IB_CFG_VL_LOW_CAP 12 /* num of VL low priority weights */
-#define HFI1_IB_CFG_OVERRUN_THRESH 13 /* IB overrun threshold */
-#define HFI1_IB_CFG_PHYERR_THRESH 14 /* IB PHY error threshold */
-#define HFI1_IB_CFG_LINKDEFAULT 15 /* IB link default (sleep/poll) */
-#define HFI1_IB_CFG_PKEYS 16 /* update partition keys */
-#define HFI1_IB_CFG_MTU 17 /* update MTU in IBC */
-#define HFI1_IB_CFG_VL_HIGH_LIMIT 19
-#define HFI1_IB_CFG_PMA_TICKS 20 /* PMA sample tick resolution */
-#define HFI1_IB_CFG_PORT 21 /* switch port we are connected to */
-
-/*
- * HFI or Host Link States
- *
- * These describe the states the driver thinks the logical and physical
- * states are in.  Used as an argument to set_link_state().  Implemented
- * as bits for easy multi-state checking.  The actual state can only be
- * one.
- */
-#define __HLS_UP_INIT_BP       0
-#define __HLS_UP_ARMED_BP      1
-#define __HLS_UP_ACTIVE_BP     2
-#define __HLS_DN_DOWNDEF_BP    3       /* link down default */
-#define __HLS_DN_POLL_BP       4
-#define __HLS_DN_DISABLE_BP    5
-#define __HLS_DN_OFFLINE_BP    6
-#define __HLS_VERIFY_CAP_BP    7
-#define __HLS_GOING_UP_BP      8
-#define __HLS_GOING_OFFLINE_BP  9
-#define __HLS_LINK_COOLDOWN_BP 10
-
-#define HLS_UP_INIT      BIT(__HLS_UP_INIT_BP)
-#define HLS_UP_ARMED     BIT(__HLS_UP_ARMED_BP)
-#define HLS_UP_ACTIVE    BIT(__HLS_UP_ACTIVE_BP)
-#define HLS_DN_DOWNDEF   BIT(__HLS_DN_DOWNDEF_BP) /* link down default */
-#define HLS_DN_POLL      BIT(__HLS_DN_POLL_BP)
-#define HLS_DN_DISABLE   BIT(__HLS_DN_DISABLE_BP)
-#define HLS_DN_OFFLINE   BIT(__HLS_DN_OFFLINE_BP)
-#define HLS_VERIFY_CAP   BIT(__HLS_VERIFY_CAP_BP)
-#define HLS_GOING_UP     BIT(__HLS_GOING_UP_BP)
-#define HLS_GOING_OFFLINE BIT(__HLS_GOING_OFFLINE_BP)
-#define HLS_LINK_COOLDOWN BIT(__HLS_LINK_COOLDOWN_BP)
-
-#define HLS_UP (HLS_UP_INIT | HLS_UP_ARMED | HLS_UP_ACTIVE)
-
-/* use this MTU size if none other is given */
-#define HFI1_DEFAULT_ACTIVE_MTU 10240
-/* use this MTU size as the default maximum */
-#define HFI1_DEFAULT_MAX_MTU 10240
-/* default partition key */
-#define DEFAULT_PKEY 0xffff
-
-/*
- * Possible fabric manager config parameters for fm_{get,set}_table()
- */
-#define FM_TBL_VL_HIGH_ARB             1 /* Get/set VL high prio weights */
-#define FM_TBL_VL_LOW_ARB              2 /* Get/set VL low prio weights */
-#define FM_TBL_BUFFER_CONTROL          3 /* Get/set Buffer Control */
-#define FM_TBL_SC2VLNT                 4 /* Get/set SC->VLnt */
-#define FM_TBL_VL_PREEMPT_ELEMS                5 /* Get (no set) VL preempt elems */
-#define FM_TBL_VL_PREEMPT_MATRIX       6 /* Get (no set) VL preempt matrix */
-
-/*
- * Possible "operations" for f_rcvctrl(ppd, op, ctxt)
- * these are bits so they can be combined, e.g.
- * HFI1_RCVCTRL_INTRAVAIL_ENB | HFI1_RCVCTRL_CTXT_ENB
- */
-#define HFI1_RCVCTRL_TAILUPD_ENB 0x01
-#define HFI1_RCVCTRL_TAILUPD_DIS 0x02
-#define HFI1_RCVCTRL_CTXT_ENB 0x04
-#define HFI1_RCVCTRL_CTXT_DIS 0x08
-#define HFI1_RCVCTRL_INTRAVAIL_ENB 0x10
-#define HFI1_RCVCTRL_INTRAVAIL_DIS 0x20
-#define HFI1_RCVCTRL_PKEY_ENB 0x40  /* Note, default is enabled */
-#define HFI1_RCVCTRL_PKEY_DIS 0x80
-#define HFI1_RCVCTRL_TIDFLOW_ENB 0x0400
-#define HFI1_RCVCTRL_TIDFLOW_DIS 0x0800
-#define HFI1_RCVCTRL_ONE_PKT_EGR_ENB 0x1000
-#define HFI1_RCVCTRL_ONE_PKT_EGR_DIS 0x2000
-#define HFI1_RCVCTRL_NO_RHQ_DROP_ENB 0x4000
-#define HFI1_RCVCTRL_NO_RHQ_DROP_DIS 0x8000
-#define HFI1_RCVCTRL_NO_EGR_DROP_ENB 0x10000
-#define HFI1_RCVCTRL_NO_EGR_DROP_DIS 0x20000
-
-/* partition enforcement flags */
-#define HFI1_PART_ENFORCE_IN   0x1
-#define HFI1_PART_ENFORCE_OUT  0x2
-
-/* how often we check for synthetic counter wrap around */
-#define SYNTH_CNT_TIME 2
-
-/* Counter flags */
-#define CNTR_NORMAL            0x0 /* Normal counters, just read register */
-#define CNTR_SYNTH             0x1 /* Synthetic counters, saturate at all 1s */
-#define CNTR_DISABLED          0x2 /* Disable this counter */
-#define CNTR_32BIT             0x4 /* Simulate 64 bits for this counter */
-#define CNTR_VL                        0x8 /* Per VL counter */
-#define CNTR_SDMA              0x10
-#define CNTR_INVALID_VL                -1  /* Specifies invalid VL */
-#define CNTR_MODE_W            0x0
-#define CNTR_MODE_R            0x1
-
-/* VLs Supported/Operational */
-#define HFI1_MIN_VLS_SUPPORTED 1
-#define HFI1_MAX_VLS_SUPPORTED 8
-
-static inline void incr_cntr64(u64 *cntr)
-{
-       if (*cntr < (u64)-1LL)
-               (*cntr)++;
-}
-
-static inline void incr_cntr32(u32 *cntr)
-{
-       if (*cntr < (u32)-1LL)
-               (*cntr)++;
-}
-
-#define MAX_NAME_SIZE 64
-struct hfi1_msix_entry {
-       enum irq_type type;
-       struct msix_entry msix;
-       void *arg;
-       char name[MAX_NAME_SIZE];
-       cpumask_t mask;
-};
-
-/* per-SL CCA information */
-struct cca_timer {
-       struct hrtimer hrtimer;
-       struct hfi1_pportdata *ppd; /* read-only */
-       int sl; /* read-only */
-       u16 ccti; /* read/write - current value of CCTI */
-};
-
-struct link_down_reason {
-       /*
-        * SMA-facing value.  Should be set from .latest when
-        * HLS_UP_* -> HLS_DN_* transition actually occurs.
-        */
-       u8 sma;
-       u8 latest;
-};
-
-enum {
-       LO_PRIO_TABLE,
-       HI_PRIO_TABLE,
-       MAX_PRIO_TABLE
-};
-
-struct vl_arb_cache {
-       /* protect vl arb cache */
-       spinlock_t lock;
-       struct ib_vl_weight_elem table[VL_ARB_TABLE_SIZE];
-};
-
-/*
- * The structure below encapsulates data relevant to a physical IB Port.
- * Current chips support only one such port, but the separation
- * clarifies things a bit. Note that to conform to IB conventions,
- * port-numbers are one-based. The first or only port is port1.
- */
-struct hfi1_pportdata {
-       struct hfi1_ibport ibport_data;
-
-       struct hfi1_devdata *dd;
-       struct kobject pport_cc_kobj;
-       struct kobject sc2vl_kobj;
-       struct kobject sl2sc_kobj;
-       struct kobject vl2mtu_kobj;
-
-       /* PHY support */
-       u32 port_type;
-       struct qsfp_data qsfp_info;
-
-       /* GUID for this interface, in host order */
-       u64 guid;
-       /* GUID for peer interface, in host order */
-       u64 neighbor_guid;
-
-       /* up or down physical link state */
-       u32 linkup;
-
-       /*
-        * this address is mapped read-only into user processes so they can
-        * get status cheaply, whenever they want.  One qword of status per port
-        */
-       u64 *statusp;
-
-       /* SendDMA related entries */
-
-       struct workqueue_struct *hfi1_wq;
-
-       /* move out of interrupt context */
-       struct work_struct link_vc_work;
-       struct work_struct link_up_work;
-       struct work_struct link_down_work;
-       struct work_struct sma_message_work;
-       struct work_struct freeze_work;
-       struct work_struct link_downgrade_work;
-       struct work_struct link_bounce_work;
-       /* host link state variables */
-       struct mutex hls_lock;
-       u32 host_link_state;
-
-       spinlock_t            sdma_alllock ____cacheline_aligned_in_smp;
-
-       u32 lstate;     /* logical link state */
-
-       /* these are the "32 bit" regs */
-
-       u32 ibmtu; /* The MTU programmed for this unit */
-       /*
-        * Current max size IB packet (in bytes) including IB headers, that
-        * we can send. Changes when ibmtu changes.
-        */
-       u32 ibmaxlen;
-       u32 current_egress_rate; /* units [10^6 bits/sec] */
-       /* LID programmed for this instance */
-       u16 lid;
-       /* list of pkeys programmed; 0 if not set */
-       u16 pkeys[MAX_PKEY_VALUES];
-       u16 link_width_supported;
-       u16 link_width_downgrade_supported;
-       u16 link_speed_supported;
-       u16 link_width_enabled;
-       u16 link_width_downgrade_enabled;
-       u16 link_speed_enabled;
-       u16 link_width_active;
-       u16 link_width_downgrade_tx_active;
-       u16 link_width_downgrade_rx_active;
-       u16 link_speed_active;
-       u8 vls_supported;
-       u8 vls_operational;
-       u8 actual_vls_operational;
-       /* LID mask control */
-       u8 lmc;
-       /* Rx Polarity inversion (compensate for ~tx on partner) */
-       u8 rx_pol_inv;
-
-       u8 hw_pidx;     /* physical port index */
-       u8 port;        /* IB port number and index into dd->pports - 1 */
-       /* type of neighbor node */
-       u8 neighbor_type;
-       u8 neighbor_normal;
-       u8 neighbor_fm_security; /* 1 if firmware checking is disabled */
-       u8 neighbor_port_number;
-       u8 is_sm_config_started;
-       u8 offline_disabled_reason;
-       u8 is_active_optimize_enabled;
-       u8 driver_link_ready;   /* driver ready for active link */
-       u8 link_enabled;        /* link enabled? */
-       u8 linkinit_reason;
-       u8 local_tx_rate;       /* rate given to 8051 firmware */
-       u8 last_pstate;         /* info only */
-
-       /* placeholders for IB MAD packet settings */
-       u8 overrun_threshold;
-       u8 phy_error_threshold;
-
-       /* Used to override LED behavior for things like maintenance beaconing*/
-       /*
-        * Alternates per phase of blink
-        * [0] holds LED off duration, [1] holds LED on duration
-        */
-       unsigned long led_override_vals[2];
-       u8 led_override_phase; /* LSB picks from vals[] */
-       atomic_t led_override_timer_active;
-       /* Used to flash LEDs in override mode */
-       struct timer_list led_override_timer;
-
-       u32 sm_trap_qp;
-       u32 sa_qp;
-
-       /*
-        * cca_timer_lock protects access to the per-SL cca_timer
-        * structures (specifically the ccti member).
-        */
-       spinlock_t cca_timer_lock ____cacheline_aligned_in_smp;
-       struct cca_timer cca_timer[OPA_MAX_SLS];
-
-       /* List of congestion control table entries */
-       struct ib_cc_table_entry_shadow ccti_entries[CC_TABLE_SHADOW_MAX];
-
-       /* congestion entries, each entry corresponding to a SL */
-       struct opa_congestion_setting_entry_shadow
-               congestion_entries[OPA_MAX_SLS];
-
-       /*
-        * cc_state_lock protects (write) access to the per-port
-        * struct cc_state.
-        */
-       spinlock_t cc_state_lock ____cacheline_aligned_in_smp;
-
-       struct cc_state __rcu *cc_state;
-
-       /* Total number of congestion control table entries */
-       u16 total_cct_entry;
-
-       /* Bit map identifying service level */
-       u32 cc_sl_control_map;
-
-       /* CA's max number of 64 entry units in the congestion control table */
-       u8 cc_max_table_entries;
-
-       /*
-        * begin congestion log related entries
-        * cc_log_lock protects all congestion log related data
-        */
-       spinlock_t cc_log_lock ____cacheline_aligned_in_smp;
-       u8 threshold_cong_event_map[OPA_MAX_SLS / 8];
-       u16 threshold_event_counter;
-       struct opa_hfi1_cong_log_event_internal cc_events[OPA_CONG_LOG_ELEMS];
-       int cc_log_idx; /* index for logging events */
-       int cc_mad_idx; /* index for reporting events */
-       /* end congestion log related entries */
-
-       struct vl_arb_cache vl_arb_cache[MAX_PRIO_TABLE];
-
-       /* port relative counter buffer */
-       u64 *cntrs;
-       /* port relative synthetic counter buffer */
-       u64 *scntrs;
-       /* port_xmit_discards are synthesized from different egress errors */
-       u64 port_xmit_discards;
-       u64 port_xmit_discards_vl[C_VL_COUNT];
-       u64 port_xmit_constraint_errors;
-       u64 port_rcv_constraint_errors;
-       /* count of 'link_err' interrupts from DC */
-       u64 link_downed;
-       /* number of times link retrained successfully */
-       u64 link_up;
-       /* number of times a link unknown frame was reported */
-       u64 unknown_frame_count;
-       /* port_ltp_crc_mode is returned in 'portinfo' MADs */
-       u16 port_ltp_crc_mode;
-       /* port_crc_mode_enabled is the crc we support */
-       u8 port_crc_mode_enabled;
-       /* mgmt_allowed is also returned in 'portinfo' MADs */
-       u8 mgmt_allowed;
-       u8 part_enforce; /* partition enforcement flags */
-       struct link_down_reason local_link_down_reason;
-       struct link_down_reason neigh_link_down_reason;
-       /* Value to be sent to link peer on LinkDown .*/
-       u8 remote_link_down_reason;
-       /* Error events that will cause a port bounce. */
-       u32 port_error_action;
-       struct work_struct linkstate_active_work;
-       /* Does this port need to prescan for FECNs */
-       bool cc_prescan;
-};
-
-typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet);
-
-typedef void (*opcode_handler)(struct hfi1_packet *packet);
-
-/* return values for the RHF receive functions */
-#define RHF_RCV_CONTINUE  0    /* keep going */
-#define RHF_RCV_DONE     1     /* stop, this packet processed */
-#define RHF_RCV_REPROCESS 2    /* stop. retain this packet */
-
-struct rcv_array_data {
-       u8 group_size;
-       u16 ngroups;
-       u16 nctxt_extra;
-};
-
-struct per_vl_data {
-       u16 mtu;
-       struct send_context *sc;
-};
-
-/* 16 to directly index */
-#define PER_VL_SEND_CONTEXTS 16
-
-struct err_info_rcvport {
-       u8 status_and_code;
-       u64 packet_flit1;
-       u64 packet_flit2;
-};
-
-struct err_info_constraint {
-       u8 status;
-       u16 pkey;
-       u32 slid;
-};
-
-struct hfi1_temp {
-       unsigned int curr;       /* current temperature */
-       unsigned int lo_lim;     /* low temperature limit */
-       unsigned int hi_lim;     /* high temperature limit */
-       unsigned int crit_lim;   /* critical temperature limit */
-       u8 triggers;      /* temperature triggers */
-};
-
-/* common data between shared ASIC HFIs */
-struct hfi1_asic_data {
-       struct hfi1_devdata *dds[2];    /* back pointers */
-       struct mutex asic_resource_mutex;
-};
-
-/* device data struct now contains only "general per-device" info.
- * fields related to a physical IB port are in a hfi1_pportdata struct.
- */
-struct sdma_engine;
-struct sdma_vl_map;
-
-#define BOARD_VERS_MAX 96 /* how long the version string can be */
-#define SERIAL_MAX 16 /* length of the serial number */
-
-typedef int (*send_routine)(struct rvt_qp *, struct hfi1_pkt_state *, u64);
-struct hfi1_devdata {
-       struct hfi1_ibdev verbs_dev;     /* must be first */
-       struct list_head list;
-       /* pointers to related structs for this device */
-       /* pci access data structure */
-       struct pci_dev *pcidev;
-       struct cdev user_cdev;
-       struct cdev diag_cdev;
-       struct cdev ui_cdev;
-       struct device *user_device;
-       struct device *diag_device;
-       struct device *ui_device;
-
-       /* mem-mapped pointer to base of chip regs */
-       u8 __iomem *kregbase;
-       /* end of mem-mapped chip space excluding sendbuf and user regs */
-       u8 __iomem *kregend;
-       /* physical address of chip for io_remap, etc. */
-       resource_size_t physaddr;
-       /* receive context data */
-       struct hfi1_ctxtdata **rcd;
-       /* send context data */
-       struct send_context_info *send_contexts;
-       /* map hardware send contexts to software index */
-       u8 *hw_to_sw;
-       /* spinlock for allocating and releasing send context resources */
-       spinlock_t sc_lock;
-       /* Per VL data. Enough for all VLs but not all elements are set/used. */
-       struct per_vl_data vld[PER_VL_SEND_CONTEXTS];
-       /* lock for pio_map */
-       spinlock_t pio_map_lock;
-       /* array of kernel send contexts */
-       struct send_context **kernel_send_context;
-       /* array of vl maps */
-       struct pio_vl_map __rcu *pio_map;
-       /* seqlock for sc2vl */
-       seqlock_t sc2vl_lock;
-       u64 sc2vl[4];
-       /* Send Context initialization lock. */
-       spinlock_t sc_init_lock;
-
-       /* fields common to all SDMA engines */
-
-       /* default flags to last descriptor */
-       u64 default_desc1;
-       volatile __le64                    *sdma_heads_dma; /* DMA'ed by chip */
-       dma_addr_t                          sdma_heads_phys;
-       void                               *sdma_pad_dma; /* DMA'ed by chip */
-       dma_addr_t                          sdma_pad_phys;
-       /* for deallocation */
-       size_t                              sdma_heads_size;
-       /* number from the chip */
-       u32                                 chip_sdma_engines;
-       /* num used */
-       u32                                 num_sdma;
-       /* lock for sdma_map */
-       spinlock_t                          sde_map_lock;
-       /* array of engines sized by num_sdma */
-       struct sdma_engine                 *per_sdma;
-       /* array of vl maps */
-       struct sdma_vl_map __rcu           *sdma_map;
-       /* SPC freeze waitqueue and variable */
-       wait_queue_head_t                 sdma_unfreeze_wq;
-       atomic_t                          sdma_unfreeze_count;
-
-       /* common data between shared ASIC HFIs in this OS */
-       struct hfi1_asic_data *asic_data;
-
-       /* hfi1_pportdata, points to array of (physical) port-specific
-        * data structs, indexed by pidx (0..n-1)
-        */
-       struct hfi1_pportdata *pport;
-
-       /* mem-mapped pointer to base of PIO buffers */
-       void __iomem *piobase;
-       /*
-        * write-combining mem-mapped pointer to base of RcvArray
-        * memory.
-        */
-       void __iomem *rcvarray_wc;
-       /*
-        * credit return base - a per-NUMA range of DMA address that
-        * the chip will use to update the per-context free counter
-        */
-       struct credit_return_base *cr_base;
-
-       /* send context numbers and sizes for each type */
-       struct sc_config_sizes sc_sizes[SC_MAX];
-
-       u32 lcb_access_count;           /* count of LCB users */
-
-       char *boardname; /* human readable board info */
-
-       /* device (not port) flags, basically device capabilities */
-       u32 flags;
-
-       /* reset value */
-       u64 z_int_counter;
-       u64 z_rcv_limit;
-       u64 z_send_schedule;
-       /* percpu int_counter */
-       u64 __percpu *int_counter;
-       u64 __percpu *rcv_limit;
-       u64 __percpu *send_schedule;
-       /* number of receive contexts in use by the driver */
-       u32 num_rcv_contexts;
-       /* number of pio send contexts in use by the driver */
-       u32 num_send_contexts;
-       /*
-        * number of ctxts available for PSM open
-        */
-       u32 freectxts;
-       /* total number of available user/PSM contexts */
-       u32 num_user_contexts;
-       /* base receive interrupt timeout, in CSR units */
-       u32 rcv_intr_timeout_csr;
-
-       u64 __iomem *egrtidbase;
-       spinlock_t sendctrl_lock; /* protect changes to SendCtrl */
-       spinlock_t rcvctrl_lock; /* protect changes to RcvCtrl */
-       /* around rcd and (user ctxts) ctxt_cnt use (intr vs free) */
-       spinlock_t uctxt_lock; /* rcd and user context changes */
-       /* exclusive access to 8051 */
-       spinlock_t dc8051_lock;
-       /* exclusive access to 8051 memory */
-       spinlock_t dc8051_memlock;
-       int dc8051_timed_out;   /* remember if the 8051 timed out */
-       /*
-        * A page that will hold event notification bitmaps for all
-        * contexts. This page will be mapped into all processes.
-        */
-       unsigned long *events;
-       /*
-        * per unit status, see also portdata statusp
-        * mapped read-only into user processes so they can get unit and
-        * IB link status cheaply
-        */
-       struct hfi1_status *status;
-       u32 freezelen; /* max length of freezemsg */
-
-       /* revision register shadow */
-       u64 revision;
-       /* Base GUID for device (network order) */
-       u64 base_guid;
-
-       /* these are the "32 bit" regs */
-
-       /* value we put in kr_rcvhdrsize */
-       u32 rcvhdrsize;
-       /* number of receive contexts the chip supports */
-       u32 chip_rcv_contexts;
-       /* number of receive array entries */
-       u32 chip_rcv_array_count;
-       /* number of PIO send contexts the chip supports */
-       u32 chip_send_contexts;
-       /* number of bytes in the PIO memory buffer */
-       u32 chip_pio_mem_size;
-       /* number of bytes in the SDMA memory buffer */
-       u32 chip_sdma_mem_size;
-
-       /* size of each rcvegrbuffer */
-       u32 rcvegrbufsize;
-       /* log2 of above */
-       u16 rcvegrbufsize_shift;
-       /* both sides of the PCIe link are gen3 capable */
-       u8 link_gen3_capable;
-       /* localbus width (1, 2,4,8,16,32) from config space  */
-       u32 lbus_width;
-       /* localbus speed in MHz */
-       u32 lbus_speed;
-       int unit; /* unit # of this chip */
-       int node; /* home node of this chip */
-
-       /* save these PCI fields to restore after a reset */
-       u32 pcibar0;
-       u32 pcibar1;
-       u32 pci_rom;
-       u16 pci_command;
-       u16 pcie_devctl;
-       u16 pcie_lnkctl;
-       u16 pcie_devctl2;
-       u32 pci_msix0;
-       u32 pci_lnkctl3;
-       u32 pci_tph2;
-
-       /*
-        * ASCII serial number, from flash, large enough for original
-        * all digit strings, and longer serial number format
-        */
-       u8 serial[SERIAL_MAX];
-       /* human readable board version */
-       u8 boardversion[BOARD_VERS_MAX];
-       u8 lbus_info[32]; /* human readable localbus info */
-       /* chip major rev, from CceRevision */
-       u8 majrev;
-       /* chip minor rev, from CceRevision */
-       u8 minrev;
-       /* hardware ID */
-       u8 hfi1_id;
-       /* implementation code */
-       u8 icode;
-       /* default link down value (poll/sleep) */
-       u8 link_default;
-       /* vAU of this device */
-       u8 vau;
-       /* vCU of this device */
-       u8 vcu;
-       /* link credits of this device */
-       u16 link_credits;
-       /* initial vl15 credits to use */
-       u16 vl15_init;
-
-       /* Misc small ints */
-       /* Number of physical ports available */
-       u8 num_pports;
-       /* Lowest context number which can be used by user processes */
-       u8 first_user_ctxt;
-       u8 n_krcv_queues;
-       u8 qos_shift;
-       u8 qpn_mask;
-
-       u16 rhf_offset; /* offset of RHF within receive header entry */
-       u16 irev;       /* implementation revision */
-       u16 dc8051_ver; /* 8051 firmware version */
-
-       struct platform_config platform_config;
-       struct platform_config_cache pcfg_cache;
-
-       struct diag_client *diag_client;
-       spinlock_t hfi1_diag_trans_lock; /* protect diag observer ops */
-
-       u8 psxmitwait_supported;
-       /* cycle length of PS* counters in HW (in picoseconds) */
-       u16 psxmitwait_check_rate;
-       /* high volume overflow errors deferred to tasklet */
-       struct tasklet_struct error_tasklet;
-
-       /* MSI-X information */
-       struct hfi1_msix_entry *msix_entries;
-       u32 num_msix_entries;
-
-       /* INTx information */
-       u32 requested_intx_irq;         /* did we request one? */
-       char intx_name[MAX_NAME_SIZE];  /* INTx name */
-
-       /* general interrupt: mask of handled interrupts */
-       u64 gi_mask[CCE_NUM_INT_CSRS];
-
-       struct rcv_array_data rcv_entries;
-
-       /*
-        * 64 bit synthetic counters
-        */
-       struct timer_list synth_stats_timer;
-
-       /*
-        * device counters
-        */
-       char *cntrnames;
-       size_t cntrnameslen;
-       size_t ndevcntrs;
-       u64 *cntrs;
-       u64 *scntrs;
-
-       /*
-        * remembered values for synthetic counters
-        */
-       u64 last_tx;
-       u64 last_rx;
-
-       /*
-        * per-port counters
-        */
-       size_t nportcntrs;
-       char *portcntrnames;
-       size_t portcntrnameslen;
-
-       struct hfi1_snoop_data hfi1_snoop;
-
-       struct err_info_rcvport err_info_rcvport;
-       struct err_info_constraint err_info_rcv_constraint;
-       struct err_info_constraint err_info_xmit_constraint;
-       u8 err_info_uncorrectable;
-       u8 err_info_fmconfig;
-
-       atomic_t drop_packet;
-       u8 do_drop;
-
-       /*
-        * Software counters for the status bits defined by the
-        * associated error status registers
-        */
-       u64 cce_err_status_cnt[NUM_CCE_ERR_STATUS_COUNTERS];
-       u64 rcv_err_status_cnt[NUM_RCV_ERR_STATUS_COUNTERS];
-       u64 misc_err_status_cnt[NUM_MISC_ERR_STATUS_COUNTERS];
-       u64 send_pio_err_status_cnt[NUM_SEND_PIO_ERR_STATUS_COUNTERS];
-       u64 send_dma_err_status_cnt[NUM_SEND_DMA_ERR_STATUS_COUNTERS];
-       u64 send_egress_err_status_cnt[NUM_SEND_EGRESS_ERR_STATUS_COUNTERS];
-       u64 send_err_status_cnt[NUM_SEND_ERR_STATUS_COUNTERS];
-
-       /* Software counter that spans all contexts */
-       u64 sw_ctxt_err_status_cnt[NUM_SEND_CTXT_ERR_STATUS_COUNTERS];
-       /* Software counter that spans all DMA engines */
-       u64 sw_send_dma_eng_err_status_cnt[
-               NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS];
-       /* Software counter that aggregates all cce_err_status errors */
-       u64 sw_cce_err_status_aggregate;
-
-       /* receive interrupt functions */
-       rhf_rcv_function_ptr *rhf_rcv_function_map;
-       rhf_rcv_function_ptr normal_rhf_rcv_functions[8];
-
-       /*
-        * Handlers for outgoing data so that snoop/capture does not
-        * have to have its hooks in the send path
-        */
-       send_routine process_pio_send;
-       send_routine process_dma_send;
-       void (*pio_inline_send)(struct hfi1_devdata *dd, struct pio_buf *pbuf,
-                               u64 pbc, const void *from, size_t count);
-
-       /* OUI comes from the HW. Used everywhere as 3 separate bytes. */
-       u8 oui1;
-       u8 oui2;
-       u8 oui3;
-       /* Timer and counter used to detect RcvBufOvflCnt changes */
-       struct timer_list rcverr_timer;
-       u32 rcv_ovfl_cnt;
-
-       wait_queue_head_t event_queue;
-
-       /* Save the enabled LCB error bits */
-       u64 lcb_err_en;
-       u8 dc_shutdown;
-
-       /* receive context tail dummy address */
-       __le64 *rcvhdrtail_dummy_kvaddr;
-       dma_addr_t rcvhdrtail_dummy_physaddr;
-
-       bool eprom_available;   /* true if EPROM is available for this device */
-       bool aspm_supported;    /* Does HW support ASPM */
-       bool aspm_enabled;      /* ASPM state: enabled/disabled */
-       /* Serialize ASPM enable/disable between multiple verbs contexts */
-       spinlock_t aspm_lock;
-       /* Number of verbs contexts which have disabled ASPM */
-       atomic_t aspm_disabled_cnt;
-
-       struct hfi1_affinity *affinity;
-};
-
-/* 8051 firmware version helper */
-#define dc8051_ver(a, b) ((a) << 8 | (b))
-
-/* f_put_tid types */
-#define PT_EXPECTED 0
-#define PT_EAGER    1
-#define PT_INVALID  2
-
-struct tid_rb_node;
-struct mmu_rb_node;
-
-/* Private data for file operations */
-struct hfi1_filedata {
-       struct hfi1_ctxtdata *uctxt;
-       unsigned subctxt;
-       struct hfi1_user_sdma_comp_q *cq;
-       struct hfi1_user_sdma_pkt_q *pq;
-       /* for cpu affinity; -1 if none */
-       int rec_cpu_num;
-       u32 tid_n_pinned;
-       struct rb_root tid_rb_root;
-       struct tid_rb_node **entry_to_rb;
-       spinlock_t tid_lock; /* protect tid_[limit,used] counters */
-       u32 tid_limit;
-       u32 tid_used;
-       u32 *invalid_tids;
-       u32 invalid_tid_idx;
-       /* protect invalid_tids array and invalid_tid_idx */
-       spinlock_t invalid_lock;
-};
-
-extern struct list_head hfi1_dev_list;
-extern spinlock_t hfi1_devs_lock;
-struct hfi1_devdata *hfi1_lookup(int unit);
-extern u32 hfi1_cpulist_count;
-extern unsigned long *hfi1_cpulist;
-
-extern unsigned int snoop_drop_send;
-extern unsigned int snoop_force_capture;
-int hfi1_init(struct hfi1_devdata *, int);
-int hfi1_count_units(int *npresentp, int *nupp);
-int hfi1_count_active_units(void);
-
-int hfi1_diag_add(struct hfi1_devdata *);
-void hfi1_diag_remove(struct hfi1_devdata *);
-void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup);
-
-void handle_user_interrupt(struct hfi1_ctxtdata *rcd);
-
-int hfi1_create_rcvhdrq(struct hfi1_devdata *, struct hfi1_ctxtdata *);
-int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *);
-int hfi1_create_ctxts(struct hfi1_devdata *dd);
-struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *, u32, int);
-void hfi1_init_pportdata(struct pci_dev *, struct hfi1_pportdata *,
-                        struct hfi1_devdata *, u8, u8);
-void hfi1_free_ctxtdata(struct hfi1_devdata *, struct hfi1_ctxtdata *);
-
-int handle_receive_interrupt(struct hfi1_ctxtdata *, int);
-int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *, int);
-int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *, int);
-void set_all_slowpath(struct hfi1_devdata *dd);
-
-/* receive packet handler dispositions */
-#define RCV_PKT_OK      0x0 /* keep going */
-#define RCV_PKT_LIMIT   0x1 /* stop, hit limit, start thread */
-#define RCV_PKT_DONE    0x2 /* stop, no more packets detected */
-
-/* calculate the current RHF address */
-static inline __le32 *get_rhf_addr(struct hfi1_ctxtdata *rcd)
-{
-       return (__le32 *)rcd->rcvhdrq + rcd->head + rcd->dd->rhf_offset;
-}
-
-int hfi1_reset_device(int);
-
-/* return the driver's idea of the logical OPA port state */
-static inline u32 driver_lstate(struct hfi1_pportdata *ppd)
-{
-       return ppd->lstate; /* use the cached value */
-}
-
-void receive_interrupt_work(struct work_struct *work);
-
-/* extract service channel from header and rhf */
-static inline int hdr2sc(struct hfi1_message_header *hdr, u64 rhf)
-{
-       return ((be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf) |
-              ((!!(rhf & RHF_DC_INFO_SMASK)) << 4);
-}
-
-static inline u16 generate_jkey(kuid_t uid)
-{
-       return from_kuid(current_user_ns(), uid) & 0xffff;
-}
-
-/*
- * active_egress_rate
- *
- * returns the active egress rate in units of [10^6 bits/sec]
- */
-static inline u32 active_egress_rate(struct hfi1_pportdata *ppd)
-{
-       u16 link_speed = ppd->link_speed_active;
-       u16 link_width = ppd->link_width_active;
-       u32 egress_rate;
-
-       if (link_speed == OPA_LINK_SPEED_25G)
-               egress_rate = 25000;
-       else /* assume OPA_LINK_SPEED_12_5G */
-               egress_rate = 12500;
-
-       switch (link_width) {
-       case OPA_LINK_WIDTH_4X:
-               egress_rate *= 4;
-               break;
-       case OPA_LINK_WIDTH_3X:
-               egress_rate *= 3;
-               break;
-       case OPA_LINK_WIDTH_2X:
-               egress_rate *= 2;
-               break;
-       default:
-               /* assume IB_WIDTH_1X */
-               break;
-       }
-
-       return egress_rate;
-}
-
-/*
- * egress_cycles
- *
- * Returns the number of 'fabric clock cycles' to egress a packet
- * of length 'len' bytes, at 'rate' Mbit/s. Since the fabric clock
- * rate is (approximately) 805 MHz, the units of the returned value
- * are (1/805 MHz).
- */
-static inline u32 egress_cycles(u32 len, u32 rate)
-{
-       u32 cycles;
-
-       /*
-        * cycles is:
-        *
-        *          (length) [bits] / (rate) [bits/sec]
-        *  ---------------------------------------------------
-        *  fabric_clock_period == 1 /(805 * 10^6) [cycles/sec]
-        */
-
-       cycles = len * 8; /* bits */
-       cycles *= 805;
-       cycles /= rate;
-
-       return cycles;
-}
-
-void set_link_ipg(struct hfi1_pportdata *ppd);
-void process_becn(struct hfi1_pportdata *ppd, u8 sl,  u16 rlid, u32 lqpn,
-                 u32 rqpn, u8 svc_type);
-void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn,
-               u32 pkey, u32 slid, u32 dlid, u8 sc5,
-               const struct ib_grh *old_grh);
-#define PKEY_CHECK_INVALID -1
-int egress_pkey_check(struct hfi1_pportdata *ppd, __be16 *lrh, __be32 *bth,
-                     u8 sc5, int8_t s_pkey_index);
-
-#define PACKET_EGRESS_TIMEOUT 350
-static inline void pause_for_credit_return(struct hfi1_devdata *dd)
-{
-       /* Pause at least 1us, to ensure chip returns all credits */
-       u32 usec = cclock_to_ns(dd, PACKET_EGRESS_TIMEOUT) / 1000;
-
-       udelay(usec ? usec : 1);
-}
-
-/**
- * sc_to_vlt() - reverse lookup sc to vl
- * @dd: devdata
- * @sc5: 5 bit sc
- */
-static inline u8 sc_to_vlt(struct hfi1_devdata *dd, u8 sc5)
-{
-       unsigned seq;
-       u8 rval;
-
-       if (sc5 >= OPA_MAX_SCS)
-               return (u8)(0xff);
-
-       do {
-               seq = read_seqbegin(&dd->sc2vl_lock);
-               rval = *(((u8 *)dd->sc2vl) + sc5);
-       } while (read_seqretry(&dd->sc2vl_lock, seq));
-
-       return rval;
-}
-
-#define PKEY_MEMBER_MASK 0x8000
-#define PKEY_LOW_15_MASK 0x7fff
-
-/*
- * ingress_pkey_matches_entry - return 1 if the pkey matches ent (ent
- * being an entry from the ingress partition key table), return 0
- * otherwise. Use the matching criteria for ingress partition keys
- * specified in the OPAv1 spec., section 9.10.14.
- */
-static inline int ingress_pkey_matches_entry(u16 pkey, u16 ent)
-{
-       u16 mkey = pkey & PKEY_LOW_15_MASK;
-       u16 ment = ent & PKEY_LOW_15_MASK;
-
-       if (mkey == ment) {
-               /*
-                * If pkey[15] is clear (limited partition member),
-                * is bit 15 in the corresponding table element
-                * clear (limited member)?
-                */
-               if (!(pkey & PKEY_MEMBER_MASK))
-                       return !!(ent & PKEY_MEMBER_MASK);
-               return 1;
-       }
-       return 0;
-}
-
-/*
- * ingress_pkey_table_search - search the entire pkey table for
- * an entry which matches 'pkey'. return 0 if a match is found,
- * and 1 otherwise.
- */
-static int ingress_pkey_table_search(struct hfi1_pportdata *ppd, u16 pkey)
-{
-       int i;
-
-       for (i = 0; i < MAX_PKEY_VALUES; i++) {
-               if (ingress_pkey_matches_entry(pkey, ppd->pkeys[i]))
-                       return 0;
-       }
-       return 1;
-}
-
-/*
- * ingress_pkey_table_fail - record a failure of ingress pkey validation,
- * i.e., increment port_rcv_constraint_errors for the port, and record
- * the 'error info' for this failure.
- */
-static void ingress_pkey_table_fail(struct hfi1_pportdata *ppd, u16 pkey,
-                                   u16 slid)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-
-       incr_cntr64(&ppd->port_rcv_constraint_errors);
-       if (!(dd->err_info_rcv_constraint.status & OPA_EI_STATUS_SMASK)) {
-               dd->err_info_rcv_constraint.status |= OPA_EI_STATUS_SMASK;
-               dd->err_info_rcv_constraint.slid = slid;
-               dd->err_info_rcv_constraint.pkey = pkey;
-       }
-}
-
-/*
- * ingress_pkey_check - Return 0 if the ingress pkey is valid, return 1
- * otherwise. Use the criteria in the OPAv1 spec, section 9.10.14. idx
- * is a hint as to the best place in the partition key table to begin
- * searching. This function should not be called on the data path because
- * of performance reasons. On datapath pkey check is expected to be done
- * by HW and rcv_pkey_check function should be called instead.
- */
-static inline int ingress_pkey_check(struct hfi1_pportdata *ppd, u16 pkey,
-                                    u8 sc5, u8 idx, u16 slid)
-{
-       if (!(ppd->part_enforce & HFI1_PART_ENFORCE_IN))
-               return 0;
-
-       /* If SC15, pkey[0:14] must be 0x7fff */
-       if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
-               goto bad;
-
-       /* Is the pkey = 0x0, or 0x8000? */
-       if ((pkey & PKEY_LOW_15_MASK) == 0)
-               goto bad;
-
-       /* The most likely matching pkey has index 'idx' */
-       if (ingress_pkey_matches_entry(pkey, ppd->pkeys[idx]))
-               return 0;
-
-       /* no match - try the whole table */
-       if (!ingress_pkey_table_search(ppd, pkey))
-               return 0;
-
-bad:
-       ingress_pkey_table_fail(ppd, pkey, slid);
-       return 1;
-}
-
-/*
- * rcv_pkey_check - Return 0 if the ingress pkey is valid, return 1
- * otherwise. It only ensures pkey is valid for QP0. This function
- * should be called on the data path instead of ingress_pkey_check
- * as on data path, pkey check is done by HW (except for QP0).
- */
-static inline int rcv_pkey_check(struct hfi1_pportdata *ppd, u16 pkey,
-                                u8 sc5, u16 slid)
-{
-       if (!(ppd->part_enforce & HFI1_PART_ENFORCE_IN))
-               return 0;
-
-       /* If SC15, pkey[0:14] must be 0x7fff */
-       if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
-               goto bad;
-
-       return 0;
-bad:
-       ingress_pkey_table_fail(ppd, pkey, slid);
-       return 1;
-}
-
-/* MTU handling */
-
-/* MTU enumeration, 256-4k match IB */
-#define OPA_MTU_0     0
-#define OPA_MTU_256   1
-#define OPA_MTU_512   2
-#define OPA_MTU_1024  3
-#define OPA_MTU_2048  4
-#define OPA_MTU_4096  5
-
-u32 lrh_max_header_bytes(struct hfi1_devdata *dd);
-int mtu_to_enum(u32 mtu, int default_if_bad);
-u16 enum_to_mtu(int);
-static inline int valid_ib_mtu(unsigned int mtu)
-{
-       return mtu == 256 || mtu == 512 ||
-               mtu == 1024 || mtu == 2048 ||
-               mtu == 4096;
-}
-
-static inline int valid_opa_max_mtu(unsigned int mtu)
-{
-       return mtu >= 2048 &&
-               (valid_ib_mtu(mtu) || mtu == 8192 || mtu == 10240);
-}
-
-int set_mtu(struct hfi1_pportdata *);
-
-int hfi1_set_lid(struct hfi1_pportdata *, u32, u8);
-void hfi1_disable_after_error(struct hfi1_devdata *);
-int hfi1_set_uevent_bits(struct hfi1_pportdata *, const int);
-int hfi1_rcvbuf_validate(u32, u8, u16 *);
-
-int fm_get_table(struct hfi1_pportdata *, int, void *);
-int fm_set_table(struct hfi1_pportdata *, int, void *);
-
-void set_up_vl15(struct hfi1_devdata *dd, u8 vau, u16 vl15buf);
-void reset_link_credits(struct hfi1_devdata *dd);
-void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu);
-
-int snoop_recv_handler(struct hfi1_packet *packet);
-int snoop_send_dma_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
-                          u64 pbc);
-int snoop_send_pio_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
-                          u64 pbc);
-void snoop_inline_pio_send(struct hfi1_devdata *dd, struct pio_buf *pbuf,
-                          u64 pbc, const void *from, size_t count);
-int set_buffer_control(struct hfi1_pportdata *ppd, struct buffer_control *bc);
-
-static inline struct hfi1_devdata *dd_from_ppd(struct hfi1_pportdata *ppd)
-{
-       return ppd->dd;
-}
-
-static inline struct hfi1_devdata *dd_from_dev(struct hfi1_ibdev *dev)
-{
-       return container_of(dev, struct hfi1_devdata, verbs_dev);
-}
-
-static inline struct hfi1_devdata *dd_from_ibdev(struct ib_device *ibdev)
-{
-       return dd_from_dev(to_idev(ibdev));
-}
-
-static inline struct hfi1_pportdata *ppd_from_ibp(struct hfi1_ibport *ibp)
-{
-       return container_of(ibp, struct hfi1_pportdata, ibport_data);
-}
-
-static inline struct hfi1_ibdev *dev_from_rdi(struct rvt_dev_info *rdi)
-{
-       return container_of(rdi, struct hfi1_ibdev, rdi);
-}
-
-static inline struct hfi1_ibport *to_iport(struct ib_device *ibdev, u8 port)
-{
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */
-
-       WARN_ON(pidx >= dd->num_pports);
-       return &dd->pport[pidx].ibport_data;
-}
-
-/*
- * Return the indexed PKEY from the port PKEY table.
- */
-static inline u16 hfi1_get_pkey(struct hfi1_ibport *ibp, unsigned index)
-{
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       u16 ret;
-
-       if (index >= ARRAY_SIZE(ppd->pkeys))
-               ret = 0;
-       else
-               ret = ppd->pkeys[index];
-
-       return ret;
-}
-
-/*
- * Readers of cc_state must call get_cc_state() under rcu_read_lock().
- * Writers of cc_state must call get_cc_state() under cc_state_lock.
- */
-static inline struct cc_state *get_cc_state(struct hfi1_pportdata *ppd)
-{
-       return rcu_dereference(ppd->cc_state);
-}
-
-/*
- * values for dd->flags (_device_ related flags)
- */
-#define HFI1_INITTED           0x1    /* chip and driver up and initted */
-#define HFI1_PRESENT           0x2    /* chip accesses can be done */
-#define HFI1_FROZEN            0x4    /* chip in SPC freeze */
-#define HFI1_HAS_SDMA_TIMEOUT  0x8
-#define HFI1_HAS_SEND_DMA      0x10   /* Supports Send DMA */
-#define HFI1_FORCED_FREEZE     0x80   /* driver forced freeze mode */
-
-/* IB dword length mask in PBC (lower 11 bits); same for all chips */
-#define HFI1_PBC_LENGTH_MASK                     ((1 << 11) - 1)
-
-/* ctxt_flag bit offsets */
-               /* context has been setup */
-#define HFI1_CTXT_SETUP_DONE 1
-               /* waiting for a packet to arrive */
-#define HFI1_CTXT_WAITING_RCV   2
-               /* master has not finished initializing */
-#define HFI1_CTXT_MASTER_UNINIT 4
-               /* waiting for an urgent packet to arrive */
-#define HFI1_CTXT_WAITING_URG 5
-
-/* free up any allocated data at closes */
-struct hfi1_devdata *hfi1_init_dd(struct pci_dev *,
-                                 const struct pci_device_id *);
-void hfi1_free_devdata(struct hfi1_devdata *);
-void cc_state_reclaim(struct rcu_head *rcu);
-struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra);
-
-/* LED beaconing functions */
-void hfi1_start_led_override(struct hfi1_pportdata *ppd, unsigned int timeon,
-                            unsigned int timeoff);
-void shutdown_led_override(struct hfi1_pportdata *ppd);
-
-#define HFI1_CREDIT_RETURN_RATE (100)
-
-/*
- * The number of words for the KDETH protocol field.  If this is
- * larger then the actual field used, then part of the payload
- * will be in the header.
- *
- * Optimally, we want this sized so that a typical case will
- * use full cache lines.  The typical local KDETH header would
- * be:
- *
- *     Bytes   Field
- *       8     LRH
- *      12     BHT
- *      ??     KDETH
- *       8     RHF
- *     ---
- *      28 + KDETH
- *
- * For a 64-byte cache line, KDETH would need to be 36 bytes or 9 DWORDS
- */
-#define DEFAULT_RCVHDRSIZE 9
-
-/*
- * Maximal header byte count:
- *
- *     Bytes   Field
- *       8     LRH
- *      40     GRH (optional)
- *      12     BTH
- *      ??     KDETH
- *       8     RHF
- *     ---
- *      68 + KDETH
- *
- * We also want to maintain a cache line alignment to assist DMA'ing
- * of the header bytes.  Round up to a good size.
- */
-#define DEFAULT_RCVHDR_ENTSIZE 32
-
-bool hfi1_can_pin_pages(struct hfi1_devdata *, u32, u32);
-int hfi1_acquire_user_pages(unsigned long, size_t, bool, struct page **);
-void hfi1_release_user_pages(struct mm_struct *, struct page **, size_t, bool);
-
-static inline void clear_rcvhdrtail(const struct hfi1_ctxtdata *rcd)
-{
-       *((u64 *)rcd->rcvhdrtail_kvaddr) = 0ULL;
-}
-
-static inline u32 get_rcvhdrtail(const struct hfi1_ctxtdata *rcd)
-{
-       /*
-        * volatile because it's a DMA target from the chip, routine is
-        * inlined, and don't want register caching or reordering.
-        */
-       return (u32)le64_to_cpu(*rcd->rcvhdrtail_kvaddr);
-}
-
-/*
- * sysfs interface.
- */
-
-extern const char ib_hfi1_version[];
-
-int hfi1_device_create(struct hfi1_devdata *);
-void hfi1_device_remove(struct hfi1_devdata *);
-
-int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
-                          struct kobject *kobj);
-int hfi1_verbs_register_sysfs(struct hfi1_devdata *);
-void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *);
-/* Hook for sysfs read of QSFP */
-int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len);
-
-int hfi1_pcie_init(struct pci_dev *, const struct pci_device_id *);
-void hfi1_pcie_cleanup(struct pci_dev *);
-int hfi1_pcie_ddinit(struct hfi1_devdata *, struct pci_dev *,
-                    const struct pci_device_id *);
-void hfi1_pcie_ddcleanup(struct hfi1_devdata *);
-void hfi1_pcie_flr(struct hfi1_devdata *);
-int pcie_speeds(struct hfi1_devdata *);
-void request_msix(struct hfi1_devdata *, u32 *, struct hfi1_msix_entry *);
-void hfi1_enable_intx(struct pci_dev *);
-void restore_pci_variables(struct hfi1_devdata *dd);
-int do_pcie_gen3_transition(struct hfi1_devdata *dd);
-int parse_platform_config(struct hfi1_devdata *dd);
-int get_platform_config_field(struct hfi1_devdata *dd,
-                             enum platform_config_table_type_encoding
-                             table_type, int table_index, int field_index,
-                             u32 *data, u32 len);
-
-const char *get_unit_name(int unit);
-const char *get_card_name(struct rvt_dev_info *rdi);
-struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi);
-
-/*
- * Flush write combining store buffers (if present) and perform a write
- * barrier.
- */
-static inline void flush_wc(void)
-{
-       asm volatile("sfence" : : : "memory");
-}
-
-void handle_eflags(struct hfi1_packet *packet);
-int process_receive_ib(struct hfi1_packet *packet);
-int process_receive_bypass(struct hfi1_packet *packet);
-int process_receive_error(struct hfi1_packet *packet);
-int kdeth_process_expected(struct hfi1_packet *packet);
-int kdeth_process_eager(struct hfi1_packet *packet);
-int process_receive_invalid(struct hfi1_packet *packet);
-
-extern rhf_rcv_function_ptr snoop_rhf_rcv_functions[8];
-
-void update_sge(struct rvt_sge_state *ss, u32 length);
-
-/* global module parameter variables */
-extern unsigned int hfi1_max_mtu;
-extern unsigned int hfi1_cu;
-extern unsigned int user_credit_return_threshold;
-extern int num_user_contexts;
-extern unsigned n_krcvqs;
-extern uint krcvqs[];
-extern int krcvqsset;
-extern uint kdeth_qp;
-extern uint loopback;
-extern uint quick_linkup;
-extern uint rcv_intr_timeout;
-extern uint rcv_intr_count;
-extern uint rcv_intr_dynamic;
-extern ushort link_crc_mask;
-
-extern struct mutex hfi1_mutex;
-
-/* Number of seconds before our card status check...  */
-#define STATUS_TIMEOUT 60
-
-#define DRIVER_NAME            "hfi1"
-#define HFI1_USER_MINOR_BASE     0
-#define HFI1_TRACE_MINOR         127
-#define HFI1_DIAGPKT_MINOR       128
-#define HFI1_DIAG_MINOR_BASE     129
-#define HFI1_SNOOP_CAPTURE_BASE  200
-#define HFI1_NMINORS             255
-
-#define PCI_VENDOR_ID_INTEL 0x8086
-#define PCI_DEVICE_ID_INTEL0 0x24f0
-#define PCI_DEVICE_ID_INTEL1 0x24f1
-
-#define HFI1_PKT_USER_SC_INTEGRITY                                         \
-       (SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK            \
-       | SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK           \
-       | SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK              \
-       | SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK)
-
-#define HFI1_PKT_KERNEL_SC_INTEGRITY                                       \
-       (SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK)
-
-static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd,
-                                                 u16 ctxt_type)
-{
-       u64 base_sc_integrity =
-       SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK
-       | SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK
-       | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK
-       | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK
-       | SEND_CTXT_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK
-       | SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK
-       | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK
-       | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK
-       | SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK
-       | SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_SMASK
-       | SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK
-       | SEND_CTXT_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK
-       | SEND_CTXT_CHECK_ENABLE_CHECK_OPCODE_SMASK
-       | SEND_CTXT_CHECK_ENABLE_CHECK_SLID_SMASK
-       | SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK
-       | SEND_CTXT_CHECK_ENABLE_CHECK_VL_SMASK
-       | SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK;
-
-       if (ctxt_type == SC_USER)
-               base_sc_integrity |= HFI1_PKT_USER_SC_INTEGRITY;
-       else
-               base_sc_integrity |= HFI1_PKT_KERNEL_SC_INTEGRITY;
-
-       if (is_ax(dd))
-               /* turn off send-side job key checks - A0 */
-               return base_sc_integrity &
-                      ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
-       return base_sc_integrity;
-}
-
-static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata *dd)
-{
-       u64 base_sdma_integrity =
-       SEND_DMA_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK
-       | SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK
-       | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK
-       | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK
-       | SEND_DMA_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK
-       | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK
-       | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK
-       | SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK
-       | SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_SMASK
-       | SEND_DMA_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK
-       | SEND_DMA_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK
-       | SEND_DMA_CHECK_ENABLE_CHECK_OPCODE_SMASK
-       | SEND_DMA_CHECK_ENABLE_CHECK_SLID_SMASK
-       | SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK
-       | SEND_DMA_CHECK_ENABLE_CHECK_VL_SMASK
-       | SEND_DMA_CHECK_ENABLE_CHECK_ENABLE_SMASK;
-
-       if (is_ax(dd))
-               /* turn off send-side job key checks - A0 */
-               return base_sdma_integrity &
-                      ~SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
-       return base_sdma_integrity;
-}
-
-/*
- * hfi1_early_err is used (only!) to print early errors before devdata is
- * allocated, or when dd->pcidev may not be valid, and at the tail end of
- * cleanup when devdata may have been freed, etc.  hfi1_dev_porterr is
- * the same as dd_dev_err, but is used when the message really needs
- * the IB port# to be definitive as to what's happening..
- */
-#define hfi1_early_err(dev, fmt, ...) \
-       dev_err(dev, fmt, ##__VA_ARGS__)
-
-#define hfi1_early_info(dev, fmt, ...) \
-       dev_info(dev, fmt, ##__VA_ARGS__)
-
-#define dd_dev_emerg(dd, fmt, ...) \
-       dev_emerg(&(dd)->pcidev->dev, "%s: " fmt, \
-                 get_unit_name((dd)->unit), ##__VA_ARGS__)
-#define dd_dev_err(dd, fmt, ...) \
-       dev_err(&(dd)->pcidev->dev, "%s: " fmt, \
-                       get_unit_name((dd)->unit), ##__VA_ARGS__)
-#define dd_dev_warn(dd, fmt, ...) \
-       dev_warn(&(dd)->pcidev->dev, "%s: " fmt, \
-                       get_unit_name((dd)->unit), ##__VA_ARGS__)
-
-#define dd_dev_warn_ratelimited(dd, fmt, ...) \
-       dev_warn_ratelimited(&(dd)->pcidev->dev, "%s: " fmt, \
-                       get_unit_name((dd)->unit), ##__VA_ARGS__)
-
-#define dd_dev_info(dd, fmt, ...) \
-       dev_info(&(dd)->pcidev->dev, "%s: " fmt, \
-                       get_unit_name((dd)->unit), ##__VA_ARGS__)
-
-#define dd_dev_dbg(dd, fmt, ...) \
-       dev_dbg(&(dd)->pcidev->dev, "%s: " fmt, \
-               get_unit_name((dd)->unit), ##__VA_ARGS__)
-
-#define hfi1_dev_porterr(dd, port, fmt, ...) \
-       dev_err(&(dd)->pcidev->dev, "%s: IB%u:%u " fmt, \
-                       get_unit_name((dd)->unit), (dd)->unit, (port), \
-                       ##__VA_ARGS__)
-
-/*
- * this is used for formatting hw error messages...
- */
-struct hfi1_hwerror_msgs {
-       u64 mask;
-       const char *msg;
-       size_t sz;
-};
-
-/* in intr.c... */
-void hfi1_format_hwerrors(u64 hwerrs,
-                         const struct hfi1_hwerror_msgs *hwerrmsgs,
-                         size_t nhwerrmsgs, char *msg, size_t lmsg);
-
-#define USER_OPCODE_CHECK_VAL 0xC0
-#define USER_OPCODE_CHECK_MASK 0xC0
-#define OPCODE_CHECK_VAL_DISABLED 0x0
-#define OPCODE_CHECK_MASK_DISABLED 0x0
-
-static inline void hfi1_reset_cpu_counters(struct hfi1_devdata *dd)
-{
-       struct hfi1_pportdata *ppd;
-       int i;
-
-       dd->z_int_counter = get_all_cpu_total(dd->int_counter);
-       dd->z_rcv_limit = get_all_cpu_total(dd->rcv_limit);
-       dd->z_send_schedule = get_all_cpu_total(dd->send_schedule);
-
-       ppd = (struct hfi1_pportdata *)(dd + 1);
-       for (i = 0; i < dd->num_pports; i++, ppd++) {
-               ppd->ibport_data.rvp.z_rc_acks =
-                       get_all_cpu_total(ppd->ibport_data.rvp.rc_acks);
-               ppd->ibport_data.rvp.z_rc_qacks =
-                       get_all_cpu_total(ppd->ibport_data.rvp.rc_qacks);
-       }
-}
-
-/* Control LED state */
-static inline void setextled(struct hfi1_devdata *dd, u32 on)
-{
-       if (on)
-               write_csr(dd, DCC_CFG_LED_CNTRL, 0x1F);
-       else
-               write_csr(dd, DCC_CFG_LED_CNTRL, 0x10);
-}
-
-/* return the i2c resource given the target */
-static inline u32 i2c_target(u32 target)
-{
-       return target ? CR_I2C2 : CR_I2C1;
-}
-
-/* return the i2c chain chip resource that this HFI uses for QSFP */
-static inline u32 qsfp_resource(struct hfi1_devdata *dd)
-{
-       return i2c_target(dd->hfi1_id);
-}
-
-int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp);
-
-#endif                          /* _HFI1_KERNEL_H */
diff --git a/drivers/staging/rdma/hfi1/init.c b/drivers/staging/rdma/hfi1/init.c
deleted file mode 100644 (file)
index 502b7cf..0000000
+++ /dev/null
@@ -1,1806 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/pci.h>
-#include <linux/netdevice.h>
-#include <linux/vmalloc.h>
-#include <linux/delay.h>
-#include <linux/idr.h>
-#include <linux/module.h>
-#include <linux/printk.h>
-#include <linux/hrtimer.h>
-#include <rdma/rdma_vt.h>
-
-#include "hfi.h"
-#include "device.h"
-#include "common.h"
-#include "trace.h"
-#include "mad.h"
-#include "sdma.h"
-#include "debugfs.h"
-#include "verbs.h"
-#include "aspm.h"
-
-#undef pr_fmt
-#define pr_fmt(fmt) DRIVER_NAME ": " fmt
-
-/*
- * min buffers we want to have per context, after driver
- */
-#define HFI1_MIN_USER_CTXT_BUFCNT 7
-
-#define HFI1_MIN_HDRQ_EGRBUF_CNT 2
-#define HFI1_MAX_HDRQ_EGRBUF_CNT 16352
-#define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */
-#define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */
-
-/*
- * Number of user receive contexts we are configured to use (to allow for more
- * pio buffers per ctxt, etc.)  Zero means use one user context per CPU.
- */
-int num_user_contexts = -1;
-module_param_named(num_user_contexts, num_user_contexts, uint, S_IRUGO);
-MODULE_PARM_DESC(
-       num_user_contexts, "Set max number of user contexts to use");
-
-uint krcvqs[RXE_NUM_DATA_VL];
-int krcvqsset;
-module_param_array(krcvqs, uint, &krcvqsset, S_IRUGO);
-MODULE_PARM_DESC(krcvqs, "Array of the number of non-control kernel receive queues by VL");
-
-/* computed based on above array */
-unsigned n_krcvqs;
-
-static unsigned hfi1_rcvarr_split = 25;
-module_param_named(rcvarr_split, hfi1_rcvarr_split, uint, S_IRUGO);
-MODULE_PARM_DESC(rcvarr_split, "Percent of context's RcvArray entries used for Eager buffers");
-
-static uint eager_buffer_size = (2 << 20); /* 2MB */
-module_param(eager_buffer_size, uint, S_IRUGO);
-MODULE_PARM_DESC(eager_buffer_size, "Size of the eager buffers, default: 2MB");
-
-static uint rcvhdrcnt = 2048; /* 2x the max eager buffer count */
-module_param_named(rcvhdrcnt, rcvhdrcnt, uint, S_IRUGO);
-MODULE_PARM_DESC(rcvhdrcnt, "Receive header queue count (default 2048)");
-
-static uint hfi1_hdrq_entsize = 32;
-module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, S_IRUGO);
-MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B (default), 32 - 128B");
-
-unsigned int user_credit_return_threshold = 33;        /* default is 33% */
-module_param(user_credit_return_threshold, uint, S_IRUGO);
-MODULE_PARM_DESC(user_credit_return_threshold, "Credit return threshold for user send contexts, return when unreturned credits passes this many blocks (in percent of allocated blocks, 0 is off)");
-
-static inline u64 encode_rcv_header_entry_size(u16);
-
-static struct idr hfi1_unit_table;
-u32 hfi1_cpulist_count;
-unsigned long *hfi1_cpulist;
-
-/*
- * Common code for creating the receive context array.
- */
-int hfi1_create_ctxts(struct hfi1_devdata *dd)
-{
-       unsigned i;
-       int ret;
-
-       /* Control context has to be always 0 */
-       BUILD_BUG_ON(HFI1_CTRL_CTXT != 0);
-
-       dd->rcd = kzalloc_node(dd->num_rcv_contexts * sizeof(*dd->rcd),
-                              GFP_KERNEL, dd->node);
-       if (!dd->rcd)
-               goto nomem;
-
-       /* create one or more kernel contexts */
-       for (i = 0; i < dd->first_user_ctxt; ++i) {
-               struct hfi1_pportdata *ppd;
-               struct hfi1_ctxtdata *rcd;
-
-               ppd = dd->pport + (i % dd->num_pports);
-               rcd = hfi1_create_ctxtdata(ppd, i, dd->node);
-               if (!rcd) {
-                       dd_dev_err(dd,
-                                  "Unable to allocate kernel receive context, failing\n");
-                       goto nomem;
-               }
-               /*
-                * Set up the kernel context flags here and now because they
-                * use default values for all receive side memories.  User
-                * contexts will be handled as they are created.
-                */
-               rcd->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) |
-                       HFI1_CAP_KGET(NODROP_RHQ_FULL) |
-                       HFI1_CAP_KGET(NODROP_EGR_FULL) |
-                       HFI1_CAP_KGET(DMA_RTAIL);
-
-               /* Control context must use DMA_RTAIL */
-               if (rcd->ctxt == HFI1_CTRL_CTXT)
-                       rcd->flags |= HFI1_CAP_DMA_RTAIL;
-               rcd->seq_cnt = 1;
-
-               rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node);
-               if (!rcd->sc) {
-                       dd_dev_err(dd,
-                                  "Unable to allocate kernel send context, failing\n");
-                       dd->rcd[rcd->ctxt] = NULL;
-                       hfi1_free_ctxtdata(dd, rcd);
-                       goto nomem;
-               }
-
-               ret = hfi1_init_ctxt(rcd->sc);
-               if (ret < 0) {
-                       dd_dev_err(dd,
-                                  "Failed to setup kernel receive context, failing\n");
-                       sc_free(rcd->sc);
-                       dd->rcd[rcd->ctxt] = NULL;
-                       hfi1_free_ctxtdata(dd, rcd);
-                       ret = -EFAULT;
-                       goto bail;
-               }
-       }
-
-       /*
-        * Initialize aspm, to be done after gen3 transition and setting up
-        * contexts and before enabling interrupts
-        */
-       aspm_init(dd);
-
-       return 0;
-nomem:
-       ret = -ENOMEM;
-bail:
-       kfree(dd->rcd);
-       dd->rcd = NULL;
-       return ret;
-}
-
-/*
- * Common code for user and kernel context setup.
- */
-struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt,
-                                          int numa)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       struct hfi1_ctxtdata *rcd;
-       unsigned kctxt_ngroups = 0;
-       u32 base;
-
-       if (dd->rcv_entries.nctxt_extra >
-           dd->num_rcv_contexts - dd->first_user_ctxt)
-               kctxt_ngroups = (dd->rcv_entries.nctxt_extra -
-                                (dd->num_rcv_contexts - dd->first_user_ctxt));
-       rcd = kzalloc(sizeof(*rcd), GFP_KERNEL);
-       if (rcd) {
-               u32 rcvtids, max_entries;
-
-               hfi1_cdbg(PROC, "setting up context %u\n", ctxt);
-
-               INIT_LIST_HEAD(&rcd->qp_wait_list);
-               rcd->ppd = ppd;
-               rcd->dd = dd;
-               rcd->cnt = 1;
-               rcd->ctxt = ctxt;
-               dd->rcd[ctxt] = rcd;
-               rcd->numa_id = numa;
-               rcd->rcv_array_groups = dd->rcv_entries.ngroups;
-
-               mutex_init(&rcd->exp_lock);
-
-               /*
-                * Calculate the context's RcvArray entry starting point.
-                * We do this here because we have to take into account all
-                * the RcvArray entries that previous context would have
-                * taken and we have to account for any extra groups
-                * assigned to the kernel or user contexts.
-                */
-               if (ctxt < dd->first_user_ctxt) {
-                       if (ctxt < kctxt_ngroups) {
-                               base = ctxt * (dd->rcv_entries.ngroups + 1);
-                               rcd->rcv_array_groups++;
-                       } else
-                               base = kctxt_ngroups +
-                                       (ctxt * dd->rcv_entries.ngroups);
-               } else {
-                       u16 ct = ctxt - dd->first_user_ctxt;
-
-                       base = ((dd->n_krcv_queues * dd->rcv_entries.ngroups) +
-                               kctxt_ngroups);
-                       if (ct < dd->rcv_entries.nctxt_extra) {
-                               base += ct * (dd->rcv_entries.ngroups + 1);
-                               rcd->rcv_array_groups++;
-                       } else
-                               base += dd->rcv_entries.nctxt_extra +
-                                       (ct * dd->rcv_entries.ngroups);
-               }
-               rcd->eager_base = base * dd->rcv_entries.group_size;
-
-               /* Validate and initialize Rcv Hdr Q variables */
-               if (rcvhdrcnt % HDRQ_INCREMENT) {
-                       dd_dev_err(dd,
-                                  "ctxt%u: header queue count %d must be divisible by %lu\n",
-                                  rcd->ctxt, rcvhdrcnt, HDRQ_INCREMENT);
-                       goto bail;
-               }
-               rcd->rcvhdrq_cnt = rcvhdrcnt;
-               rcd->rcvhdrqentsize = hfi1_hdrq_entsize;
-               /*
-                * Simple Eager buffer allocation: we have already pre-allocated
-                * the number of RcvArray entry groups. Each ctxtdata structure
-                * holds the number of groups for that context.
-                *
-                * To follow CSR requirements and maintain cacheline alignment,
-                * make sure all sizes and bases are multiples of group_size.
-                *
-                * The expected entry count is what is left after assigning
-                * eager.
-                */
-               max_entries = rcd->rcv_array_groups *
-                       dd->rcv_entries.group_size;
-               rcvtids = ((max_entries * hfi1_rcvarr_split) / 100);
-               rcd->egrbufs.count = round_down(rcvtids,
-                                               dd->rcv_entries.group_size);
-               if (rcd->egrbufs.count > MAX_EAGER_ENTRIES) {
-                       dd_dev_err(dd, "ctxt%u: requested too many RcvArray entries.\n",
-                                  rcd->ctxt);
-                       rcd->egrbufs.count = MAX_EAGER_ENTRIES;
-               }
-               hfi1_cdbg(PROC,
-                         "ctxt%u: max Eager buffer RcvArray entries: %u\n",
-                         rcd->ctxt, rcd->egrbufs.count);
-
-               /*
-                * Allocate array that will hold the eager buffer accounting
-                * data.
-                * This will allocate the maximum possible buffer count based
-                * on the value of the RcvArray split parameter.
-                * The resulting value will be rounded down to the closest
-                * multiple of dd->rcv_entries.group_size.
-                */
-               rcd->egrbufs.buffers = kcalloc(rcd->egrbufs.count,
-                                              sizeof(*rcd->egrbufs.buffers),
-                                              GFP_KERNEL);
-               if (!rcd->egrbufs.buffers)
-                       goto bail;
-               rcd->egrbufs.rcvtids = kcalloc(rcd->egrbufs.count,
-                                              sizeof(*rcd->egrbufs.rcvtids),
-                                              GFP_KERNEL);
-               if (!rcd->egrbufs.rcvtids)
-                       goto bail;
-               rcd->egrbufs.size = eager_buffer_size;
-               /*
-                * The size of the buffers programmed into the RcvArray
-                * entries needs to be big enough to handle the highest
-                * MTU supported.
-                */
-               if (rcd->egrbufs.size < hfi1_max_mtu) {
-                       rcd->egrbufs.size = __roundup_pow_of_two(hfi1_max_mtu);
-                       hfi1_cdbg(PROC,
-                                 "ctxt%u: eager bufs size too small. Adjusting to %zu\n",
-                                   rcd->ctxt, rcd->egrbufs.size);
-               }
-               rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE;
-
-               if (ctxt < dd->first_user_ctxt) { /* N/A for PSM contexts */
-                       rcd->opstats = kzalloc(sizeof(*rcd->opstats),
-                               GFP_KERNEL);
-                       if (!rcd->opstats)
-                               goto bail;
-               }
-       }
-       return rcd;
-bail:
-       kfree(rcd->egrbufs.rcvtids);
-       kfree(rcd->egrbufs.buffers);
-       kfree(rcd);
-       return NULL;
-}
-
-/*
- * Convert a receive header entry size that to the encoding used in the CSR.
- *
- * Return a zero if the given size is invalid.
- */
-static inline u64 encode_rcv_header_entry_size(u16 size)
-{
-       /* there are only 3 valid receive header entry sizes */
-       if (size == 2)
-               return 1;
-       if (size == 16)
-               return 2;
-       else if (size == 32)
-               return 4;
-       return 0; /* invalid */
-}
-
-/*
- * Program the link's static rate control from the largest ccti held by
- * any SL's congestion control timer, which sets the intra-packet gap.
- *
- * The caller holds cca_timer_lock (guards the cca_timer array) and is
- * inside an rcu_read_lock() section (guards cc_state).
- */
-void set_link_ipg(struct hfi1_pportdata *ppd)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       struct cc_state *state = get_cc_state(ppd);
-       u16 highest = 0;
-       u16 entry, limit, shift, mult;
-       u32 egress_rate; /* Mbits /sec */
-       u32 pkt_cycles; /* max packet egress time, in fabric clock periods of 1/(805 MHz) */
-       u64 reload;
-       int sl;
-
-       /*
-        * Not expected to trigger: rcu_read_lock() is held and callers
-        * should not get here when there is no cc_state.
-        */
-       if (!state)
-               return;
-
-       /* find the largest ccti over all SLs */
-       for (sl = 0; sl < OPA_MAX_SLS; sl++)
-               if (ppd->cca_timer[sl].ccti > highest)
-                       highest = ppd->cca_timer[sl].ccti;
-
-       /* clamp to the table's ccti_limit before using it as an index */
-       limit = state->cct.ccti_limit;
-       if (highest > limit)
-               highest = limit;
-
-       /* a table entry packs a shift in bits 15:14 and a multiplier below */
-       entry = state->cct.entries[highest].entry;
-       shift = (entry & 0xc000) >> 14;
-       mult = entry & 0x3fff;
-
-       egress_rate = active_egress_rate(ppd);
-       pkt_cycles = egress_cycles(ppd->ibmaxlen, egress_rate);
-
-       reload = (pkt_cycles >> shift) * mult;
-       reload &= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK;
-       reload <<= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT;
-
-       write_csr(dd, SEND_STATIC_RATE_CONTROL, reload);
-}
-
-/*
- * hrtimer callback for one SL's congestion control timer.
- *
- * Each expiry:
- *   1) decrements the SL's ccti if it is above ccti_min,
- *   2) recomputes the link IPG via set_link_ipg() when it decremented,
- *   3) re-arms the timer unless the ccti has reached ccti_min.
- *
- * Returns HRTIMER_RESTART when re-armed, HRTIMER_NORESTART otherwise
- * (including when the port has no cc_state).
- */
-static enum hrtimer_restart cca_timer_fn(struct hrtimer *t)
-{
-       struct cca_timer *timer = container_of(t, struct cca_timer, hrtimer);
-       struct hfi1_pportdata *ppd = timer->ppd;
-       int sl = timer->sl;
-       enum hrtimer_restart verdict = HRTIMER_NORESTART;
-       struct cc_state *state;
-       unsigned long irqflags;
-       u16 min_ccti, period;
-
-       rcu_read_lock();
-
-       state = get_cc_state(ppd);
-       if (!state)
-               goto unlock_rcu;
-
-       min_ccti = state->cong_setting.entries[sl].ccti_min;
-       period = state->cong_setting.entries[sl].ccti_timer;
-
-       spin_lock_irqsave(&ppd->cca_timer_lock, irqflags);
-
-       if (timer->ccti > min_ccti) {
-               timer->ccti--;
-               set_link_ipg(ppd);
-       }
-
-       /* still above the minimum after the decrement: keep ticking */
-       if (timer->ccti > min_ccti) {
-               /* ccti_timer is in units of 1.024 usec */
-               unsigned long nsec = 1024 * period;
-
-               hrtimer_forward_now(t, ns_to_ktime(nsec));
-               verdict = HRTIMER_RESTART;
-       }
-
-       spin_unlock_irqrestore(&ppd->cca_timer_lock, irqflags);
-unlock_rcu:
-       rcu_read_unlock();
-       return verdict;
-}
-
-/*
- * Common code for initializing the physical port structure.
- *
- * @pdev: PCI device, used only for early error/log messages
- * @ppd: port structure to initialize
- * @dd: owning device
- * @hw_pidx: hardware port index
- * @port: IB port number (not an index)
- *
- * Sets the default pkey, the work items, locks and per-SL congestion
- * control timers, then allocates the initial cc_state.  If that
- * allocation fails the port is left with a NULL cc_state and a message
- * is logged; nothing is returned to the caller.
- */
-void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
-                        struct hfi1_devdata *dd, u8 hw_pidx, u8 port)
-{
-       struct cc_state *cc_state;
-       uint default_pkey_idx;
-       int i;
-
-       ppd->dd = dd;
-       ppd->hw_pidx = hw_pidx;
-       ppd->port = port; /* IB port number, not index */
-
-       default_pkey_idx = 1;
-
-       ppd->pkeys[default_pkey_idx] = DEFAULT_P_KEY;
-       if (loopback) {
-               /* !default_pkey_idx is the other of the first two slots */
-               hfi1_early_err(&pdev->dev,
-                              "Faking data partition 0x8001 in idx %u\n",
-                              !default_pkey_idx);
-               ppd->pkeys[!default_pkey_idx] = 0x8001;
-       }
-
-       INIT_WORK(&ppd->link_vc_work, handle_verify_cap);
-       INIT_WORK(&ppd->link_up_work, handle_link_up);
-       INIT_WORK(&ppd->link_down_work, handle_link_down);
-       INIT_WORK(&ppd->freeze_work, handle_freeze);
-       INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade);
-       INIT_WORK(&ppd->sma_message_work, handle_sma_message);
-       INIT_WORK(&ppd->link_bounce_work, handle_link_bounce);
-       INIT_WORK(&ppd->linkstate_active_work, receive_interrupt_work);
-       INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event);
-
-       mutex_init(&ppd->hls_lock);
-       spin_lock_init(&ppd->sdma_alllock);
-       spin_lock_init(&ppd->qsfp_info.qsfp_lock);
-
-       ppd->qsfp_info.ppd = ppd;
-       ppd->sm_trap_qp = 0x0;
-       ppd->sa_qp = 0x1;
-
-       ppd->hfi1_wq = NULL;
-
-       spin_lock_init(&ppd->cca_timer_lock);
-
-       for (i = 0; i < OPA_MAX_SLS; i++) {
-               hrtimer_init(&ppd->cca_timer[i].hrtimer, CLOCK_MONOTONIC,
-                            HRTIMER_MODE_REL);
-               ppd->cca_timer[i].ppd = ppd;
-               ppd->cca_timer[i].sl = i;
-               ppd->cca_timer[i].ccti = 0;
-               ppd->cca_timer[i].hrtimer.function = cca_timer_fn;
-       }
-
-       ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT;
-
-       spin_lock_init(&ppd->cc_state_lock);
-       spin_lock_init(&ppd->cc_log_lock);
-
-       /*
-        * Test the allocation through the local pointer.  Reading it back
-        * with rcu_dereference() would need an RCU read-side critical
-        * section, which is not held here.
-        */
-       cc_state = kzalloc(sizeof(*cc_state), GFP_KERNEL);
-       RCU_INIT_POINTER(ppd->cc_state, cc_state);
-       if (!cc_state)
-               hfi1_early_err(&pdev->dev,
-                              "Congestion Control Agent disabled for port %d\n",
-                              port);
-}
-
-/*
- * Do initialization for device that is only needed on
- * first detect, not on resets.
- *
- * Currently a placeholder: there is no such work, so it always
- * returns 0.
- */
-static int loadtime_init(struct hfi1_devdata *dd)
-{
-       return 0;
-}
-
-/**
- * init_after_reset - re-initialize after a reset
- * @dd: the hfi1_ib device
- *
- * Make sure the chip performs no receives or sends while the driver
- * re-initializes (done explicitly, in case the reset failed).
- *
- * Return: always 0.
- */
-static int init_after_reset(struct hfi1_devdata *dd)
-{
-       u32 rcv_off = HFI1_RCVCTRL_CTXT_DIS |
-                     HFI1_RCVCTRL_INTRAVAIL_DIS |
-                     HFI1_RCVCTRL_TAILUPD_DIS;
-       int ctxt;
-
-       /*
-        * Stop receives, receive interrupts and tail updates on every
-        * receive context, then stop PIO sends globally and per send
-        * context.  This mostly protects the driver data structures
-        * rather than chip registers.
-        */
-       for (ctxt = 0; ctxt < dd->num_rcv_contexts; ctxt++)
-               hfi1_rcvctrl(dd, rcv_off, ctxt);
-
-       pio_send_control(dd, PSC_GLOBAL_DISABLE);
-
-       for (ctxt = 0; ctxt < dd->num_send_contexts; ctxt++)
-               sc_disable(dd->send_contexts[ctxt].sc);
-
-       return 0;
-}
-
-/*
- * Enable PIO sends globally, then enable receive, the receive-available
- * interrupt and the send context for every kernel receive context
- * (indices below dd->first_user_ctxt).  The remaining contexts are
- * enabled later, as users open and initialize them.
- */
-static void enable_chip(struct hfi1_devdata *dd)
-{
-       struct hfi1_ctxtdata *rcd;
-       u32 rcvmask;
-       u32 ctxt;
-
-       /* enable PIO send */
-       pio_send_control(dd, PSC_GLOBAL_ENABLE);
-
-       for (ctxt = 0; ctxt < dd->first_user_ctxt; ++ctxt) {
-               rcd = dd->rcd[ctxt];
-
-               rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB;
-
-               /* the remaining bits follow the context's capability flags */
-               if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL))
-                       rcvmask |= HFI1_RCVCTRL_TAILUPD_ENB;
-               else
-                       rcvmask |= HFI1_RCVCTRL_TAILUPD_DIS;
-               if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR))
-                       rcvmask |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
-               if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_RHQ_FULL))
-                       rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
-               if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_EGR_FULL))
-                       rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
-
-               hfi1_rcvctrl(dd, rcvmask, ctxt);
-               sc_enable(rcd->sc);
-       }
-}
-
-/**
- * create_workqueues - create per port workqueues
- * @dd: the hfi1_ib device
- *
- * Ports that already own a workqueue are left alone.  If any allocation
- * fails, every per-port workqueue of the device is destroyed again.
- *
- * Return: 0 on success, -ENOMEM if a workqueue could not be allocated.
- */
-static int create_workqueues(struct hfi1_devdata *dd)
-{
-       struct hfi1_pportdata *ppd;
-       int pidx;
-
-       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-               ppd = &dd->pport[pidx];
-               if (ppd->hfi1_wq)
-                       continue;
-
-               ppd->hfi1_wq = alloc_workqueue("hfi%d_%d",
-                                              WQ_SYSFS | WQ_HIGHPRI |
-                                              WQ_CPU_INTENSIVE,
-                                              dd->num_sdma,
-                                              dd->unit, pidx);
-               if (!ppd->hfi1_wq)
-                       goto wq_error;
-       }
-       return 0;
-
-wq_error:
-       pr_err("alloc_workqueue failed for port %d\n", pidx + 1);
-       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-               ppd = &dd->pport[pidx];
-               if (!ppd->hfi1_wq)
-                       continue;
-
-               destroy_workqueue(ppd->hfi1_wq);
-               ppd->hfi1_wq = NULL;
-       }
-       return -ENOMEM;
-}
-
-/**
- * hfi1_init - do the actual initialization sequence on the chip
- * @dd: the hfi1_ib device
- * @reinit: re-initializing, so don't allocate new memory
- *
- * Do the actual initialization sequence on the chip.  This is done
- * both from the init routine called from the PCI infrastructure, and
- * when we reset the chip, or detect that it was reset internally,
- * or it's administratively re-enabled.
- *
- * Memory allocation here and in called routines is only done in
- * the first case (reinit == 0).  We have to be careful, because even
- * without memory allocation, we need to re-write all the chip registers
- * TIDs, etc. after the reset or enable has completed.
- *
- * Return: 0 on success, otherwise the error from the reset/load step,
- * -ENOMEM if the dummy tail memory cannot be allocated, or the result
- * of the last kernel context setup attempt when that one failed.
- */
-int hfi1_init(struct hfi1_devdata *dd, int reinit)
-{
-       int ret = 0, pidx, lastfail = 0;
-       unsigned i, len;
-       struct hfi1_ctxtdata *rcd;
-       struct hfi1_pportdata *ppd;
-
-       /* Set up recv low level handlers */
-       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EXPECTED] =
-                                               kdeth_process_expected;
-       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EAGER] =
-                                               kdeth_process_eager;
-       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_IB] = process_receive_ib;
-       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_ERROR] =
-                                               process_receive_error;
-       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_BYPASS] =
-                                               process_receive_bypass;
-       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID5] =
-                                               process_receive_invalid;
-       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID6] =
-                                               process_receive_invalid;
-       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID7] =
-                                               process_receive_invalid;
-       dd->rhf_rcv_function_map = dd->normal_rhf_rcv_functions;
-
-       /* Set up send low level handlers */
-       dd->process_pio_send = hfi1_verbs_send_pio;
-       dd->process_dma_send = hfi1_verbs_send_dma;
-       dd->pio_inline_send = pio_copy;
-
-       /* devices for which is_ax() is true start with packet drop on */
-       if (is_ax(dd)) {
-               atomic_set(&dd->drop_packet, DROP_PACKET_ON);
-               dd->do_drop = 1;
-       } else {
-               atomic_set(&dd->drop_packet, DROP_PACKET_OFF);
-               dd->do_drop = 0;
-       }
-
-       /* make sure the link is not "up" */
-       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-               ppd = dd->pport + pidx;
-               ppd->linkup = 0;
-       }
-
-       if (reinit)
-               ret = init_after_reset(dd);
-       else
-               ret = loadtime_init(dd);
-       if (ret)
-               goto done;
-
-       /* allocate dummy tail memory for all receive contexts */
-       dd->rcvhdrtail_dummy_kvaddr = dma_zalloc_coherent(
-               &dd->pcidev->dev, sizeof(u64),
-               &dd->rcvhdrtail_dummy_physaddr,
-               GFP_KERNEL);
-
-       if (!dd->rcvhdrtail_dummy_kvaddr) {
-               dd_dev_err(dd, "cannot allocate dummy tail memory\n");
-               ret = -ENOMEM;
-               goto done;
-       }
-
-       /* dd->rcd can be NULL if early initialization failed */
-       for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) {
-               /*
-                * Set up the (kernel) rcvhdr queue and egr TIDs.  If doing
-                * re-init, the simplest way to handle this is to free
-                * existing, and re-allocate.
-                * Need to re-create rest of ctxt 0 ctxtdata as well.
-                */
-               rcd = dd->rcd[i];
-               if (!rcd)
-                       continue;
-
-               rcd->do_interrupt = &handle_receive_interrupt;
-
-               /*
-                * NOTE(review): lastfail is overwritten on every iteration,
-                * so a failure on an earlier context is only logged; ret
-                * below reflects the last context that was set up.
-                */
-               lastfail = hfi1_create_rcvhdrq(dd, rcd);
-               if (!lastfail)
-                       lastfail = hfi1_setup_eagerbufs(rcd);
-               if (lastfail)
-                       dd_dev_err(dd,
-                                  "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
-       }
-       if (lastfail)
-               ret = lastfail;
-
-       /* Allocate enough memory for user event notification. */
-       len = PAGE_ALIGN(dd->chip_rcv_contexts * HFI1_MAX_SHARED_CTXTS *
-                        sizeof(*dd->events));
-       dd->events = vmalloc_user(len);
-       /* a failed allocation is logged only; ret is left unchanged */
-       if (!dd->events)
-               dd_dev_err(dd, "Failed to allocate user events page\n");
-       /*
-        * Allocate a page for device and port status.
-        * Page will be shared amongst all user processes.
-        */
-       dd->status = vmalloc_user(PAGE_SIZE);
-       /* likewise logged only; later code checks dd->status before use */
-       if (!dd->status)
-               dd_dev_err(dd, "Failed to allocate dev status page\n");
-       else
-               dd->freezelen = PAGE_SIZE - (sizeof(*dd->status) -
-                                            sizeof(dd->status->freezemsg));
-       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-               ppd = dd->pport + pidx;
-               if (dd->status)
-                       /* Currently, we only have one port */
-                       ppd->statusp = &dd->status->port;
-
-               set_mtu(ppd);
-       }
-
-       /* enable chip even if we have an error, so we can debug cause */
-       enable_chip(dd);
-
-done:
-       /*
-        * Set status even if port serdes is not initialized
-        * so that diags will work.
-        */
-       if (dd->status)
-               dd->status->dev |= HFI1_STATUS_CHIP_PRESENT |
-                       HFI1_STATUS_INITTED;
-       if (!ret) {
-               /* enable all interrupts from the chip */
-               set_intr_state(dd, 1);
-
-               /* chip is OK for user apps; mark it as initialized */
-               for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-                       ppd = dd->pport + pidx;
-
-                       /*
-                        * start the serdes - must be after interrupts are
-                        * enabled so we are notified when the link goes up
-                        */
-                       lastfail = bringup_serdes(ppd);
-                       /* a serdes failure is reported but does not fail init */
-                       if (lastfail)
-                               dd_dev_info(dd,
-                                           "Failed to bring up port %u\n",
-                                           ppd->port);
-
-                       /*
-                        * Set status even if port serdes is not initialized
-                        * so that diags will work.
-                        */
-                       if (ppd->statusp)
-                               *ppd->statusp |= HFI1_STATUS_CHIP_PRESENT |
-                                                       HFI1_STATUS_INITTED;
-                       /*
-                        * NOTE(review): this is the last statement of the
-                        * loop body, so the continue has no effect.
-                        */
-                       if (!ppd->link_speed_enabled)
-                               continue;
-               }
-       }
-
-       /* if ret is non-zero, we probably should do some cleanup here... */
-       return ret;
-}
-
-/*
- * Look up a unit number in hfi1_unit_table.  No locking is done here;
- * hfi1_lookup() wraps this call in hfi1_devs_lock.
- */
-static inline struct hfi1_devdata *__hfi1_lookup(int unit)
-{
-       return idr_find(&hfi1_unit_table, unit);
-}
-
-/*
- * Find the device data registered for @unit, performing the table
- * lookup under hfi1_devs_lock with interrupts disabled.
- *
- * Returns whatever the unit table holds for @unit.
- */
-struct hfi1_devdata *hfi1_lookup(int unit)
-{
-       struct hfi1_devdata *found;
-       unsigned long irqflags;
-
-       spin_lock_irqsave(&hfi1_devs_lock, irqflags);
-       found = __hfi1_lookup(unit);
-       spin_unlock_irqrestore(&hfi1_devs_lock, irqflags);
-
-       return found;
-}
-
-/*
- * Stop the per-port LED override timers.  Used during unit shutdown, or
- * after an error late in initialization.
- */
-static void stop_timers(struct hfi1_devdata *dd)
-{
-       struct hfi1_pportdata *ppd;
-       int pidx;
-
-       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-               ppd = &dd->pport[pidx];
-
-               /* only timers whose data field is set are stopped */
-               if (!ppd->led_override_timer.data)
-                       continue;
-
-               del_timer_sync(&ppd->led_override_timer);
-               atomic_set(&ppd->led_override_timer_active, 0);
-       }
-}
-
-/**
- * shutdown_device - shut down a device
- * @dd: the hfi1_ib device
- *
- * This is called to make the device quiet when we are about to
- * unload the driver, and also when the device is administratively
- * disabled.   It does not free any data structures.
- * Everything it does has to be setup again by hfi1_init(dd, 1)
- *
- * Order: mark links down, mask interrupts, disable receive contexts and
- * flush send contexts, wait briefly, then disable sends, quiet the
- * serdes, destroy the per-port workqueues and shut down SDMA.
- */
-static void shutdown_device(struct hfi1_devdata *dd)
-{
-       struct hfi1_pportdata *ppd;
-       unsigned pidx;
-       int i;
-
-       /* mark every port's link down and clear its IB status bits */
-       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-               ppd = dd->pport + pidx;
-
-               ppd->linkup = 0;
-               if (ppd->statusp)
-                       *ppd->statusp &= ~(HFI1_STATUS_IB_CONF |
-                                          HFI1_STATUS_IB_READY);
-       }
-       dd->flags &= ~HFI1_INITTED;
-
-       /* mask interrupts, but not errors */
-       set_intr_state(dd, 0);
-
-       /*
-        * NOTE(review): ppd is assigned but not used in this loop; the
-        * body is device-wide and is repeated once per port.
-        */
-       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-               ppd = dd->pport + pidx;
-               for (i = 0; i < dd->num_rcv_contexts; i++)
-                       hfi1_rcvctrl(dd, HFI1_RCVCTRL_TAILUPD_DIS |
-                                         HFI1_RCVCTRL_CTXT_DIS |
-                                         HFI1_RCVCTRL_INTRAVAIL_DIS |
-                                         HFI1_RCVCTRL_PKEY_DIS |
-                                         HFI1_RCVCTRL_ONE_PKT_EGR_DIS, i);
-               /*
-                * Gracefully stop all sends allowing any in progress to
-                * trickle out first.
-                */
-               for (i = 0; i < dd->num_send_contexts; i++)
-                       sc_flush(dd->send_contexts[i].sc);
-       }
-
-       /*
-        * Enough for anything that's going to trickle out to have actually
-        * done so.
-        */
-       udelay(20);
-
-       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-               ppd = dd->pport + pidx;
-
-               /* disable all contexts */
-               for (i = 0; i < dd->num_send_contexts; i++)
-                       sc_disable(dd->send_contexts[i].sc);
-               /* disable the send device */
-               pio_send_control(dd, PSC_GLOBAL_DISABLE);
-
-               shutdown_led_override(ppd);
-
-               /*
-                * Clear SerdesEnable.
-                * We can't count on interrupts since we are stopping.
-                */
-               hfi1_quiet_serdes(ppd);
-
-               /* drop the port workqueue; create_workqueues() recreates it */
-               if (ppd->hfi1_wq) {
-                       destroy_workqueue(ppd->hfi1_wq);
-                       ppd->hfi1_wq = NULL;
-               }
-       }
-       sdma_exit(dd);
-}
-
-/**
- * hfi1_free_ctxtdata - free a context's allocated data
- * @dd: the hfi1_ib device
- * @rcd: the ctxtdata structure, may be NULL
- *
- * Release everything allocated for a context, including @rcd itself.
- * This runs after hfi1_mutex has been dropped (and possibly from
- * reinit), so it must not touch anything that a simultaneous
- * re-allocation of context data relies on, and it must never change
- * chip state or global driver state.
- */
-void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
-{
-       unsigned idx;
-
-       if (!rcd)
-               return;
-
-       /* the tail memory is only released together with the header queue */
-       if (rcd->rcvhdrq) {
-               dma_free_coherent(&dd->pcidev->dev, rcd->rcvhdrq_size,
-                                 rcd->rcvhdrq, rcd->rcvhdrq_phys);
-               rcd->rcvhdrq = NULL;
-               if (rcd->rcvhdrtail_kvaddr) {
-                       dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
-                                         (void *)rcd->rcvhdrtail_kvaddr,
-                                         rcd->rcvhdrqtailaddr_phys);
-                       rcd->rcvhdrtail_kvaddr = NULL;
-               }
-       }
-
-       /* all the RcvArray entries should have been cleared by now */
-       kfree(rcd->egrbufs.rcvtids);
-
-       /* free each eager buffer that was actually mapped */
-       for (idx = 0; idx < rcd->egrbufs.alloced; idx++) {
-               if (!rcd->egrbufs.buffers[idx].phys)
-                       continue;
-
-               dma_free_coherent(&dd->pcidev->dev,
-                                 rcd->egrbufs.buffers[idx].len,
-                                 rcd->egrbufs.buffers[idx].addr,
-                                 rcd->egrbufs.buffers[idx].phys);
-       }
-       kfree(rcd->egrbufs.buffers);
-
-       sc_free(rcd->sc);
-
-       vfree(rcd->user_event_mask);
-       vfree(rcd->subctxt_uregbase);
-       vfree(rcd->subctxt_rcvegrbuf);
-       vfree(rcd->subctxt_rcvhdr_base);
-
-       kfree(rcd->opstats);
-       kfree(rcd);
-}
-
-/*
- * Drop this device's reference to the shared asic data and free the
- * structure if the peer slot is empty as well.  The caller must hold
- * hfi1_devs_lock.
- */
-static void release_asic_data(struct hfi1_devdata *dd)
-{
-       int peer;
-
-       if (dd->asic_data) {
-               /* the two slots are indexed by hfi1_id; pick the other one */
-               peer = !dd->hfi1_id;
-
-               dd->asic_data->dds[dd->hfi1_id] = NULL;
-               if (!dd->asic_data->dds[peer])
-                       kfree(dd->asic_data); /* last holder */
-               dd->asic_data = NULL;
-       }
-}
-
-/*
- * Unregister a device from the unit table and device list, release its
- * shared asic data, then free its per-cpu counters and the devdata
- * itself through the verbs deallocator.
- */
-void hfi1_free_devdata(struct hfi1_devdata *dd)
-{
-       unsigned long flags;
-
-       /* unit table, device list and asic data are guarded by hfi1_devs_lock */
-       spin_lock_irqsave(&hfi1_devs_lock, flags);
-       idr_remove(&hfi1_unit_table, dd->unit);
-       list_del(&dd->list);
-       release_asic_data(dd);
-       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
-       free_platform_config(dd);
-       rcu_barrier(); /* wait for rcu callbacks to complete */
-       free_percpu(dd->int_counter);
-       free_percpu(dd->rcv_limit);
-       hfi1_dev_affinity_free(dd);
-       free_percpu(dd->send_schedule);
-       /* dd lives inside the verbs device allocation, so this frees dd */
-       rvt_dealloc_device(&dd->verbs_dev.rdi);
-}
-
-/*
- * Allocate our primary per-unit data structure.  Must be done via verbs
- * allocator, because the verbs cleanup process both does cleanup and
- * free of the data structure.
- *
- * @pdev: PCI device, used only for early error messages
- * @extra: bytes of chip-specific data to allocate after the devdata;
- *         this is the per-port data, so the port count is derived
- *         from it
- *
- * Use the idr mechanism to get a unit number for this unit.
- *
- * Returns the new devdata, or an ERR_PTR() on failure.  On failure the
- * unit number, the device list entry and any per-cpu counters that were
- * already allocated are released again before the devdata is freed.
- */
-struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra)
-{
-       unsigned long flags;
-       struct hfi1_devdata *dd;
-       int ret, nports;
-
-       /* extra is * number of ports */
-       nports = extra / sizeof(struct hfi1_pportdata);
-
-       dd = (struct hfi1_devdata *)rvt_alloc_device(sizeof(*dd) + extra,
-                                                    nports);
-       if (!dd)
-               return ERR_PTR(-ENOMEM);
-       dd->num_pports = nports;
-       /* the port structures live directly behind the devdata */
-       dd->pport = (struct hfi1_pportdata *)(dd + 1);
-
-       INIT_LIST_HEAD(&dd->list);
-       idr_preload(GFP_KERNEL);
-       spin_lock_irqsave(&hfi1_devs_lock, flags);
-
-       ret = idr_alloc(&hfi1_unit_table, dd, 0, 0, GFP_NOWAIT);
-       if (ret >= 0) {
-               dd->unit = ret;
-               list_add(&dd->list, &hfi1_dev_list);
-       }
-
-       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
-       idr_preload_end();
-
-       if (ret < 0) {
-               hfi1_early_err(&pdev->dev,
-                              "Could not allocate unit ID: error %d\n", -ret);
-               goto bail;
-       }
-       /*
-        * Initialize all locks for the device. This needs to be as early as
-        * possible so locks are usable.
-        */
-       spin_lock_init(&dd->sc_lock);
-       spin_lock_init(&dd->sendctrl_lock);
-       spin_lock_init(&dd->rcvctrl_lock);
-       spin_lock_init(&dd->uctxt_lock);
-       spin_lock_init(&dd->hfi1_diag_trans_lock);
-       spin_lock_init(&dd->sc_init_lock);
-       spin_lock_init(&dd->dc8051_lock);
-       spin_lock_init(&dd->dc8051_memlock);
-       seqlock_init(&dd->sc2vl_lock);
-       spin_lock_init(&dd->sde_map_lock);
-       spin_lock_init(&dd->pio_map_lock);
-       init_waitqueue_head(&dd->event_queue);
-
-       dd->int_counter = alloc_percpu(u64);
-       if (!dd->int_counter) {
-               ret = -ENOMEM;
-               hfi1_early_err(&pdev->dev,
-                              "Could not allocate per-cpu int_counter\n");
-               goto bail_unit;
-       }
-
-       dd->rcv_limit = alloc_percpu(u64);
-       if (!dd->rcv_limit) {
-               ret = -ENOMEM;
-               hfi1_early_err(&pdev->dev,
-                              "Could not allocate per-cpu rcv_limit\n");
-               goto bail_int_counter;
-       }
-
-       dd->send_schedule = alloc_percpu(u64);
-       if (!dd->send_schedule) {
-               ret = -ENOMEM;
-               hfi1_early_err(&pdev->dev,
-                              "Could not allocate per-cpu send_schedule\n");
-               goto bail_rcv_limit;
-       }
-
-       /* the cpu list is global and set up once, by the first device */
-       if (!hfi1_cpulist_count) {
-               u32 count = num_online_cpus();
-
-               hfi1_cpulist = kcalloc(BITS_TO_LONGS(count), sizeof(long),
-                                      GFP_KERNEL);
-               if (hfi1_cpulist)
-                       hfi1_cpulist_count = count;
-               else
-                       hfi1_early_err(
-                       &pdev->dev,
-                       "Could not alloc cpulist info, cpu affinity might be wrong\n");
-       }
-       return dd;
-
-bail_rcv_limit:
-       free_percpu(dd->rcv_limit);
-bail_int_counter:
-       free_percpu(dd->int_counter);
-bail_unit:
-       /*
-        * The unit was published in the table and on the device list;
-        * take it back out under the lock so no lookup can return the
-        * devdata that is about to be freed.
-        */
-       spin_lock_irqsave(&hfi1_devs_lock, flags);
-       idr_remove(&hfi1_unit_table, dd->unit);
-       list_del_init(&dd->list);
-       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
-bail:
-       rvt_dealloc_device(&dd->verbs_dev.rdi);
-       return ERR_PTR(ret);
-}
-
-/*
- * Take a device out of service after an error.  Called from the freeze
- * mode handlers and from the PCI error reporting code, so it makes no
- * assumptions about the state of the system or the data structures.
- */
-void hfi1_disable_after_error(struct hfi1_devdata *dd)
-{
-       struct hfi1_pportdata *ppd;
-       u32 pidx;
-
-       if (!(dd->flags & HFI1_INITTED))
-               goto mark_error;
-
-       dd->flags &= ~HFI1_INITTED;
-       if (!dd->pport)
-               goto mark_error;
-
-       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-               ppd = &dd->pport[pidx];
-
-               /* only request the link state change if the chip is present */
-               if (dd->flags & HFI1_PRESENT)
-                       set_link_state(ppd, HLS_DN_DISABLE);
-
-               if (ppd->statusp)
-                       *ppd->statusp &= ~HFI1_STATUS_IB_READY;
-       }
-
-mark_error:
-       /*
-        * Record the error for the driver, and also for /sys and the
-        * status word mapped to user programs.  The unit is unusable
-        * until it is reset.
-        */
-       if (dd->status)
-               dd->status->dev |= HFI1_STATUS_HWERROR;
-}
-
-static void remove_one(struct pci_dev *);
-static int init_one(struct pci_dev *, const struct pci_device_id *);
-
-#define DRIVER_LOAD_MSG "Intel " DRIVER_NAME " loaded: "
-#define PFX DRIVER_NAME ": "
-
-/* PCI IDs claimed by this driver; the all-zero entry ends the table. */
-static const struct pci_device_id hfi1_pci_tbl[] = {
-       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL0) },
-       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL1) },
-       { 0, }
-};
-
-MODULE_DEVICE_TABLE(pci, hfi1_pci_tbl);
-
-/*
- * PCI driver description: init_one()/remove_one() are the probe and
- * remove callbacks, hfi1_pci_tbl lists the devices handled.
- */
-static struct pci_driver hfi1_pci_driver = {
-       .name = DRIVER_NAME,
-       .probe = init_one,
-       .remove = remove_one,
-       .id_table = hfi1_pci_tbl,
-       .err_handler = &hfi1_pci_err_handler,
-};
-
-/* Add the first krcvqsset entries of krcvqs[] to n_krcvqs. */
-static void __init compute_krcvqs(void)
-{
-       int idx = 0;
-
-       while (idx < krcvqsset) {
-               n_krcvqs += krcvqs[idx];
-               idx++;
-       }
-}
-
-/*
- * Do all the generic driver unit- and chip-independent memory
- * allocation and initialization.
- *
- * Sanitizes the module parameters, sets up the unit table, debug and
- * wss state, then registers the PCI driver.  Returns 0 on success or
- * the error from the step that failed, after undoing earlier steps.
- */
-static int __init hfi1_mod_init(void)
-{
-       int ret;
-
-       ret = dev_init();
-       if (ret)
-               goto bail;
-
-       /* validate max MTU before any devices start */
-       if (!valid_opa_max_mtu(hfi1_max_mtu)) {
-               pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n",
-                      hfi1_max_mtu, HFI1_DEFAULT_MAX_MTU);
-               hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
-       }
-       /* valid CUs run from 1-128 in powers of 2 */
-       if (hfi1_cu > 128 || !is_power_of_2(hfi1_cu))
-               hfi1_cu = 1;
-       /* valid credit return threshold is 0-100, variable is unsigned */
-       if (user_credit_return_threshold > 100)
-               user_credit_return_threshold = 100;
-
-       compute_krcvqs();
-       /*
-        * sanitize receive interrupt count, time must wait until after
-        * the hardware type is known
-        */
-       if (rcv_intr_count > RCV_HDR_HEAD_COUNTER_MASK)
-               rcv_intr_count = RCV_HDR_HEAD_COUNTER_MASK;
-       /* reject invalid combinations */
-       if (rcv_intr_count == 0 && rcv_intr_timeout == 0) {
-               pr_err("Invalid mode: both receive interrupt count and available timeout are zero - setting interrupt count to 1\n");
-               rcv_intr_count = 1;
-       }
-       if (rcv_intr_count > 1 && rcv_intr_timeout == 0) {
-               /*
-                * Avoid indefinite packet delivery by requiring a timeout
-                * if count is > 1.
-                */
-               pr_err("Invalid mode: receive interrupt count greater than 1 and available timeout is zero - setting available timeout to 1\n");
-               rcv_intr_timeout = 1;
-       }
-       /* checked last, against the already corrected count and timeout */
-       if (rcv_intr_dynamic && !(rcv_intr_count > 1 && rcv_intr_timeout > 0)) {
-               /*
-                * The dynamic algorithm expects a non-zero timeout
-                * and a count > 1.
-                */
-               pr_err("Invalid mode: dynamic receive interrupt mitigation with invalid count and timeout - turning dynamic off\n");
-               rcv_intr_dynamic = 0;
-       }
-
-       /* sanitize link CRC options */
-       link_crc_mask &= SUPPORTED_CRCS;
-
-       /*
-        * These must be called before the driver is registered with
-        * the PCI subsystem.
-        */
-       idr_init(&hfi1_unit_table);
-
-       hfi1_dbg_init();
-       ret = hfi1_wss_init();
-       if (ret < 0)
-               goto bail_wss;
-       ret = pci_register_driver(&hfi1_pci_driver);
-       if (ret < 0) {
-               pr_err("Unable to register driver: error %d\n", -ret);
-               goto bail_dev;
-       }
-       /* success also leaves through the bail label, skipping the unwind */
-       goto bail; /* all OK */
-
-bail_dev:
-       hfi1_wss_exit();
-bail_wss:
-       hfi1_dbg_exit();
-       idr_destroy(&hfi1_unit_table);
-       dev_cleanup();
-bail:
-       return ret;
-}
-
-module_init(hfi1_mod_init);
-
-/*
- * Do the non-unit driver cleanup, memory free, etc. at unload.
- *
- * The PCI driver is unregistered first; the global wss, debug, cpu
- * list, unit table, firmware and device state is released afterwards.
- */
-static void __exit hfi1_mod_cleanup(void)
-{
-       pci_unregister_driver(&hfi1_pci_driver);
-       hfi1_wss_exit();
-       hfi1_dbg_exit();
-       /* reset the count before freeing the cpu list it describes */
-       hfi1_cpulist_count = 0;
-       kfree(hfi1_cpulist);
-
-       idr_destroy(&hfi1_unit_table);
-       dispose_firmware();     /* asymmetric with obtain_firmware() */
-       dev_cleanup();
-}
-
-module_exit(hfi1_mod_cleanup);
-
-/*
- * cleanup_device_data - release the software state hanging off a devdata
- * @dd: the device data being torn down
- *
- * This can only be called after a successful initialization.
- *
- * For every port: clear HFI1_STATUS_CHIP_PRESENT in the status word (when
- * one is attached), cancel the per-SL CCA hrtimers, and detach the
- * congestion control state under cc_state_lock, handing the old state to
- * call_rcu() for reclaim.  Afterwards: free_credit_return(), the dummy
- * receive header tail, every receive context, the PIO map, the send
- * contexts and the remaining dd-owned arrays.
- */
-static void cleanup_device_data(struct hfi1_devdata *dd)
-{
-       int ctxt;
-       int pidx;
-       struct hfi1_ctxtdata **tmp;
-       unsigned long flags;
-
-       /* users can't do anything more with chip */
-       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-               struct hfi1_pportdata *ppd = &dd->pport[pidx];
-               struct cc_state *cc_state;
-               int i;
-
-               if (ppd->statusp)
-                       *ppd->statusp &= ~HFI1_STATUS_CHIP_PRESENT;
-
-               for (i = 0; i < OPA_MAX_SLS; i++)
-                       hrtimer_cancel(&ppd->cca_timer[i].hrtimer);
-
-               /* unpublish the cc_state under the lock, reclaim it via RCU */
-               spin_lock(&ppd->cc_state_lock);
-               cc_state = get_cc_state(ppd);
-               rcu_assign_pointer(ppd->cc_state, NULL);
-               spin_unlock(&ppd->cc_state_lock);
-
-               if (cc_state)
-                       call_rcu(&cc_state->rcu, cc_state_reclaim);
-       }
-
-       free_credit_return(dd);
-
-       /*
-        * Free any resources still in use (usually just kernel contexts)
-        * at unload; we do for ctxtcnt, because that's what we allocate.
-        * We acquire lock to be really paranoid that rcd isn't being
-        * accessed from some interrupt-related code (that should not happen,
-        * but best to be sure).
-        */
-       spin_lock_irqsave(&dd->uctxt_lock, flags);
-       tmp = dd->rcd;
-       dd->rcd = NULL;
-       spin_unlock_irqrestore(&dd->uctxt_lock, flags);
-
-       if (dd->rcvhdrtail_dummy_kvaddr) {
-               dma_free_coherent(&dd->pcidev->dev, sizeof(u64),
-                                 (void *)dd->rcvhdrtail_dummy_kvaddr,
-                                 dd->rcvhdrtail_dummy_physaddr);
-               /* a separate statement, not a dma_free_coherent() argument */
-               dd->rcvhdrtail_dummy_kvaddr = NULL;
-       }
-
-       /* tmp is the detached context array; it may be NULL */
-       for (ctxt = 0; tmp && ctxt < dd->num_rcv_contexts; ctxt++) {
-               struct hfi1_ctxtdata *rcd = tmp[ctxt];
-
-               tmp[ctxt] = NULL; /* debugging paranoia */
-               if (rcd) {
-                       hfi1_clear_tids(rcd);
-                       hfi1_free_ctxtdata(dd, rcd);
-               }
-       }
-       kfree(tmp);
-       free_pio_map(dd);
-       /* must follow rcv context free - need to remove rcv's hooks */
-       for (ctxt = 0; ctxt < dd->num_send_contexts; ctxt++)
-               sc_free(dd->send_contexts[ctxt].sc);
-       dd->num_send_contexts = 0;
-       kfree(dd->send_contexts);
-       dd->send_contexts = NULL;
-       kfree(dd->hw_to_sw);
-       dd->hw_to_sw = NULL;
-       kfree(dd->boardname);
-       vfree(dd->events);
-       vfree(dd->status);
-}
-
-/*
- * Clean up on unit shutdown, or error during unit load after
- * successful initialization.
- *
- * Order: hfi1_start_cleanup(), then the PCIe teardown
- * (hfi1_pcie_ddcleanup() followed by hfi1_pcie_cleanup()), then
- * cleanup_device_data(), and last hfi1_free_devdata().
- *
- * NOTE(review): hfi1_free_devdata() presumably releases dd itself, so
- * callers should not touch dd once this returns -- confirm against its
- * definition.
- */
-static void postinit_cleanup(struct hfi1_devdata *dd)
-{
-       hfi1_start_cleanup(dd);
-
-       hfi1_pcie_ddcleanup(dd);
-       hfi1_pcie_cleanup(dd->pcidev);
-
-       cleanup_device_data(dd);
-
-       hfi1_free_devdata(dd);
-}
-
-static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
-{      /*
-        * Bring up one device: validate the global module parameters,
-        * initialize PCIe, build the devdata through hfi1_init_dd(), create
-        * the workqueues, run hfi1_init(), register with the IB core and
-        * create the /dev nodes.  Returns 0 on success.  On failure returns
-        * a non-zero error; for late failures the hfi1_init() result takes
-        * precedence over the IB registration result.
-        */
-       int ret = 0, j, pidx, initfail;
-       struct hfi1_devdata *dd = NULL;
-       struct hfi1_pportdata *ppd;
-
-       /* First, lock the non-writable module parameters */
-       HFI1_CAP_LOCK();
-
-       /* Validate some global module parameters */
-       if (rcvhdrcnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) {
-               hfi1_early_err(&pdev->dev, "Header queue  count too small\n");
-               ret = -EINVAL;
-               goto bail;
-       }
-       if (rcvhdrcnt > HFI1_MAX_HDRQ_EGRBUF_CNT) {
-               hfi1_early_err(&pdev->dev,
-                              "Receive header queue count cannot be greater than %u\n",
-                              HFI1_MAX_HDRQ_EGRBUF_CNT);
-               ret = -EINVAL;
-               goto bail;
-       }
-       /* use the encoding function as a sanitization check */
-       if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) {
-               hfi1_early_err(&pdev->dev, "Invalid HdrQ Entry size %u\n",
-                              hfi1_hdrq_entsize);
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       /* The receive eager buffer size must be set before the receive
-        * contexts are created.
-        *
-        * Set the eager buffer size.  Validate that it falls in a range
-        * allowed by the hardware - all powers of 2 between the min and
-        * max.  The maximum valid MTU is within the eager buffer range
-        * so we do not need to cap the max_mtu by an eager buffer size
-        * setting.
-        *
-        * A non-power-of-2 value is rounded up first and the result is then
-        * clamped; only a value of 0 is rejected outright.
-        */
-       if (eager_buffer_size) {
-               if (!is_power_of_2(eager_buffer_size))
-                       eager_buffer_size =
-                               roundup_pow_of_two(eager_buffer_size);
-               eager_buffer_size =
-                       clamp_val(eager_buffer_size,
-                                 MIN_EAGER_BUFFER * 8,
-                                 MAX_EAGER_BUFFER_TOTAL);
-               hfi1_early_info(&pdev->dev, "Eager buffer size %u\n",
-                               eager_buffer_size);
-       } else {
-               hfi1_early_err(&pdev->dev, "Invalid Eager buffer size of 0\n");
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       /* restrict value of hfi1_rcvarr_split */
-       hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100);
-
-       ret = hfi1_pcie_init(pdev, ent);
-       if (ret)
-               goto bail;
-
-       /*
-        * Do device-specific initialization, function table setup, dd
-        * allocation, etc.
-        */
-       switch (ent->device) {
-       case PCI_DEVICE_ID_INTEL0:
-       case PCI_DEVICE_ID_INTEL1:
-               dd = hfi1_init_dd(pdev, ent);
-               break;
-       default:
-               hfi1_early_err(&pdev->dev,
-                              "Failing on unknown Intel deviceid 0x%x\n",
-                              ent->device);
-               ret = -ENODEV;  /* dd stays NULL on this path */
-       }
-
-       if (IS_ERR(dd))
-               ret = PTR_ERR(dd);
-       if (ret)
-               goto clean_bail; /* error already printed */
-
-       ret = create_workqueues(dd);
-       if (ret)
-               goto clean_bail; /* NOTE(review): dd from hfi1_init_dd() is not released on this path -- confirm */
-
-       /* do the generic initialization */
-       initfail = hfi1_init(dd, 0);
-
-       ret = hfi1_register_ib_device(dd);      /* attempted even when hfi1_init() failed */
-
-       /*
-        * Now ready for use.  this should be cleared whenever we
-        * detect a reset, or initiate one.  If earlier failure,
-        * we still create devices, so diags, etc. can be used
-        * to determine cause of problem.
-        */
-       if (!initfail && !ret) {
-               dd->flags |= HFI1_INITTED;
-               /* create debugfs files after init and ib register */
-               hfi1_dbg_ibdev_init(&dd->verbs_dev);
-       }
-
-       j = hfi1_device_create(dd);
-       if (j)
-               dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j);
-
-       if (initfail || ret) {
-               stop_timers(dd);
-               flush_workqueue(ib_wq);
-               for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-                       hfi1_quiet_serdes(dd->pport + pidx);
-                       ppd = dd->pport + pidx;
-                       if (ppd->hfi1_wq) {
-                               destroy_workqueue(ppd->hfi1_wq);
-                               ppd->hfi1_wq = NULL;
-                       }
-               }
-               if (!j)         /* undo hfi1_device_create() only if it succeeded */
-                       hfi1_device_remove(dd);
-               if (!ret)       /* undo the IB registration only if it succeeded */
-                       hfi1_unregister_ib_device(dd);
-               postinit_cleanup(dd);
-               if (initfail)   /* the hfi1_init() error wins over the registration error */
-                       ret = initfail;
-               goto bail;      /* everything already cleaned */
-       }
-
-       sdma_start(dd);
-
-       return 0;
-
-clean_bail:
-       hfi1_pcie_cleanup(pdev);        /* undoes hfi1_pcie_init() only */
-bail:
-       return ret;
-}
-
-static void remove_one(struct pci_dev *pdev)
-{      /*
-        * Tear down the device stored as pdev's drvdata: close debugfs,
-        * unregister from the IB core, shut the device down, stop timers,
-        * drain ib_wq, remove the /dev nodes and finally run
-        * postinit_cleanup().
-        */
-       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
-
-       /* close debugfs files before ib unregister */
-       hfi1_dbg_ibdev_exit(&dd->verbs_dev);
-       /* unregister from IB core */
-       hfi1_unregister_ib_device(dd);
-
-       /*
-        * Disable the IB link, disable interrupts on the device,
-        * clear dma engines, etc.
-        */
-       shutdown_device(dd);
-
-       stop_timers(dd);
-
-       /* wait until all of our (qsfp) queue_work() calls complete */
-       flush_workqueue(ib_wq);
-
-       hfi1_device_remove(dd);
-
-       postinit_cleanup(dd);   /* last step; see the note on dd lifetime in postinit_cleanup() */
-}
-
-/**
- * hfi1_create_rcvhdrq - create a receive header queue
- * @dd: the hfi1_ib device
- * @rcd: the context data
- *
- * This must be contiguous memory (from an i/o perspective), and must be
- * DMA'able (which means for some systems, it will go through an IOMMU,
- * or be forced into a low address range).
- *
- * The queue (and, when DMA_RTAIL is set for the context, the tail page)
- * is allocated only if @rcd->rcvhdrq is not already set; the per-context
- * CSRs are programmed on every call.
- *
- * Return: 0 on success, -ENOMEM if either DMA allocation fails.
- */
-int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
-{
-       unsigned amt;
-       u64 reg;
-
-       if (!rcd->rcvhdrq) {
-               dma_addr_t phys_hdrqtail;
-               gfp_t gfp_flags;
-
-               /*
-                * rcvhdrqentsize is in DWs, so we have to convert to bytes
-                * (* sizeof(u32)).
-                */
-               amt = PAGE_ALIGN(rcd->rcvhdrq_cnt * rcd->rcvhdrqentsize *
-                                sizeof(u32));
-
-               /* user contexts allocate with GFP_USER, kernel ones GFP_KERNEL */
-               gfp_flags = (rcd->ctxt >= dd->first_user_ctxt) ?
-                       GFP_USER : GFP_KERNEL;
-               rcd->rcvhdrq = dma_zalloc_coherent(
-                       &dd->pcidev->dev, amt, &rcd->rcvhdrq_phys,
-                       gfp_flags | __GFP_COMP);
-
-               if (!rcd->rcvhdrq) {
-                       /* amt is unsigned, so print it with %u */
-                       dd_dev_err(dd,
-                                  "attempt to allocate %u bytes for ctxt %u rcvhdrq failed\n",
-                                  amt, rcd->ctxt);
-                       goto bail;
-               }
-
-               if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
-                       rcd->rcvhdrtail_kvaddr = dma_zalloc_coherent(
-                               &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail,
-                               gfp_flags);
-                       if (!rcd->rcvhdrtail_kvaddr)
-                               goto bail_free;
-                       rcd->rcvhdrqtailaddr_phys = phys_hdrqtail;
-               }
-
-               rcd->rcvhdrq_size = amt;
-       }
-       /*
-        * These values are per-context:
-        *      RcvHdrCnt
-        *      RcvHdrEntSize
-        *      RcvHdrSize
-        */
-       reg = ((u64)(rcd->rcvhdrq_cnt >> HDRQ_SIZE_SHIFT)
-                       & RCV_HDR_CNT_CNT_MASK)
-               << RCV_HDR_CNT_CNT_SHIFT;
-       write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_CNT, reg);
-       reg = (encode_rcv_header_entry_size(rcd->rcvhdrqentsize)
-                       & RCV_HDR_ENT_SIZE_ENT_SIZE_MASK)
-               << RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT;
-       write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_ENT_SIZE, reg);
-       reg = (dd->rcvhdrsize & RCV_HDR_SIZE_HDR_SIZE_MASK)
-               << RCV_HDR_SIZE_HDR_SIZE_SHIFT;
-       write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_SIZE, reg);
-
-       /*
-        * Program dummy tail address for every receive context
-        * before enabling any receive context
-        */
-       write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_TAIL_ADDR,
-                       dd->rcvhdrtail_dummy_physaddr);
-
-       return 0;
-
-bail_free:
-       /* only reached from inside the allocation branch, so amt is set */
-       dd_dev_err(dd,
-                  "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n",
-                  rcd->ctxt);
-       /*
-        * NOTE(review): user_event_mask is not allocated in this function;
-        * confirm that releasing it here is intended.
-        */
-       vfree(rcd->user_event_mask);
-       rcd->user_event_mask = NULL;
-       dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq,
-                         rcd->rcvhdrq_phys);
-       rcd->rcvhdrq = NULL;
-bail:
-       return -ENOMEM;
-}
-
-/**
- * hfi1_setup_eagerbufs - allocate eager buffers, both kernel and user contexts.
- * @rcd: the context we are setting up.
- *
- * Allocate the eager TID buffers and program them into the chip.
- * They are no longer completely contiguous, we do multiple allocation
- * calls.  Otherwise we get the OOM code involved, by asking for too
- * much per call, with disastrous results on some kernels.
- *
- * When an allocation fails and multi-packet eager buffers are in use, the
- * per-TID size is halved and the buffers already obtained are
- * re-partitioned at the smaller size before the loop continues.
- *
- * Return: 0 on success, -ENOMEM if the buffers cannot be allocated even
- * at the smallest acceptable size (anything already allocated is freed),
- * -EINVAL if hfi1_rcvbuf_validate() rejects the resulting buffer size.
- */
-int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd)
-{
-       struct hfi1_devdata *dd = rcd->dd;
-       u32 max_entries, egrtop, alloced_bytes = 0, idx = 0;
-       gfp_t gfp_flags;
-       u16 order;
-       int ret = 0;
-       u16 round_mtu = roundup_pow_of_two(hfi1_max_mtu);
-
-       /*
-        * GFP_USER, but without GFP_FS, so buffer cache can be
-        * coalesced (we hope); otherwise, even at order 4,
-        * heavy filesystem activity makes these fail, and we can
-        * use compound pages.
-        */
-       gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP;
-
-       /*
-        * The minimum size of the eager buffers is a group of MTU-sized
-        * buffers.
-        * The global eager_buffer_size parameter is checked against the
-        * theoretical lower limit of the value. Here, we check against the
-        * MTU.
-        */
-       if (rcd->egrbufs.size < (round_mtu * dd->rcv_entries.group_size))
-               rcd->egrbufs.size = round_mtu * dd->rcv_entries.group_size;
-       /*
-        * If using one-pkt-per-egr-buffer, lower the eager buffer
-        * size to the max MTU (page-aligned).
-        */
-       if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR))
-               rcd->egrbufs.rcvtid_size = round_mtu;
-
-       /*
-        * Eager buffers sizes of 1MB or less require smaller TID sizes
-        * to satisfy the "multiple of 8 RcvArray entries" requirement.
-        */
-       if (rcd->egrbufs.size <= (1 << 20))
-               rcd->egrbufs.rcvtid_size = max((unsigned long)round_mtu,
-                       rounddown_pow_of_two(rcd->egrbufs.size / 8));
-
-       /* idx counts backing buffers; egrbufs.alloced counts TID entries */
-       while (alloced_bytes < rcd->egrbufs.size &&
-              rcd->egrbufs.alloced < rcd->egrbufs.count) {
-               rcd->egrbufs.buffers[idx].addr =
-                       dma_zalloc_coherent(&dd->pcidev->dev,
-                                           rcd->egrbufs.rcvtid_size,
-                                           &rcd->egrbufs.buffers[idx].phys,
-                                           gfp_flags);
-               if (rcd->egrbufs.buffers[idx].addr) {
-                       rcd->egrbufs.buffers[idx].len =
-                               rcd->egrbufs.rcvtid_size;
-                       rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].addr =
-                               rcd->egrbufs.buffers[idx].addr;
-                       rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].phys =
-                               rcd->egrbufs.buffers[idx].phys;
-                       rcd->egrbufs.alloced++;
-                       alloced_bytes += rcd->egrbufs.rcvtid_size;
-                       idx++;
-               } else {
-                       u32 new_size, i, j;
-                       u64 offset = 0;
-
-                       /*
-                        * Fail the eager buffer allocation if:
-                        *   - we are already using the lowest acceptable size
-                        *   - we are using one-pkt-per-egr-buffer (this implies
-                        *     that we are accepting only one size)
-                        */
-                       if (rcd->egrbufs.rcvtid_size == round_mtu ||
-                           !HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) {
-                               dd_dev_err(dd, "ctxt%u: Failed to allocate eager buffers\n",
-                                          rcd->ctxt);
-                               /* report the failure instead of returning 0 */
-                               ret = -ENOMEM;
-                               goto bail_rcvegrbuf_phys;
-                       }
-
-                       new_size = rcd->egrbufs.rcvtid_size / 2;
-
-                       /*
-                        * If the first attempt to allocate memory failed, don't
-                        * fail everything but continue with the next lower
-                        * size.
-                        */
-                       if (idx == 0) {
-                               rcd->egrbufs.rcvtid_size = new_size;
-                               continue;
-                       }
-
-                       /*
-                        * Re-partition already allocated buffers to a smaller
-                        * size.
-                        */
-                       rcd->egrbufs.alloced = 0;
-                       for (i = 0, j = 0, offset = 0; j < idx; i++) {
-                               if (i >= rcd->egrbufs.count)
-                                       break;
-                               rcd->egrbufs.rcvtids[i].phys =
-                                       rcd->egrbufs.buffers[j].phys + offset;
-                               rcd->egrbufs.rcvtids[i].addr =
-                                       rcd->egrbufs.buffers[j].addr + offset;
-                               rcd->egrbufs.alloced++;
-                               /* this slice ends buffer j: move to the next one */
-                               if ((rcd->egrbufs.buffers[j].phys + offset +
-                                    new_size) ==
-                                   (rcd->egrbufs.buffers[j].phys +
-                                    rcd->egrbufs.buffers[j].len)) {
-                                       j++;
-                                       offset = 0;
-                               } else {
-                                       offset += new_size;
-                               }
-                       }
-                       rcd->egrbufs.rcvtid_size = new_size;
-               }
-       }
-       rcd->egrbufs.numbufs = idx;
-       rcd->egrbufs.size = alloced_bytes;
-
-       /* both sizes are byte counts; scale them to match the KB labels */
-       hfi1_cdbg(PROC,
-                 "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %zuKB\n",
-                 rcd->ctxt, rcd->egrbufs.alloced,
-                 rcd->egrbufs.rcvtid_size / 1024, rcd->egrbufs.size / 1024);
-
-       /*
-        * Set the contexts rcv array head update threshold to the closest
-        * power of 2 (so we can use a mask instead of modulo) below half
-        * the allocated entries.
-        */
-       rcd->egrbufs.threshold =
-               rounddown_pow_of_two(rcd->egrbufs.alloced / 2);
-       /*
-        * Compute the expected RcvArray entry base. This is done after
-        * allocating the eager buffers in order to maximize the
-        * expected RcvArray entries for the context.
-        */
-       max_entries = rcd->rcv_array_groups * dd->rcv_entries.group_size;
-       egrtop = roundup(rcd->egrbufs.alloced, dd->rcv_entries.group_size);
-       rcd->expected_count = max_entries - egrtop;
-       if (rcd->expected_count > MAX_TID_PAIR_ENTRIES * 2)
-               rcd->expected_count = MAX_TID_PAIR_ENTRIES * 2;
-
-       rcd->expected_base = rcd->eager_base + egrtop;
-       hfi1_cdbg(PROC, "ctxt%u: eager:%u, exp:%u, egrbase:%u, expbase:%u\n",
-                 rcd->ctxt, rcd->egrbufs.alloced, rcd->expected_count,
-                 rcd->eager_base, rcd->expected_base);
-
-       if (!hfi1_rcvbuf_validate(rcd->egrbufs.rcvtid_size, PT_EAGER, &order)) {
-               hfi1_cdbg(PROC,
-                         "ctxt%u: current Eager buffer size is invalid %u\n",
-                         rcd->ctxt, rcd->egrbufs.rcvtid_size);
-               /*
-                * NOTE(review): the buffers allocated above are not freed on
-                * this path; presumably the caller's context teardown
-                * releases them -- confirm.
-                */
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       for (idx = 0; idx < rcd->egrbufs.alloced; idx++) {
-               hfi1_put_tid(dd, rcd->eager_base + idx, PT_EAGER,
-                            rcd->egrbufs.rcvtids[idx].phys, order);
-               cond_resched();
-       }
-       goto bail;
-
-bail_rcvegrbuf_phys:
-       /* walk the backing buffers; the first NULL addr ends the walk */
-       for (idx = 0; idx < rcd->egrbufs.alloced &&
-            rcd->egrbufs.buffers[idx].addr;
-            idx++) {
-               dma_free_coherent(&dd->pcidev->dev,
-                                 rcd->egrbufs.buffers[idx].len,
-                                 rcd->egrbufs.buffers[idx].addr,
-                                 rcd->egrbufs.buffers[idx].phys);
-               rcd->egrbufs.buffers[idx].addr = NULL;
-               rcd->egrbufs.buffers[idx].phys = 0;
-               rcd->egrbufs.buffers[idx].len = 0;
-       }
-bail:
-       return ret;
-}
diff --git a/drivers/staging/rdma/hfi1/intr.c b/drivers/staging/rdma/hfi1/intr.c
deleted file mode 100644 (file)
index 65348d1..0000000
+++ /dev/null
@@ -1,200 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/pci.h>
-#include <linux/delay.h>
-
-#include "hfi.h"
-#include "common.h"
-#include "sdma.h"
-
-/**
- * format_hwmsg - append one bracketed hwerror description to a buffer
- * @msg: message buffer (must already hold a NUL-terminated string)
- * @msgl: total size of the message buffer
- * @hwmsg: description to append, emitted as "[hwmsg]"
- */
-static void format_hwmsg(char *msg, size_t msgl, const char *hwmsg)
-{
-       const char *const pieces[] = { "[", hwmsg, "]" };
-       size_t k;
-
-       /* every piece is appended with the same size bound */
-       for (k = 0; k < sizeof(pieces) / sizeof(pieces[0]); k++)
-               strlcat(msg, pieces[k], msgl);
-}
-
-/**
- * hfi1_format_hwerrors - format hardware error messages for display
- * @hwerrs: hardware errors bit vector
- * @hwerrmsgs: hardware error descriptions
- * @nhwerrmsgs: number of hwerrmsgs
- * @msg: message buffer
- * @msgl: message buffer length
- *
- * Appends "[description]" to @msg for every table entry whose mask
- * intersects @hwerrs.  Text is appended to whatever @msg already holds.
- */
-void hfi1_format_hwerrors(u64 hwerrs, const struct hfi1_hwerror_msgs *hwerrmsgs,
-                         size_t nhwerrmsgs, char *msg, size_t msgl)
-{
-       size_t i;       /* same type as nhwerrmsgs: no signed/unsigned compare */
-
-       for (i = 0; i < nhwerrmsgs; i++) {
-               if (hwerrs & hwerrmsgs[i].mask)
-                       format_hwmsg(msg, msgl, hwerrmsgs[i].msg);
-       }
-}
-
-static void signal_ib_event(struct hfi1_pportdata *ppd, enum ib_event_type ev)
-{
-       struct hfi1_devdata *devdata = ppd->dd;
-       struct ib_event ibev;
-
-       /*
-        * ib_dispatch_event() may only be used once the IB device is
-        * registered; HFI1_INITTED is the flag checked for that.  Without
-        * it the event is silently dropped.
-        */
-       if (devdata->flags & HFI1_INITTED) {
-               ibev.device = &devdata->verbs_dev.rdi.ibdev;
-               ibev.element.port_num = ppd->port;
-               ibev.event = ev;
-               ib_dispatch_event(&ibev);
-       }
-}
-
-/*
- * Handle a linkup or link down notification.
- * This is called outside an interrupt.
- *
- * Only port 0 (dd->pport[0]) is consulted.  Nothing happens unless the
- * requested state differs from ppd->linkup.  On link down an
- * IB_EVENT_PORT_ERR is signalled; no IB event is signalled from here on
- * link up.
- */
-void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup)
-{
-       struct hfi1_pportdata *ppd = &dd->pport[0];
-       enum ib_event_type ev;  /* only assigned and used on the link-down path */
-
-       if (!(ppd->linkup ^ !!linkup))
-               return; /* no change, nothing to do */
-
-       if (linkup) {
-               /*
-                * Quick linkup and all link up on the simulator does not
-                * trigger or implement:
-                *      - VerifyCap interrupt
-                *      - VerifyCap frames
-                * But rather moves directly to LinkUp.
-                *
-                * Do the work of the VerifyCap interrupt handler,
-                * handle_verify_cap(), but do not try moving the state to
-                * LinkUp as we are already there.
-                *
-                * NOTE: This uses this device's vAU, vCU, and vl15_init for
-                * the remote values.  Both sides must be using the values.
-                *
-                * NOTE(review): neighbor_type is masked with a _VAL_MASK
-                * constant while neighbor_port_number uses _VAL_SMASK --
-                * confirm the latter is intended.
-                */
-               if (quick_linkup || dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
-                       set_up_vl15(dd, dd->vau, dd->vl15_init);
-                       assign_remote_cm_au_table(dd, dd->vcu);
-                       ppd->neighbor_guid =
-                               read_csr(dd, DC_DC8051_STS_REMOTE_GUID);
-                       ppd->neighbor_type =
-                               read_csr(dd, DC_DC8051_STS_REMOTE_NODE_TYPE) &
-                                       DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK;
-                       ppd->neighbor_port_number =
-                               read_csr(dd, DC_DC8051_STS_REMOTE_PORT_NO) &
-                                        DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK;
-                       dd_dev_info(dd, "Neighbor GUID: %llx Neighbor type %d\n",
-                                   ppd->neighbor_guid,
-                                   ppd->neighbor_type);
-               }
-
-               /* physical link went up */
-               ppd->linkup = 1;
-               ppd->offline_disabled_reason =
-                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE);
-
-               /* link widths are not available until the link is fully up */
-               get_linkup_link_widths(ppd);
-
-       } else {
-               /* physical link went down */
-               ppd->linkup = 0;
-
-               /* clear HW details of the previous connection */
-               reset_link_credits(dd);
-
-               /* freeze after a link down to guarantee a clean egress */
-               start_freeze_handling(ppd, FREEZE_SELF | FREEZE_LINK_DOWN);
-
-               ev = IB_EVENT_PORT_ERR;
-
-               hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LINKDOWN_BIT);
-
-               /* if we are down, the neighbor is down */
-               ppd->neighbor_normal = 0;
-
-               /* notify IB of the link change */
-               signal_ib_event(ppd, ev);
-       }
-}
-
-/*
- * Receive/urgent interrupt handling for a user context: a process asked to
- * be woken on packet arrival rather than poll.  Runs entirely under
- * dd->uctxt_lock and does nothing when rcd->cnt is zero.
- */
-void handle_user_interrupt(struct hfi1_ctxtdata *rcd)
-{
-       struct hfi1_devdata *dd = rcd->dd;
-       unsigned long irqflags;
-
-       spin_lock_irqsave(&dd->uctxt_lock, irqflags);
-       if (rcd->cnt) {
-               /* a receive waiter takes priority over an urgent waiter */
-               if (test_and_clear_bit(HFI1_CTXT_WAITING_RCV,
-                                      &rcd->event_flags)) {
-                       wake_up_interruptible(&rcd->wait);
-                       hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_DIS, rcd->ctxt);
-               } else if (test_and_clear_bit(HFI1_CTXT_WAITING_URG,
-                                             &rcd->event_flags)) {
-                       rcd->urgent++;
-                       wake_up_interruptible(&rcd->wait);
-               }
-       }
-       spin_unlock_irqrestore(&dd->uctxt_lock, irqflags);
-}
diff --git a/drivers/staging/rdma/hfi1/iowait.h b/drivers/staging/rdma/hfi1/iowait.h
deleted file mode 100644 (file)
index 2ec6ef3..0000000
+++ /dev/null
@@ -1,300 +0,0 @@
-#ifndef _HFI1_IOWAIT_H
-#define _HFI1_IOWAIT_H
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/list.h>
-#include <linux/workqueue.h>
-#include <linux/sched.h>
-
-#include "sdma_txreq.h"
-
-/*
- * typedef (*restart_t)() - restart callback
- * @work: pointer to work structure
- */
-typedef void (*restart_t)(struct work_struct *work);
-
-struct sdma_txreq;
-struct sdma_engine;
-/**
- * struct iowait - linkage for delayed progress/waiting
- * @list: used to add/insert into QP/PQ wait lists
- * @tx_head: overflow list of sdma_txreq's
- * @sleep: no space callback
- * @wakeup: space callback wakeup
- * @sdma_drained: sdma count drained
- * @iowork: workqueue overhead
- * @wait_dma: wait for sdma_busy == 0
- * @wait_pio: wait for pio_busy == 0
- * @sdma_busy: # of packets in flight
- * @count: total number of descriptors in tx_head'ed list
- * @tx_limit: limit for overflow queuing
- * @tx_count: number of tx entry's in tx_head'ed list
- *
- * This is to be embedded in user's state structure
- * (QP or PQ).
- *
- * The sleep and wakeup members are a
- * bit misnamed.   They do not strictly
- * speaking sleep or wake up, but they
- * are callbacks for the ULP to implement
- * what ever queuing/dequeuing of
- * the embedded iowait and its containing struct
- * when a resource shortage like SDMA ring space is seen.
- *
- * Both potentially have locks help
- * so sleeping is not allowed.
- *
- * The wait_dma member along with the iow
- */
-
-struct iowait {
-       struct list_head list;
-       struct list_head tx_head;
-       int (*sleep)(
-               struct sdma_engine *sde,
-               struct iowait *wait,
-               struct sdma_txreq *tx,
-               unsigned seq);
-       void (*wakeup)(struct iowait *wait, int reason);
-       void (*sdma_drained)(struct iowait *wait);
-       struct work_struct iowork;
-       wait_queue_head_t wait_dma;
-       wait_queue_head_t wait_pio;
-       atomic_t sdma_busy;
-       atomic_t pio_busy;
-       u32 count;
-       u32 tx_limit;
-       u32 tx_count;
-};
-
-#define SDMA_AVAIL_REASON 0
-
-/**
- * iowait_init() - initialize wait structure
- * @wait: wait struct to initialize
- * @tx_limit: limit for overflow queuing
- * @func: restart function for workqueue
- * @sleep: sleep function for no space
- * @resume: wakeup function for no space
- *
- * This function initializes the iowait
- * structure embedded in the QP or PQ.
- *
- */
-
-static inline void iowait_init(
-       struct iowait *wait,
-       u32 tx_limit,
-       void (*func)(struct work_struct *work),
-       int (*sleep)(
-               struct sdma_engine *sde,
-               struct iowait *wait,
-               struct sdma_txreq *tx,
-               unsigned seq),
-       void (*wakeup)(struct iowait *wait, int reason),
-       void (*sdma_drained)(struct iowait *wait))
-{
-       wait->count = 0;
-       INIT_LIST_HEAD(&wait->list);
-       INIT_LIST_HEAD(&wait->tx_head);
-       INIT_WORK(&wait->iowork, func);
-       init_waitqueue_head(&wait->wait_dma);
-       init_waitqueue_head(&wait->wait_pio);
-       atomic_set(&wait->sdma_busy, 0);
-       atomic_set(&wait->pio_busy, 0);
-       wait->tx_limit = tx_limit;
-       wait->sleep = sleep;
-       wait->wakeup = wakeup;
-       wait->sdma_drained = sdma_drained;
-}
-
-/**
- * iowait_schedule() - initialize wait structure
- * @wait: wait struct to schedule
- * @wq: workqueue for schedule
- * @cpu: cpu
- */
-static inline void iowait_schedule(
-       struct iowait *wait,
-       struct workqueue_struct *wq,
-       int cpu)
-{
-       queue_work_on(cpu, wq, &wait->iowork);
-}
-
-/**
- * iowait_sdma_drain() - wait for DMAs to drain
- *
- * @wait: iowait structure
- *
- * This will delay until the iowait sdmas have
- * completed.
- */
-static inline void iowait_sdma_drain(struct iowait *wait)
-{
-       wait_event(wait->wait_dma, !atomic_read(&wait->sdma_busy));
-}
-
-/**
- * iowait_sdma_pending() - return sdma pending count
- *
- * @wait: iowait structure
- *
- */
-static inline int iowait_sdma_pending(struct iowait *wait)
-{
-       return atomic_read(&wait->sdma_busy);
-}
-
-/**
- * iowait_sdma_inc - note sdma io pending
- * @wait: iowait structure
- */
-static inline void iowait_sdma_inc(struct iowait *wait)
-{
-       atomic_inc(&wait->sdma_busy);
-}
-
-/**
- * iowait_sdma_add - add count to pending
- * @wait: iowait structure
- */
-static inline void iowait_sdma_add(struct iowait *wait, int count)
-{
-       atomic_add(count, &wait->sdma_busy);
-}
-
-/**
- * iowait_sdma_dec - note sdma complete
- * @wait: iowait structure
- */
-static inline int iowait_sdma_dec(struct iowait *wait)
-{
-       return atomic_dec_and_test(&wait->sdma_busy);
-}
-
-/**
- * iowait_pio_drain() - wait for pios to drain
- *
- * @wait: iowait structure
- *
- * This will delay until the iowait pios have
- * completed.
- */
-static inline void iowait_pio_drain(struct iowait *wait)
-{
-       wait_event_timeout(wait->wait_pio,
-                          !atomic_read(&wait->pio_busy),
-                          HZ);
-}
-
-/**
- * iowait_pio_pending() - return pio pending count
- *
- * @wait: iowait structure
- *
- */
-static inline int iowait_pio_pending(struct iowait *wait)
-{
-       return atomic_read(&wait->pio_busy);
-}
-
-/**
- * iowait_pio_inc - note pio pending
- * @wait: iowait structure
- */
-static inline void iowait_pio_inc(struct iowait *wait)
-{
-       atomic_inc(&wait->pio_busy);
-}
-
-/**
- * iowait_sdma_dec - note pio complete
- * @wait: iowait structure
- */
-static inline int iowait_pio_dec(struct iowait *wait)
-{
-       return atomic_dec_and_test(&wait->pio_busy);
-}
-
-/**
- * iowait_drain_wakeup() - trigger iowait_drain() waiter
- *
- * @wait: iowait structure
- *
- * This will trigger any waiters.
- */
-static inline void iowait_drain_wakeup(struct iowait *wait)
-{
-       wake_up(&wait->wait_dma);
-       wake_up(&wait->wait_pio);
-       if (wait->sdma_drained)
-               wait->sdma_drained(wait);
-}
-
-/**
- * iowait_get_txhead() - get packet off of iowait list
- *
- * @wait wait struture
- */
-static inline struct sdma_txreq *iowait_get_txhead(struct iowait *wait)
-{
-       struct sdma_txreq *tx = NULL;
-
-       if (!list_empty(&wait->tx_head)) {
-               tx = list_first_entry(
-                       &wait->tx_head,
-                       struct sdma_txreq,
-                       list);
-               list_del_init(&tx->list);
-       }
-       return tx;
-}
-
-#endif
diff --git a/drivers/staging/rdma/hfi1/mad.c b/drivers/staging/rdma/hfi1/mad.c
deleted file mode 100644 (file)
index ed58cf2..0000000
+++ /dev/null
@@ -1,4416 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/net.h>
-#define OPA_NUM_PKEY_BLOCKS_PER_SMP (OPA_SMP_DR_DATA_SIZE \
-                       / (OPA_PARTITION_TABLE_BLK_SIZE * sizeof(u16)))
-
-#include "hfi.h"
-#include "mad.h"
-#include "trace.h"
-#include "qp.h"
-
-/* the reset value from the FM is supposed to be 0xffff, handle both */
-#define OPA_LINK_WIDTH_RESET_OLD 0x0fff
-#define OPA_LINK_WIDTH_RESET 0xffff
-
-static int reply(struct ib_mad_hdr *smp)
-{
-       /*
-        * The verbs framework will handle the directed/LID route
-        * packet changes.
-        */
-       smp->method = IB_MGMT_METHOD_GET_RESP;
-       if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
-               smp->status |= IB_SMP_DIRECTION;
-       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
-}
-
-static inline void clear_opa_smp_data(struct opa_smp *smp)
-{
-       void *data = opa_get_smp_data(smp);
-       size_t size = opa_get_smp_data_size(smp);
-
-       memset(data, 0, size);
-}
-
-static void send_trap(struct hfi1_ibport *ibp, void *data, unsigned len)
-{
-       struct ib_mad_send_buf *send_buf;
-       struct ib_mad_agent *agent;
-       struct opa_smp *smp;
-       int ret;
-       unsigned long flags;
-       unsigned long timeout;
-       int pkey_idx;
-       u32 qpn = ppd_from_ibp(ibp)->sm_trap_qp;
-
-       agent = ibp->rvp.send_agent;
-       if (!agent)
-               return;
-
-       /* o14-3.2.1 */
-       if (ppd_from_ibp(ibp)->lstate != IB_PORT_ACTIVE)
-               return;
-
-       /* o14-2 */
-       if (ibp->rvp.trap_timeout && time_before(jiffies,
-                                                ibp->rvp.trap_timeout))
-               return;
-
-       pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY);
-       if (pkey_idx < 0) {
-               pr_warn("%s: failed to find limited mgmt pkey, defaulting 0x%x\n",
-                       __func__, hfi1_get_pkey(ibp, 1));
-               pkey_idx = 1;
-       }
-
-       send_buf = ib_create_send_mad(agent, qpn, pkey_idx, 0,
-                                     IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
-                                     GFP_ATOMIC, IB_MGMT_BASE_VERSION);
-       if (IS_ERR(send_buf))
-               return;
-
-       smp = send_buf->mad;
-       smp->base_version = OPA_MGMT_BASE_VERSION;
-       smp->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
-       smp->class_version = OPA_SMI_CLASS_VERSION;
-       smp->method = IB_MGMT_METHOD_TRAP;
-       ibp->rvp.tid++;
-       smp->tid = cpu_to_be64(ibp->rvp.tid);
-       smp->attr_id = IB_SMP_ATTR_NOTICE;
-       /* o14-1: smp->mkey = 0; */
-       memcpy(smp->route.lid.data, data, len);
-
-       spin_lock_irqsave(&ibp->rvp.lock, flags);
-       if (!ibp->rvp.sm_ah) {
-               if (ibp->rvp.sm_lid != be16_to_cpu(IB_LID_PERMISSIVE)) {
-                       struct ib_ah *ah;
-
-                       ah = hfi1_create_qp0_ah(ibp, ibp->rvp.sm_lid);
-                       if (IS_ERR(ah)) {
-                               ret = PTR_ERR(ah);
-                       } else {
-                               send_buf->ah = ah;
-                               ibp->rvp.sm_ah = ibah_to_rvtah(ah);
-                               ret = 0;
-                       }
-               } else {
-                       ret = -EINVAL;
-               }
-       } else {
-               send_buf->ah = &ibp->rvp.sm_ah->ibah;
-               ret = 0;
-       }
-       spin_unlock_irqrestore(&ibp->rvp.lock, flags);
-
-       if (!ret)
-               ret = ib_post_send_mad(send_buf, NULL);
-       if (!ret) {
-               /* 4.096 usec. */
-               timeout = (4096 * (1UL << ibp->rvp.subnet_timeout)) / 1000;
-               ibp->rvp.trap_timeout = jiffies + usecs_to_jiffies(timeout);
-       } else {
-               ib_free_send_mad(send_buf);
-               ibp->rvp.trap_timeout = 0;
-       }
-}
-
-/*
- * Send a bad [PQ]_Key trap (ch. 14.3.8).
- */
-void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl,
-                   u32 qp1, u32 qp2, u16 lid1, u16 lid2)
-{
-       struct opa_mad_notice_attr data;
-       u32 lid = ppd_from_ibp(ibp)->lid;
-       u32 _lid1 = lid1;
-       u32 _lid2 = lid2;
-
-       memset(&data, 0, sizeof(data));
-
-       if (trap_num == OPA_TRAP_BAD_P_KEY)
-               ibp->rvp.pkey_violations++;
-       else
-               ibp->rvp.qkey_violations++;
-       ibp->rvp.n_pkt_drops++;
-
-       /* Send violation trap */
-       data.generic_type = IB_NOTICE_TYPE_SECURITY;
-       data.prod_type_lsb = IB_NOTICE_PROD_CA;
-       data.trap_num = trap_num;
-       data.issuer_lid = cpu_to_be32(lid);
-       data.ntc_257_258.lid1 = cpu_to_be32(_lid1);
-       data.ntc_257_258.lid2 = cpu_to_be32(_lid2);
-       data.ntc_257_258.key = cpu_to_be32(key);
-       data.ntc_257_258.sl = sl << 3;
-       data.ntc_257_258.qp1 = cpu_to_be32(qp1);
-       data.ntc_257_258.qp2 = cpu_to_be32(qp2);
-
-       send_trap(ibp, &data, sizeof(data));
-}
-
-/*
- * Send a bad M_Key trap (ch. 14.3.9).
- */
-static void bad_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad,
-                    __be64 mkey, __be32 dr_slid, u8 return_path[], u8 hop_cnt)
-{
-       struct opa_mad_notice_attr data;
-       u32 lid = ppd_from_ibp(ibp)->lid;
-
-       memset(&data, 0, sizeof(data));
-       /* Send violation trap */
-       data.generic_type = IB_NOTICE_TYPE_SECURITY;
-       data.prod_type_lsb = IB_NOTICE_PROD_CA;
-       data.trap_num = OPA_TRAP_BAD_M_KEY;
-       data.issuer_lid = cpu_to_be32(lid);
-       data.ntc_256.lid = data.issuer_lid;
-       data.ntc_256.method = mad->method;
-       data.ntc_256.attr_id = mad->attr_id;
-       data.ntc_256.attr_mod = mad->attr_mod;
-       data.ntc_256.mkey = mkey;
-       if (mad->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
-               data.ntc_256.dr_slid = dr_slid;
-               data.ntc_256.dr_trunc_hop = IB_NOTICE_TRAP_DR_NOTICE;
-               if (hop_cnt > ARRAY_SIZE(data.ntc_256.dr_rtn_path)) {
-                       data.ntc_256.dr_trunc_hop |=
-                               IB_NOTICE_TRAP_DR_TRUNC;
-                       hop_cnt = ARRAY_SIZE(data.ntc_256.dr_rtn_path);
-               }
-               data.ntc_256.dr_trunc_hop |= hop_cnt;
-               memcpy(data.ntc_256.dr_rtn_path, return_path,
-                      hop_cnt);
-       }
-
-       send_trap(ibp, &data, sizeof(data));
-}
-
-/*
- * Send a Port Capability Mask Changed trap (ch. 14.3.11).
- */
-void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num)
-{
-       struct opa_mad_notice_attr data;
-       struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
-       struct hfi1_devdata *dd = dd_from_dev(verbs_dev);
-       struct hfi1_ibport *ibp = &dd->pport[port_num - 1].ibport_data;
-       u32 lid = ppd_from_ibp(ibp)->lid;
-
-       memset(&data, 0, sizeof(data));
-
-       data.generic_type = IB_NOTICE_TYPE_INFO;
-       data.prod_type_lsb = IB_NOTICE_PROD_CA;
-       data.trap_num = OPA_TRAP_CHANGE_CAPABILITY;
-       data.issuer_lid = cpu_to_be32(lid);
-       data.ntc_144.lid = data.issuer_lid;
-       data.ntc_144.new_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags);
-
-       send_trap(ibp, &data, sizeof(data));
-}
-
-/*
- * Send a System Image GUID Changed trap (ch. 14.3.12).
- */
-void hfi1_sys_guid_chg(struct hfi1_ibport *ibp)
-{
-       struct opa_mad_notice_attr data;
-       u32 lid = ppd_from_ibp(ibp)->lid;
-
-       memset(&data, 0, sizeof(data));
-
-       data.generic_type = IB_NOTICE_TYPE_INFO;
-       data.prod_type_lsb = IB_NOTICE_PROD_CA;
-       data.trap_num = OPA_TRAP_CHANGE_SYSGUID;
-       data.issuer_lid = cpu_to_be32(lid);
-       data.ntc_145.new_sys_guid = ib_hfi1_sys_image_guid;
-       data.ntc_145.lid = data.issuer_lid;
-
-       send_trap(ibp, &data, sizeof(data));
-}
-
-/*
- * Send a Node Description Changed trap (ch. 14.3.13).
- */
-void hfi1_node_desc_chg(struct hfi1_ibport *ibp)
-{
-       struct opa_mad_notice_attr data;
-       u32 lid = ppd_from_ibp(ibp)->lid;
-
-       memset(&data, 0, sizeof(data));
-
-       data.generic_type = IB_NOTICE_TYPE_INFO;
-       data.prod_type_lsb = IB_NOTICE_PROD_CA;
-       data.trap_num = OPA_TRAP_CHANGE_CAPABILITY;
-       data.issuer_lid = cpu_to_be32(lid);
-       data.ntc_144.lid = data.issuer_lid;
-       data.ntc_144.change_flags =
-               cpu_to_be16(OPA_NOTICE_TRAP_NODE_DESC_CHG);
-
-       send_trap(ibp, &data, sizeof(data));
-}
-
-static int __subn_get_opa_nodedesc(struct opa_smp *smp, u32 am,
-                                  u8 *data, struct ib_device *ibdev,
-                                  u8 port, u32 *resp_len)
-{
-       struct opa_node_description *nd;
-
-       if (am) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       nd = (struct opa_node_description *)data;
-
-       memcpy(nd->data, ibdev->node_desc, sizeof(nd->data));
-
-       if (resp_len)
-               *resp_len += sizeof(*nd);
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-static int __subn_get_opa_nodeinfo(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
-                                  u32 *resp_len)
-{
-       struct opa_node_info *ni;
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       unsigned pidx = port - 1; /* IB number port from 1, hw from 0 */
-
-       ni = (struct opa_node_info *)data;
-
-       /* GUID 0 is illegal */
-       if (am || pidx >= dd->num_pports || dd->pport[pidx].guid == 0) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       ni->port_guid = cpu_to_be64(dd->pport[pidx].guid);
-       ni->base_version = OPA_MGMT_BASE_VERSION;
-       ni->class_version = OPA_SMI_CLASS_VERSION;
-       ni->node_type = 1;     /* channel adapter */
-       ni->num_ports = ibdev->phys_port_cnt;
-       /* This is already in network order */
-       ni->system_image_guid = ib_hfi1_sys_image_guid;
-       /* Use first-port GUID as node */
-       ni->node_guid = cpu_to_be64(dd->pport->guid);
-       ni->partition_cap = cpu_to_be16(hfi1_get_npkeys(dd));
-       ni->device_id = cpu_to_be16(dd->pcidev->device);
-       ni->revision = cpu_to_be32(dd->minrev);
-       ni->local_port_num = port;
-       ni->vendor_id[0] = dd->oui1;
-       ni->vendor_id[1] = dd->oui2;
-       ni->vendor_id[2] = dd->oui3;
-
-       if (resp_len)
-               *resp_len += sizeof(*ni);
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-static int subn_get_nodeinfo(struct ib_smp *smp, struct ib_device *ibdev,
-                            u8 port)
-{
-       struct ib_node_info *nip = (struct ib_node_info *)&smp->data;
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       unsigned pidx = port - 1; /* IB number port from 1, hw from 0 */
-
-       /* GUID 0 is illegal */
-       if (smp->attr_mod || pidx >= dd->num_pports ||
-           dd->pport[pidx].guid == 0)
-               smp->status |= IB_SMP_INVALID_FIELD;
-       else
-               nip->port_guid = cpu_to_be64(dd->pport[pidx].guid);
-
-       nip->base_version = OPA_MGMT_BASE_VERSION;
-       nip->class_version = OPA_SMI_CLASS_VERSION;
-       nip->node_type = 1;     /* channel adapter */
-       nip->num_ports = ibdev->phys_port_cnt;
-       /* This is already in network order */
-       nip->sys_guid = ib_hfi1_sys_image_guid;
-        /* Use first-port GUID as node */
-       nip->node_guid = cpu_to_be64(dd->pport->guid);
-       nip->partition_cap = cpu_to_be16(hfi1_get_npkeys(dd));
-       nip->device_id = cpu_to_be16(dd->pcidev->device);
-       nip->revision = cpu_to_be32(dd->minrev);
-       nip->local_port_num = port;
-       nip->vendor_id[0] = dd->oui1;
-       nip->vendor_id[1] = dd->oui2;
-       nip->vendor_id[2] = dd->oui3;
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-static void set_link_width_enabled(struct hfi1_pportdata *ppd, u32 w)
-{
-       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LWID_ENB, w);
-}
-
-static void set_link_width_downgrade_enabled(struct hfi1_pportdata *ppd, u32 w)
-{
-       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LWID_DG_ENB, w);
-}
-
-static void set_link_speed_enabled(struct hfi1_pportdata *ppd, u32 s)
-{
-       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_SPD_ENB, s);
-}
-
-static int check_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad,
-                     int mad_flags, __be64 mkey, __be32 dr_slid,
-                     u8 return_path[], u8 hop_cnt)
-{
-       int valid_mkey = 0;
-       int ret = 0;
-
-       /* Is the mkey in the process of expiring? */
-       if (ibp->rvp.mkey_lease_timeout &&
-           time_after_eq(jiffies, ibp->rvp.mkey_lease_timeout)) {
-               /* Clear timeout and mkey protection field. */
-               ibp->rvp.mkey_lease_timeout = 0;
-               ibp->rvp.mkeyprot = 0;
-       }
-
-       if ((mad_flags & IB_MAD_IGNORE_MKEY) ||  ibp->rvp.mkey == 0 ||
-           ibp->rvp.mkey == mkey)
-               valid_mkey = 1;
-
-       /* Unset lease timeout on any valid Get/Set/TrapRepress */
-       if (valid_mkey && ibp->rvp.mkey_lease_timeout &&
-           (mad->method == IB_MGMT_METHOD_GET ||
-            mad->method == IB_MGMT_METHOD_SET ||
-            mad->method == IB_MGMT_METHOD_TRAP_REPRESS))
-               ibp->rvp.mkey_lease_timeout = 0;
-
-       if (!valid_mkey) {
-               switch (mad->method) {
-               case IB_MGMT_METHOD_GET:
-                       /* Bad mkey not a violation below level 2 */
-                       if (ibp->rvp.mkeyprot < 2)
-                               break;
-               case IB_MGMT_METHOD_SET:
-               case IB_MGMT_METHOD_TRAP_REPRESS:
-                       if (ibp->rvp.mkey_violations != 0xFFFF)
-                               ++ibp->rvp.mkey_violations;
-                       if (!ibp->rvp.mkey_lease_timeout &&
-                           ibp->rvp.mkey_lease_period)
-                               ibp->rvp.mkey_lease_timeout = jiffies +
-                                       ibp->rvp.mkey_lease_period * HZ;
-                       /* Generate a trap notice. */
-                       bad_mkey(ibp, mad, mkey, dr_slid, return_path,
-                                hop_cnt);
-                       ret = 1;
-               }
-       }
-
-       return ret;
-}
-
-/*
- * The SMA caches reads from LCB registers in case the LCB is unavailable.
- * (The LCB is unavailable in certain link states, for example.)
- */
-struct lcb_datum {
-       u32 off;
-       u64 val;
-};
-
-static struct lcb_datum lcb_cache[] = {
-       { DC_LCB_STS_ROUND_TRIP_LTP_CNT, 0 },
-};
-
-static int write_lcb_cache(u32 off, u64 val)
-{
-       int i;
-
-       for (i = 0; i < ARRAY_SIZE(lcb_cache); i++) {
-               if (lcb_cache[i].off == off) {
-                       lcb_cache[i].val = val;
-                       return 0;
-               }
-       }
-
-       pr_warn("%s bad offset 0x%x\n", __func__, off);
-       return -1;
-}
-
-static int read_lcb_cache(u32 off, u64 *val)
-{
-       int i;
-
-       for (i = 0; i < ARRAY_SIZE(lcb_cache); i++) {
-               if (lcb_cache[i].off == off) {
-                       *val = lcb_cache[i].val;
-                       return 0;
-               }
-       }
-
-       pr_warn("%s bad offset 0x%x\n", __func__, off);
-       return -1;
-}
-
-void read_ltp_rtt(struct hfi1_devdata *dd)
-{
-       u64 reg;
-
-       if (read_lcb_csr(dd, DC_LCB_STS_ROUND_TRIP_LTP_CNT, &reg))
-               dd_dev_err(dd, "%s: unable to read LTP RTT\n", __func__);
-       else
-               write_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, reg);
-}
-
-static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
-                                  u32 *resp_len)
-{
-       int i;
-       struct hfi1_devdata *dd;
-       struct hfi1_pportdata *ppd;
-       struct hfi1_ibport *ibp;
-       struct opa_port_info *pi = (struct opa_port_info *)data;
-       u8 mtu;
-       u8 credit_rate;
-       u8 is_beaconing_active;
-       u32 state;
-       u32 num_ports = OPA_AM_NPORT(am);
-       u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
-       u32 buffer_units;
-       u64 tmp = 0;
-
-       if (num_ports != 1) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       dd = dd_from_ibdev(ibdev);
-       /* IB numbers ports from 1, hw from 0 */
-       ppd = dd->pport + (port - 1);
-       ibp = &ppd->ibport_data;
-
-       if (ppd->vls_supported / 2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) ||
-           ppd->vls_supported > ARRAY_SIZE(dd->vld)) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       pi->lid = cpu_to_be32(ppd->lid);
-
-       /* Only return the mkey if the protection field allows it. */
-       if (!(smp->method == IB_MGMT_METHOD_GET &&
-             ibp->rvp.mkey != smp->mkey &&
-             ibp->rvp.mkeyprot == 1))
-               pi->mkey = ibp->rvp.mkey;
-
-       pi->subnet_prefix = ibp->rvp.gid_prefix;
-       pi->sm_lid = cpu_to_be32(ibp->rvp.sm_lid);
-       pi->ib_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags);
-       pi->mkey_lease_period = cpu_to_be16(ibp->rvp.mkey_lease_period);
-       pi->sm_trap_qp = cpu_to_be32(ppd->sm_trap_qp);
-       pi->sa_qp = cpu_to_be32(ppd->sa_qp);
-
-       pi->link_width.enabled = cpu_to_be16(ppd->link_width_enabled);
-       pi->link_width.supported = cpu_to_be16(ppd->link_width_supported);
-       pi->link_width.active = cpu_to_be16(ppd->link_width_active);
-
-       pi->link_width_downgrade.supported =
-                       cpu_to_be16(ppd->link_width_downgrade_supported);
-       pi->link_width_downgrade.enabled =
-                       cpu_to_be16(ppd->link_width_downgrade_enabled);
-       pi->link_width_downgrade.tx_active =
-                       cpu_to_be16(ppd->link_width_downgrade_tx_active);
-       pi->link_width_downgrade.rx_active =
-                       cpu_to_be16(ppd->link_width_downgrade_rx_active);
-
-       pi->link_speed.supported = cpu_to_be16(ppd->link_speed_supported);
-       pi->link_speed.active = cpu_to_be16(ppd->link_speed_active);
-       pi->link_speed.enabled = cpu_to_be16(ppd->link_speed_enabled);
-
-       state = driver_lstate(ppd);
-
-       if (start_of_sm_config && (state == IB_PORT_INIT))
-               ppd->is_sm_config_started = 1;
-
-       pi->port_phys_conf = (ppd->port_type & 0xf);
-
-#if PI_LED_ENABLE_SUP
-       pi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4;
-       pi->port_states.ledenable_offlinereason |=
-               ppd->is_sm_config_started << 5;
-       /*
-        * This pairs with the memory barrier in hfi1_start_led_override to
-        * ensure that we read the correct state of LED beaconing represented
-        * by led_override_timer_active
-        */
-       smp_rmb();
-       is_beaconing_active = !!atomic_read(&ppd->led_override_timer_active);
-       pi->port_states.ledenable_offlinereason |= is_beaconing_active << 6;
-       pi->port_states.ledenable_offlinereason |=
-               ppd->offline_disabled_reason;
-#else
-       pi->port_states.offline_reason = ppd->neighbor_normal << 4;
-       pi->port_states.offline_reason |= ppd->is_sm_config_started << 5;
-       pi->port_states.offline_reason |= ppd->offline_disabled_reason;
-#endif /* PI_LED_ENABLE_SUP */
-
-       pi->port_states.portphysstate_portstate =
-               (hfi1_ibphys_portstate(ppd) << 4) | state;
-
-       pi->mkeyprotect_lmc = (ibp->rvp.mkeyprot << 6) | ppd->lmc;
-
-       memset(pi->neigh_mtu.pvlx_to_mtu, 0, sizeof(pi->neigh_mtu.pvlx_to_mtu));
-       for (i = 0; i < ppd->vls_supported; i++) {
-               mtu = mtu_to_enum(dd->vld[i].mtu, HFI1_DEFAULT_ACTIVE_MTU);
-               if ((i % 2) == 0)
-                       pi->neigh_mtu.pvlx_to_mtu[i / 2] |= (mtu << 4);
-               else
-                       pi->neigh_mtu.pvlx_to_mtu[i / 2] |= mtu;
-       }
-       /* don't forget VL 15 */
-       mtu = mtu_to_enum(dd->vld[15].mtu, 2048);
-       pi->neigh_mtu.pvlx_to_mtu[15 / 2] |= mtu;
-       pi->smsl = ibp->rvp.sm_sl & OPA_PI_MASK_SMSL;
-       pi->operational_vls = hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS);
-       pi->partenforce_filterraw |=
-               (ppd->linkinit_reason & OPA_PI_MASK_LINKINIT_REASON);
-       if (ppd->part_enforce & HFI1_PART_ENFORCE_IN)
-               pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_IN;
-       if (ppd->part_enforce & HFI1_PART_ENFORCE_OUT)
-               pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_OUT;
-       pi->mkey_violations = cpu_to_be16(ibp->rvp.mkey_violations);
-       /* P_KeyViolations are counted by hardware. */
-       pi->pkey_violations = cpu_to_be16(ibp->rvp.pkey_violations);
-       pi->qkey_violations = cpu_to_be16(ibp->rvp.qkey_violations);
-
-       pi->vl.cap = ppd->vls_supported;
-       pi->vl.high_limit = cpu_to_be16(ibp->rvp.vl_high_limit);
-       pi->vl.arb_high_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_HIGH_CAP);
-       pi->vl.arb_low_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_LOW_CAP);
-
-       pi->clientrereg_subnettimeout = ibp->rvp.subnet_timeout;
-
-       pi->port_link_mode  = cpu_to_be16(OPA_PORT_LINK_MODE_OPA << 10 |
-                                         OPA_PORT_LINK_MODE_OPA << 5 |
-                                         OPA_PORT_LINK_MODE_OPA);
-
-       pi->port_ltp_crc_mode = cpu_to_be16(ppd->port_ltp_crc_mode);
-
-       pi->port_mode = cpu_to_be16(
-                               ppd->is_active_optimize_enabled ?
-                                       OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE : 0);
-
-       pi->port_packet_format.supported =
-               cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B);
-       pi->port_packet_format.enabled =
-               cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B);
-
-       /* flit_control.interleave is (OPA V1, version .76):
-        * bits         use
-        * ----         ---
-        * 2            res
-        * 2            DistanceSupported
-        * 2            DistanceEnabled
-        * 5            MaxNestLevelTxEnabled
-        * 5            MaxNestLevelRxSupported
-        *
-        * HFI supports only "distance mode 1" (see OPA V1, version .76,
-        * section 9.6.2), so set DistanceSupported, DistanceEnabled
-        * to 0x1.
-        */
-       pi->flit_control.interleave = cpu_to_be16(0x1400);
-
-       pi->link_down_reason = ppd->local_link_down_reason.sma;
-       pi->neigh_link_down_reason = ppd->neigh_link_down_reason.sma;
-       pi->port_error_action = cpu_to_be32(ppd->port_error_action);
-       pi->mtucap = mtu_to_enum(hfi1_max_mtu, IB_MTU_4096);
-
-       /* 32.768 usec. response time (guessing) */
-       pi->resptimevalue = 3;
-
-       pi->local_port_num = port;
-
-       /* buffer info for FM */
-       pi->overall_buffer_space = cpu_to_be16(dd->link_credits);
-
-       pi->neigh_node_guid = cpu_to_be64(ppd->neighbor_guid);
-       pi->neigh_port_num = ppd->neighbor_port_number;
-       pi->port_neigh_mode =
-               (ppd->neighbor_type & OPA_PI_MASK_NEIGH_NODE_TYPE) |
-               (ppd->mgmt_allowed ? OPA_PI_MASK_NEIGH_MGMT_ALLOWED : 0) |
-               (ppd->neighbor_fm_security ?
-                       OPA_PI_MASK_NEIGH_FW_AUTH_BYPASS : 0);
-
-       /* HFIs shall always return VL15 credits to their
-        * neighbor in a timely manner, without any credit return pacing.
-        */
-       credit_rate = 0;
-       buffer_units  = (dd->vau) & OPA_PI_MASK_BUF_UNIT_BUF_ALLOC;
-       buffer_units |= (dd->vcu << 3) & OPA_PI_MASK_BUF_UNIT_CREDIT_ACK;
-       buffer_units |= (credit_rate << 6) &
-                               OPA_PI_MASK_BUF_UNIT_VL15_CREDIT_RATE;
-       buffer_units |= (dd->vl15_init << 11) & OPA_PI_MASK_BUF_UNIT_VL15_INIT;
-       pi->buffer_units = cpu_to_be32(buffer_units);
-
-       pi->opa_cap_mask = cpu_to_be16(OPA_CAP_MASK3_IsSharedSpaceSupported);
-
-       /* HFI supports a replay buffer 128 LTPs in size */
-       pi->replay_depth.buffer = 0x80;
-       /* read the cached value of DC_LCB_STS_ROUND_TRIP_LTP_CNT */
-       read_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, &tmp);
-
-       /*
-        * this counter is 16 bits wide, but the replay_depth.wire
-        * variable is only 8 bits
-        */
-       if (tmp > 0xff)
-               tmp = 0xff;
-       pi->replay_depth.wire = tmp;
-
-       if (resp_len)
-               *resp_len += sizeof(struct opa_port_info);
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-/**
- * get_pkeys - copy a port's PKEY table to the caller
- * @dd: the hfi1_ib device
- * @port: the IB port number (ports are numbered from 1)
- * @pkeys: destination buffer; receives sizeof(ppd->pkeys) bytes
- *
- * Return: always 0.
- */
-static int get_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys)
-{
-       /* the pport array is indexed from 0, IB port numbers from 1 */
-       const struct hfi1_pportdata *src = &dd->pport[port - 1];
-
-       memcpy(pkeys, src->pkeys, sizeof(src->pkeys));
-       return 0;
-}
-
-/*
- * Handle a Get of the OPA PKey table.
- *
- * The attribute modifier @am carries the number of blocks requested and
- * the starting block.  Only a request that starts at block 0 returns
- * data: the pkeys are written to @data in big-endian order and, when
- * @resp_len is non-NULL, *@resp_len grows by the size of the requested
- * blocks.  Every other request is flagged with IB_SMP_INVALID_FIELD.
- */
-static int __subn_get_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data,
-                                   struct ib_device *ibdev, u8 port,
-                                   u32 *resp_len)
-{
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       u32 n_blocks_req = OPA_AM_NBLK(am);
-       u32 start_block = am & 0x7ff;
-       unsigned npkeys = hfi1_get_npkeys(dd);
-       /* both views alias @data: the keys are byte-swapped in place */
-       u16 *host_keys = (u16 *)data;
-       __be16 *wire_keys = (__be16 *)data;
-       u16 n_blocks_avail;
-       int i;
-
-       if (!n_blocks_req) {
-               pr_warn("OPA Get PKey AM Invalid : P = %d; B = 0x%x; N = 0x%x\n",
-                       port, start_block, n_blocks_req);
-               goto invalid;
-       }
-
-       n_blocks_avail = (u16)(npkeys / OPA_PARTITION_TABLE_BLK_SIZE) + 1;
-       if (start_block + n_blocks_req > n_blocks_avail ||
-           n_blocks_req > OPA_NUM_PKEY_BLOCKS_PER_SMP) {
-               pr_warn("OPA Get PKey AM Invalid : s 0x%x; req 0x%x; avail 0x%x; blk/smp 0x%lx\n",
-                       start_block, n_blocks_req, n_blocks_avail,
-                       OPA_NUM_PKEY_BLOCKS_PER_SMP);
-               goto invalid;
-       }
-
-       /* the real pkeys are only handed out with the first block */
-       if (start_block != 0)
-               goto invalid;
-
-       get_pkeys(dd, port, host_keys);
-       for (i = 0; i < npkeys; i++)
-               wire_keys[i] = cpu_to_be16(host_keys[i]);
-       if (resp_len)
-               *resp_len += (n_blocks_req * OPA_PARTITION_TABLE_BLK_SIZE) *
-                            sizeof(u16);
-       return reply((struct ib_mad_hdr *)smp);
-
-invalid:
-       smp->status |= IB_SMP_INVALID_FIELD;
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-/*
- * Verdicts produced by the port state transition checks.
- * set_port_states() flags the request with IB_SMP_INVALID_FIELD for
- * DISALLOWED and UNDEFINED, does nothing for IGNORED, and carries out
- * the state change for ALLOWED.
- */
-enum {
-       HFI_TRANSITION_DISALLOWED,
-       HFI_TRANSITION_IGNORED,
-       HFI_TRANSITION_ALLOWED,
-       HFI_TRANSITION_UNDEFINED,
-};
-
-/*
- * Use shortened names to improve readability of
- * {logical,physical}_state_transitions
- */
-enum {
-       __D = HFI_TRANSITION_DISALLOWED,
-       __I = HFI_TRANSITION_IGNORED,
-       __A = HFI_TRANSITION_ALLOWED,
-       __U = HFI_TRANSITION_UNDEFINED,
-};
-
-/*
- * IB_PORTPHYSSTATE_POLLING (2) through OPA_PORTPHYSSTATE_MAX (11) are
- * represented in physical_state_transitions.
- */
-#define __N_PHYSTATES (OPA_PORTPHYSSTATE_MAX - IB_PORTPHYSSTATE_POLLING + 1)
-
-/*
- * Within physical_state_transitions, rows represent "old" states,
- * columns "new" states, and physical_state_transitions.allowed[old][new]
- * indicates if the transition from old state to new state is legal (see
- * OPAg1v1, Table 6-4).
- *
- * Both indices are the state value minus IB_PORTPHYSSTATE_POLLING; the
- * numbers in the row and column labels below are the un-adjusted state
- * values.  physical_transition_allowed() performs the adjustment.
- */
-static const struct {
-       u8 allowed[__N_PHYSTATES][__N_PHYSTATES];
-} physical_state_transitions = {
-       {
-               /* 2    3    4    5    6    7    8    9   10   11 */
-       /* 2 */ { __A, __A, __D, __D, __D, __D, __D, __D, __D, __D },
-       /* 3 */ { __A, __I, __D, __D, __D, __D, __D, __D, __D, __A },
-       /* 4 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
-       /* 5 */ { __A, __A, __D, __I, __D, __D, __D, __D, __D, __D },
-       /* 6 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
-       /* 7 */ { __D, __A, __D, __D, __D, __I, __D, __D, __D, __D },
-       /* 8 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
-       /* 9 */ { __I, __A, __D, __D, __D, __D, __D, __I, __D, __D },
-       /*10 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
-       /*11 */ { __D, __A, __D, __D, __D, __D, __D, __D, __D, __I },
-       }
-};
-
-/*
- * IB_PORT_DOWN (1) through IB_PORT_ACTIVE_DEFER (5) are represented
- * in logical_state_transitions.
- */
-
-#define __N_LOGICAL_STATES (IB_PORT_ACTIVE_DEFER - IB_PORT_DOWN + 1)
-
-/*
- * Within logical_state_transitions rows represent "old" states,
- * columns "new" states, and logical_state_transitions.allowed[old][new]
- * indicates if the transition from old state to new state is legal (see
- * OPAg1v1, Table 9-12).
- *
- * Both indices are the state value minus IB_PORT_DOWN; the numbers in
- * the row and column labels below are the un-adjusted state values.
- * logical_transition_allowed() performs the adjustment.
- */
-static const struct {
-       u8 allowed[__N_LOGICAL_STATES][__N_LOGICAL_STATES];
-} logical_state_transitions = {
-       {
-               /* 1    2    3    4    5 */
-       /* 1 */ { __I, __D, __D, __D, __U},
-       /* 2 */ { __D, __I, __A, __D, __U},
-       /* 3 */ { __D, __D, __I, __A, __U},
-       /* 4 */ { __D, __D, __I, __I, __U},
-       /* 5 */ { __U, __U, __U, __U, __U},
-       }
-};
-
-/*
- * Look up the verdict for a logical port state change from @old to @new.
- *
- * Returns one of the HFI_TRANSITION_* values.  States outside
- * IB_PORT_NOP..IB_PORT_ACTIVE_DEFER yield HFI_TRANSITION_UNDEFINED (with
- * a warning); a @new of IB_PORT_NOP is always HFI_TRANSITION_ALLOWED.
- */
-static int logical_transition_allowed(int old, int new)
-{
-       int row, col;
-
-       if (!(old >= IB_PORT_NOP && old <= IB_PORT_ACTIVE_DEFER &&
-             new >= IB_PORT_NOP && new <= IB_PORT_ACTIVE_DEFER)) {
-               pr_warn("invalid logical state(s) (old %d new %d)\n",
-                       old, new);
-               return HFI_TRANSITION_UNDEFINED;
-       }
-
-       /* a request for "no change" is always acceptable */
-       if (new == IB_PORT_NOP)
-               return HFI_TRANSITION_ALLOWED;
-
-       /* the table starts at IB_PORT_DOWN, so rebase both states */
-       row = old - IB_PORT_DOWN;
-       col = new - IB_PORT_DOWN;
-       if (row < 0 || col < 0)
-               return HFI_TRANSITION_UNDEFINED;
-
-       return logical_state_transitions.allowed[row][col];
-}
-
-/*
- * Look up the verdict for a physical port state change from @old to @new.
- *
- * Returns one of the HFI_TRANSITION_* values.  States outside
- * IB_PORTPHYSSTATE_NOP..OPA_PORTPHYSSTATE_MAX yield
- * HFI_TRANSITION_UNDEFINED (with a warning); a @new of
- * IB_PORTPHYSSTATE_NOP is always HFI_TRANSITION_ALLOWED.
- */
-static int physical_transition_allowed(int old, int new)
-{
-       int row, col;
-
-       if (!(old >= IB_PORTPHYSSTATE_NOP && old <= OPA_PORTPHYSSTATE_MAX &&
-             new >= IB_PORTPHYSSTATE_NOP && new <= OPA_PORTPHYSSTATE_MAX)) {
-               pr_warn("invalid physical state(s) (old %d new %d)\n",
-                       old, new);
-               return HFI_TRANSITION_UNDEFINED;
-       }
-
-       /* a request for "no change" is always acceptable */
-       if (new == IB_PORTPHYSSTATE_NOP)
-               return HFI_TRANSITION_ALLOWED;
-
-       /* the table starts at IB_PORTPHYSSTATE_POLLING, so rebase both */
-       row = old - IB_PORTPHYSSTATE_POLLING;
-       col = new - IB_PORTPHYSSTATE_POLLING;
-       if (row < 0 || col < 0)
-               return HFI_TRANSITION_UNDEFINED;
-
-       return physical_state_transitions.allowed[row][col];
-}
-
-/*
- * Decide whether the port may move from its current logical/physical
- * state to @logical_new/@physical_new.
- *
- * The logical transition is checked first, then the physical one; the
- * first DISALLOWED or UNDEFINED verdict is returned (after a warning).
- * Otherwise the result is HFI_TRANSITION_IGNORED when both transitions
- * are ignored or when the physical request is Offline -> Polling, and
- * HFI_TRANSITION_ALLOWED in every other case.
- */
-static int port_states_transition_allowed(struct hfi1_pportdata *ppd,
-                                         u32 logical_new, u32 physical_new)
-{
-       u32 physical_old = driver_physical_state(ppd);
-       u32 logical_old = driver_logical_state(ppd);
-       int logical_allowed, physical_allowed;
-
-       logical_allowed = logical_transition_allowed(logical_old, logical_new);
-       switch (logical_allowed) {
-       case HFI_TRANSITION_DISALLOWED:
-       case HFI_TRANSITION_UNDEFINED:
-               pr_warn("invalid logical state transition %s -> %s\n",
-                       opa_lstate_name(logical_old),
-                       opa_lstate_name(logical_new));
-               return logical_allowed;
-       default:
-               break;
-       }
-
-       physical_allowed = physical_transition_allowed(physical_old,
-                                                      physical_new);
-       switch (physical_allowed) {
-       case HFI_TRANSITION_DISALLOWED:
-       case HFI_TRANSITION_UNDEFINED:
-               pr_warn("invalid physical state transition %s -> %s\n",
-                       opa_pstate_name(physical_old),
-                       opa_pstate_name(physical_new));
-               return physical_allowed;
-       default:
-               break;
-       }
-
-       /*
-        * Nothing to do when both requests are ignored, or when the
-        * Physical Port State is asked to go from 'Offline' to 'Polling'.
-        */
-       if ((logical_allowed == HFI_TRANSITION_IGNORED &&
-            physical_allowed == HFI_TRANSITION_IGNORED) ||
-           (physical_old == OPA_PORTPHYSSTATE_OFFLINE &&
-            physical_new == IB_PORTPHYSSTATE_POLLING))
-               return HFI_TRANSITION_IGNORED;
-
-       /* at least one of the two transitions is HFI_TRANSITION_ALLOWED */
-       return HFI_TRANSITION_ALLOWED;
-}
-
-/*
- * Apply the logical/physical port state requested by a
- * SubnSet(OPA_PortInfo).
- *
- * The requested pair is first checked with
- * port_states_transition_allowed(): a disallowed or undefined transition
- * sets IB_SMP_INVALID_FIELD in @smp, an ignored one changes nothing, and
- * both cases return 0.
- *
- * @suppress_idle_sma: when non-zero, the SMA_IDLE_ARM idle message is
- * not sent after a successful move to Armed.
- *
- * Returns 0, except when the link is being disabled and @smp has a
- * non-zero hop count: then IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED
- * is returned so that no reply is sent.
- */
-static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp,
-                          u32 logical_state, u32 phys_state,
-                          int suppress_idle_sma)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       u32 link_state;
-       int ret;
-
-       ret = port_states_transition_allowed(ppd, logical_state, phys_state);
-       if (ret == HFI_TRANSITION_DISALLOWED ||
-           ret == HFI_TRANSITION_UNDEFINED) {
-               /* error message emitted above */
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return 0;
-       }
-
-       if (ret == HFI_TRANSITION_IGNORED)
-               return 0;
-
-       /*
-        * A physical state request is only valid together with a logical
-        * state of Down or NOP.  Note that this only flags the SMP; the
-        * switch below still runs.
-        */
-       if ((phys_state != IB_PORTPHYSSTATE_NOP) &&
-           !(logical_state == IB_PORT_DOWN ||
-             logical_state == IB_PORT_NOP)){
-               pr_warn("SubnSet(OPA_PortInfo) port state invalid: logical_state 0x%x physical_state 0x%x\n",
-                       logical_state, phys_state);
-               smp->status |= IB_SMP_INVALID_FIELD;
-       }
-
-       /*
-        * Logical state changes are summarized in OPAv1g1 spec.,
-        * Table 9-12; physical state changes are summarized in
-        * OPAv1g1 spec., Table 6.4.
-        */
-       switch (logical_state) {
-       case IB_PORT_NOP:
-               /* NOP/NOP: nothing was requested */
-               if (phys_state == IB_PORTPHYSSTATE_NOP)
-                       break;
-               /* FALLTHROUGH */
-       case IB_PORT_DOWN:
-               /* map the requested physical state to a driver link state */
-               if (phys_state == IB_PORTPHYSSTATE_NOP) {
-                       link_state = HLS_DN_DOWNDEF;
-               } else if (phys_state == IB_PORTPHYSSTATE_POLLING) {
-                       link_state = HLS_DN_POLL;
-                       set_link_down_reason(ppd, OPA_LINKDOWN_REASON_FM_BOUNCE,
-                                            0, OPA_LINKDOWN_REASON_FM_BOUNCE);
-               } else if (phys_state == IB_PORTPHYSSTATE_DISABLED) {
-                       link_state = HLS_DN_DISABLE;
-               } else {
-                       pr_warn("SubnSet(OPA_PortInfo) invalid physical state 0x%x\n",
-                               phys_state);
-                       smp->status |= IB_SMP_INVALID_FIELD;
-                       break;
-               }
-
-               if ((link_state == HLS_DN_POLL ||
-                    link_state == HLS_DN_DOWNDEF)) {
-                       /*
-                        * Going to poll.  No matter what the current state,
-                        * always move offline first, then tune and start the
-                        * link.  This correctly handles a FM link bounce and
-                        * a link enable.  Going offline is a no-op if already
-                        * offline.
-                        */
-                       set_link_state(ppd, HLS_DN_OFFLINE);
-                       tune_serdes(ppd);
-                       start_link(ppd);
-               } else {
-                       set_link_state(ppd, link_state);
-               }
-               /*
-                * When disabling, record "SMA disabled" as the offline
-                * reason if no reason is set yet or the stored value is
-                * greater than the SMA_DISABLED one.
-                */
-               if (link_state == HLS_DN_DISABLE &&
-                   (ppd->offline_disabled_reason >
-                    HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED) ||
-                    ppd->offline_disabled_reason ==
-                    HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE)))
-                       ppd->offline_disabled_reason =
-                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED);
-               /*
-                * Don't send a reply if the response would be sent
-                * through the disabled port.
-                */
-               if (link_state == HLS_DN_DISABLE && smp->hop_cnt)
-                       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
-               break;
-       case IB_PORT_ARMED:
-               ret = set_link_state(ppd, HLS_UP_ARMED);
-               if ((ret == 0) && (suppress_idle_sma == 0))
-                       send_idle_sma(dd, SMA_IDLE_ARM);
-               break;
-       case IB_PORT_ACTIVE:
-               /* Active is refused until neighbor_normal is set */
-               if (ppd->neighbor_normal) {
-                       ret = set_link_state(ppd, HLS_UP_ACTIVE);
-                       if (ret == 0)
-                               send_idle_sma(dd, SMA_IDLE_ACTIVE);
-               } else {
-                       pr_warn("SubnSet(OPA_PortInfo) Cannot move to Active with NeighborNormal 0\n");
-                       smp->status |= IB_SMP_INVALID_FIELD;
-               }
-               break;
-       default:
-               pr_warn("SubnSet(OPA_PortInfo) invalid logical state 0x%x\n",
-                       logical_state);
-               smp->status |= IB_SMP_INVALID_FIELD;
-       }
-
-       return 0;
-}
-
-/**
- * __subn_set_opa_portinfo - set port information
- * @smp: the incoming SM packet
- * @am: attribute modifier; supplies the port count and the
- *      start-of-SM-config flag
- * @data: attribute payload, interpreted as a struct opa_port_info
- * @ibdev: the infiniband device
- * @port: the port on the device
- * @resp_len: handed to __subn_get_opa_portinfo() when building the reply
- *
- * Fields are applied one after the other; an invalid field sets
- * IB_SMP_INVALID_FIELD in @smp but, with the exceptions noted in the
- * body, does not stop the remaining fields from being processed.  The
- * port state change is performed last.
- *
- * Return: the result of building the Get(PortInfo) style reply, or the
- * non-zero value returned by set_port_states().
- */
-static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
-                                  u32 *resp_len)
-{
-       struct opa_port_info *pi = (struct opa_port_info *)data;
-       struct ib_event event;
-       struct hfi1_devdata *dd;
-       struct hfi1_pportdata *ppd;
-       struct hfi1_ibport *ibp;
-       u8 clientrereg;
-       unsigned long flags;
-       u32 smlid, opa_lid; /* tmp vars to hold LID values */
-       u16 lid;
-       u8 ls_old, ls_new, ps_new;
-       u8 vls;
-       u8 msl;
-       u8 crc_enabled;
-       u16 lse, lwe, mtu;
-       u32 num_ports = OPA_AM_NPORT(am);
-       u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
-       int ret, i, invalid = 0, call_set_mtu = 0;
-       int call_link_downgrade_policy = 0;
-
-       /* exactly one port must be addressed */
-       if (num_ports != 1) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       /* LIDs wider than 16 bits are rejected before anything is changed */
-       opa_lid = be32_to_cpu(pi->lid);
-       if (opa_lid & 0xFFFF0000) {
-               pr_warn("OPA_PortInfo lid out of range: %X\n", opa_lid);
-               smp->status |= IB_SMP_INVALID_FIELD;
-               goto get_only;
-       }
-
-       lid = (u16)(opa_lid & 0x0000FFFF);
-
-       smlid = be32_to_cpu(pi->sm_lid);
-       if (smlid & 0xFFFF0000) {
-               pr_warn("OPA_PortInfo SM lid out of range: %X\n", smlid);
-               smp->status |= IB_SMP_INVALID_FIELD;
-               goto get_only;
-       }
-       smlid &= 0x0000FFFF;
-
-       clientrereg = (pi->clientrereg_subnettimeout &
-                       OPA_PI_MASK_CLIENT_REREGISTER);
-
-       dd = dd_from_ibdev(ibdev);
-       /* IB numbers ports from 1, hw from 0 */
-       ppd = dd->pport + (port - 1);
-       ibp = &ppd->ibport_data;
-       event.device = ibdev;
-       event.element.port_num = port;
-
-       ls_old = driver_lstate(ppd);
-
-       ibp->rvp.mkey = pi->mkey;
-       ibp->rvp.gid_prefix = pi->subnet_prefix;
-       ibp->rvp.mkey_lease_period = be16_to_cpu(pi->mkey_lease_period);
-
-       /* Must be a valid unicast LID address. */
-       if ((lid == 0 && ls_old > IB_PORT_INIT) ||
-           lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               pr_warn("SubnSet(OPA_PortInfo) lid invalid 0x%x\n",
-                       lid);
-       } else if (ppd->lid != lid ||
-                ppd->lmc != (pi->mkeyprotect_lmc & OPA_PI_MASK_LMC)) {
-               if (ppd->lid != lid)
-                       hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LID_CHANGE_BIT);
-               if (ppd->lmc != (pi->mkeyprotect_lmc & OPA_PI_MASK_LMC))
-                       hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LMC_CHANGE_BIT);
-               hfi1_set_lid(ppd, lid, pi->mkeyprotect_lmc & OPA_PI_MASK_LMC);
-               event.event = IB_EVENT_LID_CHANGE;
-               ib_dispatch_event(&event);
-       }
-
-       msl = pi->smsl & OPA_PI_MASK_SMSL;
-       if (pi->partenforce_filterraw & OPA_PI_MASK_LINKINIT_REASON)
-               ppd->linkinit_reason =
-                       (pi->partenforce_filterraw &
-                        OPA_PI_MASK_LINKINIT_REASON);
-       /* enable/disable SW pkey checking as per FM control */
-       if (pi->partenforce_filterraw & OPA_PI_MASK_PARTITION_ENFORCE_IN)
-               ppd->part_enforce |= HFI1_PART_ENFORCE_IN;
-       else
-               ppd->part_enforce &= ~HFI1_PART_ENFORCE_IN;
-
-       if (pi->partenforce_filterraw & OPA_PI_MASK_PARTITION_ENFORCE_OUT)
-               ppd->part_enforce |= HFI1_PART_ENFORCE_OUT;
-       else
-               ppd->part_enforce &= ~HFI1_PART_ENFORCE_OUT;
-
-       /* Must be a valid unicast LID address. */
-       if ((smlid == 0 && ls_old > IB_PORT_INIT) ||
-           smlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               pr_warn("SubnSet(OPA_PortInfo) smlid invalid 0x%x\n", smlid);
-       } else if (smlid != ibp->rvp.sm_lid || msl != ibp->rvp.sm_sl) {
-               pr_warn("SubnSet(OPA_PortInfo) smlid 0x%x\n", smlid);
-               spin_lock_irqsave(&ibp->rvp.lock, flags);
-               if (ibp->rvp.sm_ah) {
-                       if (smlid != ibp->rvp.sm_lid)
-                               ibp->rvp.sm_ah->attr.dlid = smlid;
-                       if (msl != ibp->rvp.sm_sl)
-                               ibp->rvp.sm_ah->attr.sl = msl;
-               }
-               spin_unlock_irqrestore(&ibp->rvp.lock, flags);
-               if (smlid != ibp->rvp.sm_lid)
-                       ibp->rvp.sm_lid = smlid;
-               if (msl != ibp->rvp.sm_sl)
-                       ibp->rvp.sm_sl = msl;
-               event.event = IB_EVENT_SM_CHANGE;
-               ib_dispatch_event(&event);
-       }
-
-       /* writing 0 to a link down reason clears the stored reason */
-       if (pi->link_down_reason == 0) {
-               ppd->local_link_down_reason.sma = 0;
-               ppd->local_link_down_reason.latest = 0;
-       }
-
-       if (pi->neigh_link_down_reason == 0) {
-               ppd->neigh_link_down_reason.sma = 0;
-               ppd->neigh_link_down_reason.latest = 0;
-       }
-
-       ppd->sm_trap_qp = be32_to_cpu(pi->sm_trap_qp);
-       ppd->sa_qp = be32_to_cpu(pi->sa_qp);
-
-       ppd->port_error_action = be32_to_cpu(pi->port_error_action);
-       /* a link width enable of 0 leaves the current setting alone */
-       lwe = be16_to_cpu(pi->link_width.enabled);
-       if (lwe) {
-               if (lwe == OPA_LINK_WIDTH_RESET ||
-                   lwe == OPA_LINK_WIDTH_RESET_OLD)
-                       set_link_width_enabled(ppd, ppd->link_width_supported);
-               else if ((lwe & ~ppd->link_width_supported) == 0)
-                       set_link_width_enabled(ppd, lwe);
-               else
-                       smp->status |= IB_SMP_INVALID_FIELD;
-       }
-       lwe = be16_to_cpu(pi->link_width_downgrade.enabled);
-       /* LWD.E is always applied - 0 means "disabled" */
-       if (lwe == OPA_LINK_WIDTH_RESET ||
-           lwe == OPA_LINK_WIDTH_RESET_OLD) {
-               set_link_width_downgrade_enabled(ppd,
-                                                ppd->
-                                                link_width_downgrade_supported
-                                                );
-       } else if ((lwe & ~ppd->link_width_downgrade_supported) == 0) {
-               /* only set and apply if something changed */
-               if (lwe != ppd->link_width_downgrade_enabled) {
-                       set_link_width_downgrade_enabled(ppd, lwe);
-                       call_link_downgrade_policy = 1;
-               }
-       } else {
-               smp->status |= IB_SMP_INVALID_FIELD;
-       }
-       lse = be16_to_cpu(pi->link_speed.enabled);
-       if (lse) {
-               if (lse & be16_to_cpu(pi->link_speed.supported))
-                       set_link_speed_enabled(ppd, lse);
-               else
-                       smp->status |= IB_SMP_INVALID_FIELD;
-       }
-
-       ibp->rvp.mkeyprot =
-               (pi->mkeyprotect_lmc & OPA_PI_MASK_MKEY_PROT_BIT) >> 6;
-       ibp->rvp.vl_high_limit = be16_to_cpu(pi->vl.high_limit) & 0xFF;
-       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_VL_HIGH_LIMIT,
-                                   ibp->rvp.vl_high_limit);
-
-       /* make sure the MTU loop below stays inside both arrays */
-       if (ppd->vls_supported / 2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) ||
-           ppd->vls_supported > ARRAY_SIZE(dd->vld)) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-       for (i = 0; i < ppd->vls_supported; i++) {
-               /*
-                * Two VLs share one byte: even VLs use the high nibble,
-                * odd VLs the low nibble.
-                */
-               u8 mtu_enum = pi->neigh_mtu.pvlx_to_mtu[i / 2];
-
-               if ((i % 2) == 0)
-                       mtu_enum >>= 4;
-               mtu_enum &= 0xF;
-
-               mtu = enum_to_mtu(mtu_enum);
-               if (mtu == 0xffff) {
-                       /* report the enum of the VL that actually failed */
-                       pr_warn("SubnSet(OPA_PortInfo) mtu invalid %d (0x%x)\n",
-                               mtu, mtu_enum);
-                       smp->status |= IB_SMP_INVALID_FIELD;
-                       mtu = hfi1_max_mtu; /* use a valid MTU */
-               }
-               if (dd->vld[i].mtu != mtu) {
-                       dd_dev_info(dd,
-                                   "MTU change on vl %d from %d to %d\n",
-                                   i, dd->vld[i].mtu, mtu);
-                       dd->vld[i].mtu = mtu;
-                       call_set_mtu++;
-               }
-       }
-       /* As per OPAV1 spec: VL15 must support and be configured
-        * for operation with a 2048 or larger MTU.
-        */
-       mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[15 / 2] & 0xF);
-       if (mtu < 2048 || mtu == 0xffff)
-               mtu = 2048;
-       if (dd->vld[15].mtu != mtu) {
-               dd_dev_info(dd,
-                           "MTU change on vl 15 from %d to %d\n",
-                           dd->vld[15].mtu, mtu);
-               dd->vld[15].mtu = mtu;
-               call_set_mtu++;
-       }
-       if (call_set_mtu)
-               set_mtu(ppd);
-
-       /* Set operational VLs */
-       vls = pi->operational_vls & OPA_PI_MASK_OPERATIONAL_VL;
-       if (vls) {
-               if (vls > ppd->vls_supported) {
-                       pr_warn("SubnSet(OPA_PortInfo) VL's supported invalid %d\n",
-                               pi->operational_vls);
-                       smp->status |= IB_SMP_INVALID_FIELD;
-               } else {
-                       if (hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS,
-                                           vls) == -EINVAL)
-                               smp->status |= IB_SMP_INVALID_FIELD;
-               }
-       }
-
-       /* writing 0 to a violation counter resets it */
-       if (pi->mkey_violations == 0)
-               ibp->rvp.mkey_violations = 0;
-
-       if (pi->pkey_violations == 0)
-               ibp->rvp.pkey_violations = 0;
-
-       if (pi->qkey_violations == 0)
-               ibp->rvp.qkey_violations = 0;
-
-       ibp->rvp.subnet_timeout =
-               pi->clientrereg_subnettimeout & OPA_PI_MASK_SUBNET_TIMEOUT;
-
-       /* bits 7:4 of port_ltp_crc_mode select the enabled CRC modes */
-       crc_enabled = be16_to_cpu(pi->port_ltp_crc_mode);
-       crc_enabled >>= 4;
-       crc_enabled &= 0xf;
-
-       if (crc_enabled != 0)
-               ppd->port_crc_mode_enabled = port_ltp_to_cap(crc_enabled);
-
-       ppd->is_active_optimize_enabled =
-                       !!(be16_to_cpu(pi->port_mode)
-                                       & OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE);
-
-       ls_new = pi->port_states.portphysstate_portstate &
-                       OPA_PI_MASK_PORT_STATE;
-       ps_new = (pi->port_states.portphysstate_portstate &
-                       OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4;
-
-       /*
-        * While in Init, track whether the SM announced the start of its
-        * configuration.  A move to Armed without that announcement sets
-        * "invalid", which set_port_states() receives as its
-        * suppress_idle_sma argument.
-        */
-       if (ls_old == IB_PORT_INIT) {
-               if (start_of_sm_config) {
-                       if (ls_new == ls_old || (ls_new == IB_PORT_ARMED))
-                               ppd->is_sm_config_started = 1;
-               } else if (ls_new == IB_PORT_ARMED) {
-                       if (ppd->is_sm_config_started == 0)
-                               invalid = 1;
-               }
-       }
-
-       /* Handle CLIENT_REREGISTER event b/c SM asked us for it */
-       if (clientrereg) {
-               event.event = IB_EVENT_CLIENT_REREGISTER;
-               ib_dispatch_event(&event);
-       }
-
-       /*
-        * Do the port state change now that the other link parameters
-        * have been set.
-        * Changing the port physical state only makes sense if the link
-        * is down or is being set to down.
-        */
-
-       ret = set_port_states(ppd, smp, ls_new, ps_new, invalid);
-       if (ret)
-               return ret;
-
-       ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len);
-
-       /* restore re-reg bit per o14-12.2.1 */
-       pi->clientrereg_subnettimeout |= clientrereg;
-
-       /*
-        * Apply the new link downgrade policy.  This may result in a link
-        * bounce.  Do this after everything else so things are settled.
-        * Possible problem: if setting the port state above fails, then
-        * the policy change is not applied.
-        */
-       if (call_link_downgrade_policy)
-               apply_link_downgrade_policy(ppd, 0);
-
-       return ret;
-
-get_only:
-       return __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len);
-}
-
-/**
- * set_pkeys - set the PKEY table for ctxt 0
- * @dd: the hfi1_ib device
- * @port: the IB port number
- * @pkeys: the PKEY table
- *
- * The update is refused unless the new table contains LIM_MGMT_P_KEY.
- * When at least one slot changes, hfi1_set_ib_cfg() is called for
- * HFI1_IB_CFG_PKEYS and an IB_EVENT_PKEY_CHANGE event is dispatched.
- *
- * Return: 0 when the table was accepted, 1 when it lacks the
- * management pkey.
- */
-static int set_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys)
-{
-       /*
-        * IB port one/two always maps to context zero/one, always a
-        * kernel context, so no locking is needed.  Getting here with
-        * ppd set up means there is no need to check that rcd is valid.
-        */
-       struct hfi1_pportdata *ppd = &dd->pport[port - 1];
-       struct ib_event event;
-       bool found_mgmt = false;
-       bool changed = false;
-       int slot;
-
-       /* an update without the management pkey is not applied */
-       for (slot = 0; slot < ARRAY_SIZE(ppd->pkeys) && !found_mgmt; slot++)
-               found_mgmt = (pkeys[slot] == LIM_MGMT_P_KEY);
-       if (!found_mgmt)
-               return 1;
-
-       /*
-        * The SM hands over the complete PKey table, so every key has to
-        * end up in the slot with the matching index.
-        */
-       for (slot = 0; slot < ARRAY_SIZE(ppd->pkeys); slot++) {
-               if (ppd->pkeys[slot] == pkeys[slot])
-                       continue;
-               ppd->pkeys[slot] = pkeys[slot];
-               changed = true;
-       }
-       if (!changed)
-               return 0;
-
-       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0);
-
-       event.event = IB_EVENT_PKEY_CHANGE;
-       event.device = &dd->verbs_dev.rdi.ibdev;
-       event.element.port_num = port;
-       ib_dispatch_event(&event);
-       return 0;
-}
-
-/*
- * Handle a Set of the OPA PKey table.
- *
- * The attribute modifier @am carries the number of blocks sent and the
- * starting block.  After the block range is validated, the keys in @data
- * are converted from big-endian to host order in place.  A table that
- * starts at block 0 is handed to set_pkeys(); if that refuses the table
- * the SMP is flagged with IB_SMP_INVALID_FIELD.  On success the reply is
- * built by __subn_get_opa_pkeytable().
- */
-static int __subn_set_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data,
-                                   struct ib_device *ibdev, u8 port,
-                                   u32 *resp_len)
-{
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       u32 n_blocks_sent = OPA_AM_NBLK(am);
-       u32 start_block = am & 0x7ff;
-       /* p and q alias @data: the keys are byte-swapped in place */
-       u16 *p = (u16 *)data;
-       __be16 *q = (__be16 *)data;
-       int i;
-       u16 n_blocks_avail;
-       unsigned npkeys = hfi1_get_npkeys(dd);
-
-       if (n_blocks_sent == 0) {
-               /* this is the Set path, so say so in the log */
-               pr_warn("OPA Set PKey AM Invalid : P = %d; B = 0x%x; N = 0x%x\n",
-                       port, start_block, n_blocks_sent);
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       n_blocks_avail = (u16)(npkeys / OPA_PARTITION_TABLE_BLK_SIZE) + 1;
-
-       if (start_block + n_blocks_sent > n_blocks_avail ||
-           n_blocks_sent > OPA_NUM_PKEY_BLOCKS_PER_SMP) {
-               pr_warn("OPA Set PKey AM Invalid : s 0x%x; req 0x%x; avail 0x%x; blk/smp 0x%lx\n",
-                       start_block, n_blocks_sent, n_blocks_avail,
-                       OPA_NUM_PKEY_BLOCKS_PER_SMP);
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       for (i = 0; i < n_blocks_sent * OPA_PARTITION_TABLE_BLK_SIZE; i++)
-               p[i] = be16_to_cpu(q[i]);
-
-       /* only a table starting at block 0 is applied to the port */
-       if (start_block == 0 && set_pkeys(dd, port, p) != 0) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       return __subn_get_opa_pkeytable(smp, am, data, ibdev, port, resp_len);
-}
-
-static int get_sc2vlt_tables(struct hfi1_devdata *dd, void *data)
-{
-       u64 *val = data;
-
-       *val++ = read_csr(dd, SEND_SC2VLT0);
-       *val++ = read_csr(dd, SEND_SC2VLT1);
-       *val++ = read_csr(dd, SEND_SC2VLT2);
-       *val++ = read_csr(dd, SEND_SC2VLT3);
-       return 0;
-}
-
-#define ILLEGAL_VL 12
-/*
- * filter_sc2vlt changes mappings to VL15 to ILLEGAL_VL (except
- * for SC15, which must map to VL15). If we don't remap things this
- * way it is possible for VL15 counters to increment when we try to
- * send on a SC which is mapped to an invalid VL.
- */
-static void filter_sc2vlt(void *data)
-{
-       int i;
-       u8 *pd = data;
-
-       for (i = 0; i < OPA_MAX_SCS; i++) {
-               if (i == 15)
-                       continue;
-               if ((pd[i] & 0x1f) == 0xf)
-                       pd[i] = ILLEGAL_VL;
-       }
-}
-
-static int set_sc2vlt_tables(struct hfi1_devdata *dd, void *data)
-{
-       u64 *val = data;
-
-       filter_sc2vlt(data);
-
-       write_csr(dd, SEND_SC2VLT0, *val++);
-       write_csr(dd, SEND_SC2VLT1, *val++);
-       write_csr(dd, SEND_SC2VLT2, *val++);
-       write_csr(dd, SEND_SC2VLT3, *val++);
-       write_seqlock_irq(&dd->sc2vl_lock);
-       memcpy(dd->sc2vl, data, sizeof(dd->sc2vl));
-       write_sequnlock_irq(&dd->sc2vl_lock);
-       return 0;
-}
-
-static int __subn_get_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
-                                  u32 *resp_len)
-{
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       u8 *p = data;
-       size_t size = ARRAY_SIZE(ibp->sl_to_sc); /* == 32 */
-       unsigned i;
-
-       if (am) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       for (i = 0; i < ARRAY_SIZE(ibp->sl_to_sc); i++)
-               *p++ = ibp->sl_to_sc[i];
-
-       if (resp_len)
-               *resp_len += size;
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-static int __subn_set_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
-                                  u32 *resp_len)
-{
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       u8 *p = data;
-       int i;
-       u8 sc;
-
-       if (am) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       for (i = 0; i <  ARRAY_SIZE(ibp->sl_to_sc); i++) {
-               sc = *p++;
-               if (ibp->sl_to_sc[i] != sc) {
-                       ibp->sl_to_sc[i] = sc;
-
-                       /* Put all stale qps into error state */
-                       hfi1_error_port_qps(ibp, i);
-               }
-       }
-
-       return __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port, resp_len);
-}
-
-static int __subn_get_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
-                                  u32 *resp_len)
-{
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       u8 *p = data;
-       size_t size = ARRAY_SIZE(ibp->sc_to_sl); /* == 32 */
-       unsigned i;
-
-       if (am) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       for (i = 0; i < ARRAY_SIZE(ibp->sc_to_sl); i++)
-               *p++ = ibp->sc_to_sl[i];
-
-       if (resp_len)
-               *resp_len += size;
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-static int __subn_set_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
-                                  u32 *resp_len)
-{
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       u8 *p = data;
-       int i;
-
-       if (am) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       for (i = 0; i < ARRAY_SIZE(ibp->sc_to_sl); i++)
-               ibp->sc_to_sl[i] = *p++;
-
-       return __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port, resp_len);
-}
-
-static int __subn_get_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data,
-                                   struct ib_device *ibdev, u8 port,
-                                   u32 *resp_len)
-{
-       u32 n_blocks = OPA_AM_NBLK(am);
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       void *vp = (void *)data;
-       size_t size = 4 * sizeof(u64);
-
-       if (n_blocks != 1) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       get_sc2vlt_tables(dd, vp);
-
-       if (resp_len)
-               *resp_len += size;
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-static int __subn_set_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data,
-                                   struct ib_device *ibdev, u8 port,
-                                   u32 *resp_len)
-{
-       u32 n_blocks = OPA_AM_NBLK(am);
-       int async_update = OPA_AM_ASYNC(am);
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       void *vp = (void *)data;
-       struct hfi1_pportdata *ppd;
-       int lstate;
-
-       if (n_blocks != 1 || async_update) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       /* IB numbers ports from 1, hw from 0 */
-       ppd = dd->pport + (port - 1);
-       lstate = driver_lstate(ppd);
-       /*
-        * it's known that async_update is 0 by this point, but include
-        * the explicit check for clarity
-        */
-       if (!async_update &&
-           (lstate == IB_PORT_ARMED || lstate == IB_PORT_ACTIVE)) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       set_sc2vlt_tables(dd, vp);
-
-       return __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port, resp_len);
-}
-
-static int __subn_get_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data,
-                                    struct ib_device *ibdev, u8 port,
-                                    u32 *resp_len)
-{
-       u32 n_blocks = OPA_AM_NPORT(am);
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       struct hfi1_pportdata *ppd;
-       void *vp = (void *)data;
-       int size;
-
-       if (n_blocks != 1) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       ppd = dd->pport + (port - 1);
-
-       size = fm_get_table(ppd, FM_TBL_SC2VLNT, vp);
-
-       if (resp_len)
-               *resp_len += size;
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-static int __subn_set_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data,
-                                    struct ib_device *ibdev, u8 port,
-                                    u32 *resp_len)
-{
-       u32 n_blocks = OPA_AM_NPORT(am);
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       struct hfi1_pportdata *ppd;
-       void *vp = (void *)data;
-       int lstate;
-
-       if (n_blocks != 1) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       /* IB numbers ports from 1, hw from 0 */
-       ppd = dd->pport + (port - 1);
-       lstate = driver_lstate(ppd);
-       if (lstate == IB_PORT_ARMED || lstate == IB_PORT_ACTIVE) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       ppd = dd->pport + (port - 1);
-
-       fm_set_table(ppd, FM_TBL_SC2VLNT, vp);
-
-       return __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port,
-                                        resp_len);
-}
-
-static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
-                             struct ib_device *ibdev, u8 port,
-                             u32 *resp_len)
-{
-       u32 nports = OPA_AM_NPORT(am);
-       u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
-       u32 lstate;
-       struct hfi1_ibport *ibp;
-       struct hfi1_pportdata *ppd;
-       struct opa_port_state_info *psi = (struct opa_port_state_info *)data;
-
-       if (nports != 1) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       ibp = to_iport(ibdev, port);
-       ppd = ppd_from_ibp(ibp);
-
-       lstate = driver_lstate(ppd);
-
-       if (start_of_sm_config && (lstate == IB_PORT_INIT))
-               ppd->is_sm_config_started = 1;
-
-#if PI_LED_ENABLE_SUP
-       psi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4;
-       psi->port_states.ledenable_offlinereason |=
-               ppd->is_sm_config_started << 5;
-       psi->port_states.ledenable_offlinereason |=
-               ppd->offline_disabled_reason;
-#else
-       psi->port_states.offline_reason = ppd->neighbor_normal << 4;
-       psi->port_states.offline_reason |= ppd->is_sm_config_started << 5;
-       psi->port_states.offline_reason |= ppd->offline_disabled_reason;
-#endif /* PI_LED_ENABLE_SUP */
-
-       psi->port_states.portphysstate_portstate =
-               (hfi1_ibphys_portstate(ppd) << 4) | (lstate & 0xf);
-       psi->link_width_downgrade_tx_active =
-               cpu_to_be16(ppd->link_width_downgrade_tx_active);
-       psi->link_width_downgrade_rx_active =
-               cpu_to_be16(ppd->link_width_downgrade_rx_active);
-       if (resp_len)
-               *resp_len += sizeof(struct opa_port_state_info);
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-static int __subn_set_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
-                             struct ib_device *ibdev, u8 port,
-                             u32 *resp_len)
-{
-       u32 nports = OPA_AM_NPORT(am);
-       u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
-       u32 ls_old;
-       u8 ls_new, ps_new;
-       struct hfi1_ibport *ibp;
-       struct hfi1_pportdata *ppd;
-       struct opa_port_state_info *psi = (struct opa_port_state_info *)data;
-       int ret, invalid = 0;
-
-       if (nports != 1) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       ibp = to_iport(ibdev, port);
-       ppd = ppd_from_ibp(ibp);
-
-       ls_old = driver_lstate(ppd);
-
-       ls_new = port_states_to_logical_state(&psi->port_states);
-       ps_new = port_states_to_phys_state(&psi->port_states);
-
-       if (ls_old == IB_PORT_INIT) {
-               if (start_of_sm_config) {
-                       if (ls_new == ls_old || (ls_new == IB_PORT_ARMED))
-                               ppd->is_sm_config_started = 1;
-               } else if (ls_new == IB_PORT_ARMED) {
-                       if (ppd->is_sm_config_started == 0)
-                               invalid = 1;
-               }
-       }
-
-       ret = set_port_states(ppd, smp, ls_new, ps_new, invalid);
-       if (ret)
-               return ret;
-
-       if (invalid)
-               smp->status |= IB_SMP_INVALID_FIELD;
-
-       return __subn_get_opa_psi(smp, am, data, ibdev, port, resp_len);
-}
-
-static int __subn_get_opa_cable_info(struct opa_smp *smp, u32 am, u8 *data,
-                                    struct ib_device *ibdev, u8 port,
-                                    u32 *resp_len)
-{
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       u32 addr = OPA_AM_CI_ADDR(am);
-       u32 len = OPA_AM_CI_LEN(am) + 1;
-       int ret;
-
-#define __CI_PAGE_SIZE BIT(7) /* 128 bytes */
-#define __CI_PAGE_MASK ~(__CI_PAGE_SIZE - 1)
-#define __CI_PAGE_NUM(a) ((a) & __CI_PAGE_MASK)
-
-       /*
-        * check that addr is within spec, and
-        * addr and (addr + len - 1) are on the same "page"
-        */
-       if (addr >= 4096 ||
-           (__CI_PAGE_NUM(addr) != __CI_PAGE_NUM(addr + len - 1))) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       ret = get_cable_info(dd, port, addr, len, data);
-
-       if (ret == -ENODEV) {
-               smp->status |= IB_SMP_UNSUP_METH_ATTR;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       /* The address range for the CableInfo SMA query is wider than the
-        * memory available on the QSFP cable. We want to return a valid
-        * response, albeit zeroed out, for address ranges beyond available
-        * memory but that are within the CableInfo query spec
-        */
-       if (ret < 0 && ret != -ERANGE) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       if (resp_len)
-               *resp_len += len;
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-static int __subn_get_opa_bct(struct opa_smp *smp, u32 am, u8 *data,
-                             struct ib_device *ibdev, u8 port, u32 *resp_len)
-{
-       u32 num_ports = OPA_AM_NPORT(am);
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       struct hfi1_pportdata *ppd;
-       struct buffer_control *p = (struct buffer_control *)data;
-       int size;
-
-       if (num_ports != 1) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       ppd = dd->pport + (port - 1);
-       size = fm_get_table(ppd, FM_TBL_BUFFER_CONTROL, p);
-       trace_bct_get(dd, p);
-       if (resp_len)
-               *resp_len += size;
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-static int __subn_set_opa_bct(struct opa_smp *smp, u32 am, u8 *data,
-                             struct ib_device *ibdev, u8 port, u32 *resp_len)
-{
-       u32 num_ports = OPA_AM_NPORT(am);
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       struct hfi1_pportdata *ppd;
-       struct buffer_control *p = (struct buffer_control *)data;
-
-       if (num_ports != 1) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-       ppd = dd->pport + (port - 1);
-       trace_bct_set(dd, p);
-       if (fm_set_table(ppd, FM_TBL_BUFFER_CONTROL, p) < 0) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       return __subn_get_opa_bct(smp, am, data, ibdev, port, resp_len);
-}
-
-static int __subn_get_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data,
-                                struct ib_device *ibdev, u8 port,
-                                u32 *resp_len)
-{
-       struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
-       u32 num_ports = OPA_AM_NPORT(am);
-       u8 section = (am & 0x00ff0000) >> 16;
-       u8 *p = data;
-       int size = 0;
-
-       if (num_ports != 1) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       switch (section) {
-       case OPA_VLARB_LOW_ELEMENTS:
-               size = fm_get_table(ppd, FM_TBL_VL_LOW_ARB, p);
-               break;
-       case OPA_VLARB_HIGH_ELEMENTS:
-               size = fm_get_table(ppd, FM_TBL_VL_HIGH_ARB, p);
-               break;
-       case OPA_VLARB_PREEMPT_ELEMENTS:
-               size = fm_get_table(ppd, FM_TBL_VL_PREEMPT_ELEMS, p);
-               break;
-       case OPA_VLARB_PREEMPT_MATRIX:
-               size = fm_get_table(ppd, FM_TBL_VL_PREEMPT_MATRIX, p);
-               break;
-       default:
-               pr_warn("OPA SubnGet(VL Arb) AM Invalid : 0x%x\n",
-                       be32_to_cpu(smp->attr_mod));
-               smp->status |= IB_SMP_INVALID_FIELD;
-               break;
-       }
-
-       if (size > 0 && resp_len)
-               *resp_len += size;
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-static int __subn_set_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data,
-                                struct ib_device *ibdev, u8 port,
-                                u32 *resp_len)
-{
-       struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
-       u32 num_ports = OPA_AM_NPORT(am);
-       u8 section = (am & 0x00ff0000) >> 16;
-       u8 *p = data;
-
-       if (num_ports != 1) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       switch (section) {
-       case OPA_VLARB_LOW_ELEMENTS:
-               (void)fm_set_table(ppd, FM_TBL_VL_LOW_ARB, p);
-               break;
-       case OPA_VLARB_HIGH_ELEMENTS:
-               (void)fm_set_table(ppd, FM_TBL_VL_HIGH_ARB, p);
-               break;
-       /*
-        * neither OPA_VLARB_PREEMPT_ELEMENTS, or OPA_VLARB_PREEMPT_MATRIX
-        * can be changed from the default values
-        */
-       case OPA_VLARB_PREEMPT_ELEMENTS:
-               /* FALLTHROUGH */
-       case OPA_VLARB_PREEMPT_MATRIX:
-               smp->status |= IB_SMP_UNSUP_METH_ATTR;
-               break;
-       default:
-               pr_warn("OPA SubnSet(VL Arb) AM Invalid : 0x%x\n",
-                       be32_to_cpu(smp->attr_mod));
-               smp->status |= IB_SMP_INVALID_FIELD;
-               break;
-       }
-
-       return __subn_get_opa_vl_arb(smp, am, data, ibdev, port, resp_len);
-}
-
-struct opa_pma_mad {
-       struct ib_mad_hdr mad_hdr;
-       u8 data[2024];
-} __packed;
-
-struct opa_class_port_info {
-       u8 base_version;
-       u8 class_version;
-       __be16 cap_mask;
-       __be32 cap_mask2_resp_time;
-
-       u8 redirect_gid[16];
-       __be32 redirect_tc_fl;
-       __be32 redirect_lid;
-       __be32 redirect_sl_qp;
-       __be32 redirect_qkey;
-
-       u8 trap_gid[16];
-       __be32 trap_tc_fl;
-       __be32 trap_lid;
-       __be32 trap_hl_qp;
-       __be32 trap_qkey;
-
-       __be16 trap_pkey;
-       __be16 redirect_pkey;
-
-       u8 trap_sl_rsvd;
-       u8 reserved[3];
-} __packed;
-
-struct opa_port_status_req {
-       __u8 port_num;
-       __u8 reserved[3];
-       __be32 vl_select_mask;
-};
-
-#define VL_MASK_ALL            0x000080ff
-
-struct opa_port_status_rsp {
-       __u8 port_num;
-       __u8 reserved[3];
-       __be32  vl_select_mask;
-
-       /* Data counters */
-       __be64 port_xmit_data;
-       __be64 port_rcv_data;
-       __be64 port_xmit_pkts;
-       __be64 port_rcv_pkts;
-       __be64 port_multicast_xmit_pkts;
-       __be64 port_multicast_rcv_pkts;
-       __be64 port_xmit_wait;
-       __be64 sw_port_congestion;
-       __be64 port_rcv_fecn;
-       __be64 port_rcv_becn;
-       __be64 port_xmit_time_cong;
-       __be64 port_xmit_wasted_bw;
-       __be64 port_xmit_wait_data;
-       __be64 port_rcv_bubble;
-       __be64 port_mark_fecn;
-       /* Error counters */
-       __be64 port_rcv_constraint_errors;
-       __be64 port_rcv_switch_relay_errors;
-       __be64 port_xmit_discards;
-       __be64 port_xmit_constraint_errors;
-       __be64 port_rcv_remote_physical_errors;
-       __be64 local_link_integrity_errors;
-       __be64 port_rcv_errors;
-       __be64 excessive_buffer_overruns;
-       __be64 fm_config_errors;
-       __be32 link_error_recovery;
-       __be32 link_downed;
-       u8 uncorrectable_errors;
-
-       u8 link_quality_indicator; /* 5res, 3bit */
-       u8 res2[6];
-       struct _vls_pctrs {
-               /* per-VL Data counters */
-               __be64 port_vl_xmit_data;
-               __be64 port_vl_rcv_data;
-               __be64 port_vl_xmit_pkts;
-               __be64 port_vl_rcv_pkts;
-               __be64 port_vl_xmit_wait;
-               __be64 sw_port_vl_congestion;
-               __be64 port_vl_rcv_fecn;
-               __be64 port_vl_rcv_becn;
-               __be64 port_xmit_time_cong;
-               __be64 port_vl_xmit_wasted_bw;
-               __be64 port_vl_xmit_wait_data;
-               __be64 port_vl_rcv_bubble;
-               __be64 port_vl_mark_fecn;
-               __be64 port_vl_xmit_discards;
-       } vls[0]; /* real array size defined by # bits set in vl_select_mask */
-};
-
-enum counter_selects {
-       CS_PORT_XMIT_DATA                       = (1 << 31),
-       CS_PORT_RCV_DATA                        = (1 << 30),
-       CS_PORT_XMIT_PKTS                       = (1 << 29),
-       CS_PORT_RCV_PKTS                        = (1 << 28),
-       CS_PORT_MCAST_XMIT_PKTS                 = (1 << 27),
-       CS_PORT_MCAST_RCV_PKTS                  = (1 << 26),
-       CS_PORT_XMIT_WAIT                       = (1 << 25),
-       CS_SW_PORT_CONGESTION                   = (1 << 24),
-       CS_PORT_RCV_FECN                        = (1 << 23),
-       CS_PORT_RCV_BECN                        = (1 << 22),
-       CS_PORT_XMIT_TIME_CONG                  = (1 << 21),
-       CS_PORT_XMIT_WASTED_BW                  = (1 << 20),
-       CS_PORT_XMIT_WAIT_DATA                  = (1 << 19),
-       CS_PORT_RCV_BUBBLE                      = (1 << 18),
-       CS_PORT_MARK_FECN                       = (1 << 17),
-       CS_PORT_RCV_CONSTRAINT_ERRORS           = (1 << 16),
-       CS_PORT_RCV_SWITCH_RELAY_ERRORS         = (1 << 15),
-       CS_PORT_XMIT_DISCARDS                   = (1 << 14),
-       CS_PORT_XMIT_CONSTRAINT_ERRORS          = (1 << 13),
-       CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS      = (1 << 12),
-       CS_LOCAL_LINK_INTEGRITY_ERRORS          = (1 << 11),
-       CS_PORT_RCV_ERRORS                      = (1 << 10),
-       CS_EXCESSIVE_BUFFER_OVERRUNS            = (1 << 9),
-       CS_FM_CONFIG_ERRORS                     = (1 << 8),
-       CS_LINK_ERROR_RECOVERY                  = (1 << 7),
-       CS_LINK_DOWNED                          = (1 << 6),
-       CS_UNCORRECTABLE_ERRORS                 = (1 << 5),
-};
-
-struct opa_clear_port_status {
-       __be64 port_select_mask[4];
-       __be32 counter_select_mask;
-};
-
-struct opa_aggregate {
-       __be16 attr_id;
-       __be16 err_reqlength;   /* 1 bit, 8 res, 7 bit */
-       __be32 attr_mod;
-       u8 data[0];
-};
-
-#define MSK_LLI 0x000000f0
-#define MSK_LLI_SFT 4
-#define MSK_LER 0x0000000f
-#define MSK_LER_SFT 0
-#define ADD_LLI 8
-#define ADD_LER 2
-
-/* Request contains first three fields, response contains those plus the rest */
-struct opa_port_data_counters_msg {
-       __be64 port_select_mask[4];
-       __be32 vl_select_mask;
-       __be32 resolution;
-
-       /* Response fields follow */
-       struct _port_dctrs {
-               u8 port_number;
-               u8 reserved2[3];
-               __be32 link_quality_indicator; /* 29res, 3bit */
-
-               /* Data counters */
-               __be64 port_xmit_data;
-               __be64 port_rcv_data;
-               __be64 port_xmit_pkts;
-               __be64 port_rcv_pkts;
-               __be64 port_multicast_xmit_pkts;
-               __be64 port_multicast_rcv_pkts;
-               __be64 port_xmit_wait;
-               __be64 sw_port_congestion;
-               __be64 port_rcv_fecn;
-               __be64 port_rcv_becn;
-               __be64 port_xmit_time_cong;
-               __be64 port_xmit_wasted_bw;
-               __be64 port_xmit_wait_data;
-               __be64 port_rcv_bubble;
-               __be64 port_mark_fecn;
-
-               __be64 port_error_counter_summary;
-               /* Sum of error counts/port */
-
-               struct _vls_dctrs {
-                       /* per-VL Data counters */
-                       __be64 port_vl_xmit_data;
-                       __be64 port_vl_rcv_data;
-                       __be64 port_vl_xmit_pkts;
-                       __be64 port_vl_rcv_pkts;
-                       __be64 port_vl_xmit_wait;
-                       __be64 sw_port_vl_congestion;
-                       __be64 port_vl_rcv_fecn;
-                       __be64 port_vl_rcv_becn;
-                       __be64 port_xmit_time_cong;
-                       __be64 port_vl_xmit_wasted_bw;
-                       __be64 port_vl_xmit_wait_data;
-                       __be64 port_vl_rcv_bubble;
-                       __be64 port_vl_mark_fecn;
-               } vls[0];
-               /* array size defined by #bits set in vl_select_mask*/
-       } port[1]; /* array size defined by  #ports in attribute modifier */
-};
-
-struct opa_port_error_counters64_msg {
-       /*
-        * Request contains first two fields, response contains the
-        * whole magilla
-        */
-       __be64 port_select_mask[4];
-       __be32 vl_select_mask;
-
-       /* Response-only fields follow */
-       __be32 reserved1;
-       struct _port_ectrs {
-               u8 port_number;
-               u8 reserved2[7];
-               __be64 port_rcv_constraint_errors;
-               __be64 port_rcv_switch_relay_errors;
-               __be64 port_xmit_discards;
-               __be64 port_xmit_constraint_errors;
-               __be64 port_rcv_remote_physical_errors;
-               __be64 local_link_integrity_errors;
-               __be64 port_rcv_errors;
-               __be64 excessive_buffer_overruns;
-               __be64 fm_config_errors;
-               __be32 link_error_recovery;
-               __be32 link_downed;
-               u8 uncorrectable_errors;
-               u8 reserved3[7];
-               struct _vls_ectrs {
-                       __be64 port_vl_xmit_discards;
-               } vls[0];
-               /* array size defined by #bits set in vl_select_mask */
-       } port[1]; /* array size defined by #ports in attribute modifier */
-};
-
-struct opa_port_error_info_msg {
-       __be64 port_select_mask[4];
-       __be32 error_info_select_mask;
-       __be32 reserved1;
-       struct _port_ei {
-               u8 port_number;
-               u8 reserved2[7];
-
-               /* PortRcvErrorInfo */
-               struct {
-                       u8 status_and_code;
-                       union {
-                               u8 raw[17];
-                               struct {
-                                       /* EI1to12 format */
-                                       u8 packet_flit1[8];
-                                       u8 packet_flit2[8];
-                                       u8 remaining_flit_bits12;
-                               } ei1to12;
-                               struct {
-                                       u8 packet_bytes[8];
-                                       u8 remaining_flit_bits;
-                               } ei13;
-                       } ei;
-                       u8 reserved3[6];
-               } __packed port_rcv_ei;
-
-               /* ExcessiveBufferOverrunInfo */
-               struct {
-                       u8 status_and_sc;
-                       u8 reserved4[7];
-               } __packed excessive_buffer_overrun_ei;
-
-               /* PortXmitConstraintErrorInfo */
-               struct {
-                       u8 status;
-                       u8 reserved5;
-                       __be16 pkey;
-                       __be32 slid;
-               } __packed port_xmit_constraint_ei;
-
-               /* PortRcvConstraintErrorInfo */
-               struct {
-                       u8 status;
-                       u8 reserved6;
-                       __be16 pkey;
-                       __be32 slid;
-               } __packed port_rcv_constraint_ei;
-
-               /* PortRcvSwitchRelayErrorInfo */
-               struct {
-                       u8 status_and_code;
-                       u8 reserved7[3];
-                       __u32 error_info;
-               } __packed port_rcv_switch_relay_ei;
-
-               /* UncorrectableErrorInfo */
-               struct {
-                       u8 status_and_code;
-                       u8 reserved8;
-               } __packed uncorrectable_ei;
-
-               /* FMConfigErrorInfo */
-               struct {
-                       u8 status_and_code;
-                       u8 error_info;
-               } __packed fm_config_ei;
-               __u32 reserved9;
-       } port[1]; /* actual array size defined by #ports in attr modifier */
-};
-
-/* opa_port_error_info_msg error_info_select_mask bit definitions */
-enum error_info_selects {
-       ES_PORT_RCV_ERROR_INFO                  = (1 << 31),
-       ES_EXCESSIVE_BUFFER_OVERRUN_INFO        = (1 << 30),
-       ES_PORT_XMIT_CONSTRAINT_ERROR_INFO      = (1 << 29),
-       ES_PORT_RCV_CONSTRAINT_ERROR_INFO       = (1 << 28),
-       ES_PORT_RCV_SWITCH_RELAY_ERROR_INFO     = (1 << 27),
-       ES_UNCORRECTABLE_ERROR_INFO             = (1 << 26),
-       ES_FM_CONFIG_ERROR_INFO                 = (1 << 25)
-};
-
-static int pma_get_opa_classportinfo(struct opa_pma_mad *pmp,
-                                    struct ib_device *ibdev, u32 *resp_len)
-{
-       struct opa_class_port_info *p =
-               (struct opa_class_port_info *)pmp->data;
-
-       memset(pmp->data, 0, sizeof(pmp->data));
-
-       if (pmp->mad_hdr.attr_mod != 0)
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-
-       p->base_version = OPA_MGMT_BASE_VERSION;
-       p->class_version = OPA_SMI_CLASS_VERSION;
-       /*
-        * Expected response time is 4.096 usec. * 2^18 == 1.073741824 sec.
-        */
-       p->cap_mask2_resp_time = cpu_to_be32(18);
-
-       if (resp_len)
-               *resp_len += sizeof(*p);
-
-       return reply((struct ib_mad_hdr *)pmp);
-}
-
-static void a0_portstatus(struct hfi1_pportdata *ppd,
-                         struct opa_port_status_rsp *rsp, u32 vl_select_mask)
-{
-       if (!is_bx(ppd->dd)) {
-               unsigned long vl;
-               u64 sum_vl_xmit_wait = 0;
-               u32 vl_all_mask = VL_MASK_ALL;
-
-               for_each_set_bit(vl, (unsigned long *)&(vl_all_mask),
-                                8 * sizeof(vl_all_mask)) {
-                       u64 tmp = sum_vl_xmit_wait +
-                                 read_port_cntr(ppd, C_TX_WAIT_VL,
-                                                idx_from_vl(vl));
-                       if (tmp < sum_vl_xmit_wait) {
-                               /* we wrapped */
-                               sum_vl_xmit_wait = (u64)~0;
-                               break;
-                       }
-                       sum_vl_xmit_wait = tmp;
-               }
-               if (be64_to_cpu(rsp->port_xmit_wait) > sum_vl_xmit_wait)
-                       rsp->port_xmit_wait = cpu_to_be64(sum_vl_xmit_wait);
-       }
-}
-
-/*
- * pma_get_opa_portstatus - build the reply to a PortStatus query
- * @pmp: PMA MAD; the request is parsed from pmp->data and the response is
- *       written over the same buffer
- * @ibdev: device the MAD arrived on
- * @port: port the MAD arrived on
- * @resp_len: if non-NULL, incremented by the size of the response payload
- *
- * The request is answered with OPA_PM_STATUS_REQUEST_TOO_LARGE when the
- * response would not fit in pmp->data, and with IB_SMP_INVALID_FIELD when
- * the attribute modifier does not select exactly one port, the requested
- * port number is neither 0 nor @port, or the VL select mask names VLs
- * outside VL_MASK_ALL.
- *
- * Return: whatever reply() returns for the MAD header.
- */
-static int pma_get_opa_portstatus(struct opa_pma_mad *pmp,
-                                 struct ib_device *ibdev,
-                                 u8 port, u32 *resp_len)
-{
-       struct opa_port_status_req *req =
-               (struct opa_port_status_req *)pmp->data;
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       struct opa_port_status_rsp *rsp;
-       u32 vl_select_mask = be32_to_cpu(req->vl_select_mask);
-       /*
-        * for_each_set_bit() walks unsigned longs; give it a real one
-        * instead of the address of a u32 cast to unsigned long *.
-        */
-       unsigned long vl_bitmap = vl_select_mask;
-       unsigned long vl;
-       size_t response_data_size;
-       u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
-       u8 port_num = req->port_num;
-       u8 num_vls = hweight32(vl_select_mask);
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       int vfi;
-       u64 tmp, tmp2;
-
-       response_data_size = sizeof(struct opa_port_status_rsp) +
-                               num_vls * sizeof(struct _vls_pctrs);
-       if (response_data_size > sizeof(pmp->data)) {
-               pmp->mad_hdr.status |= OPA_PM_STATUS_REQUEST_TOO_LARGE;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-
-       if (nports != 1 || (port_num && port_num != port) ||
-           num_vls > OPA_MAX_VLS || (vl_select_mask & ~VL_MASK_ALL)) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-
-       /*
-        * Everything needed from the request has been copied out above;
-        * clear the whole payload (fixed part and per-VL entries alike)
-        * before writing the response over it.
-        */
-       memset(pmp->data, 0, sizeof(pmp->data));
-
-       rsp = (struct opa_port_status_rsp *)pmp->data;
-       if (port_num)
-               rsp->port_num = port_num;
-       else
-               rsp->port_num = port;
-
-       rsp->port_rcv_constraint_errors =
-               cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR,
-                                          CNTR_INVALID_VL));
-
-       hfi1_read_link_quality(dd, &rsp->link_quality_indicator);
-
-       rsp->vl_select_mask = cpu_to_be32(vl_select_mask);
-       rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS,
-                                         CNTR_INVALID_VL));
-       rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS,
-                                        CNTR_INVALID_VL));
-       rsp->port_xmit_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_PKTS,
-                                         CNTR_INVALID_VL));
-       rsp->port_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_PKTS,
-                                        CNTR_INVALID_VL));
-       rsp->port_multicast_xmit_pkts =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS,
-                                         CNTR_INVALID_VL));
-       rsp->port_multicast_rcv_pkts =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS,
-                                         CNTR_INVALID_VL));
-       rsp->port_xmit_wait =
-               cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL));
-       rsp->port_rcv_fecn =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL));
-       rsp->port_rcv_becn =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL));
-       rsp->port_xmit_discards =
-               cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD,
-                                          CNTR_INVALID_VL));
-       rsp->port_xmit_constraint_errors =
-               cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR,
-                                          CNTR_INVALID_VL));
-       rsp->port_rcv_remote_physical_errors =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
-                                         CNTR_INVALID_VL));
-
-       /* local link integrity = RX replays + TX replays, pegged on wrap */
-       tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL);
-       tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL);
-       if (tmp2 < tmp) {
-               /* overflow/wrapped */
-               rsp->local_link_integrity_errors = cpu_to_be64(~0);
-       } else {
-               rsp->local_link_integrity_errors = cpu_to_be64(tmp2);
-       }
-
-       /* link error recovery is a 32-bit field: peg if the sum won't fit */
-       tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
-       tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
-                                  CNTR_INVALID_VL);
-       if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) {
-               /* overflow/wrapped */
-               rsp->link_error_recovery = cpu_to_be32(~0);
-       } else {
-               rsp->link_error_recovery = cpu_to_be32(tmp2);
-       }
-       rsp->port_rcv_errors =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL));
-       rsp->excessive_buffer_overruns =
-               cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL));
-       rsp->fm_config_errors =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR,
-                                         CNTR_INVALID_VL));
-       rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN,
-                                                     CNTR_INVALID_VL));
-
-       /* rsp->uncorrectable_errors is 8 bits wide, and it pegs at 0xff */
-       tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
-       rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff;
-
-       vfi = 0;
-       /* The vl_select_mask has been checked above, and we know
-        * that it contains only entries which represent valid VLs.
-        * So in the for_each_set_bit() loop below, we don't need
-        * any additional checks for vl.  The per-VL entries were
-        * already zeroed by the memset of pmp->data.
-        */
-       for_each_set_bit(vl, &vl_bitmap, 8 * sizeof(vl_select_mask)) {
-               tmp = read_dev_cntr(dd, C_DC_RX_FLIT_VL, idx_from_vl(vl));
-               rsp->vls[vfi].port_vl_rcv_data = cpu_to_be64(tmp);
-
-               rsp->vls[vfi].port_vl_rcv_pkts =
-                       cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL,
-                                                 idx_from_vl(vl)));
-
-               rsp->vls[vfi].port_vl_xmit_data =
-                       cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL,
-                                                  idx_from_vl(vl)));
-
-               rsp->vls[vfi].port_vl_xmit_pkts =
-                       cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL,
-                                                  idx_from_vl(vl)));
-
-               rsp->vls[vfi].port_vl_xmit_wait =
-                       cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT_VL,
-                                                  idx_from_vl(vl)));
-
-               rsp->vls[vfi].port_vl_rcv_fecn =
-                       cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL,
-                                                 idx_from_vl(vl)));
-
-               rsp->vls[vfi].port_vl_rcv_becn =
-                       cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL,
-                                                 idx_from_vl(vl)));
-
-               vfi++;
-       }
-
-       a0_portstatus(ppd, rsp, vl_select_mask);
-
-       if (resp_len)
-               *resp_len += response_data_size;
-
-       return reply((struct ib_mad_hdr *)pmp);
-}
-
-/*
- * get_error_counter_summary - add up the port's error counters
- * @ibdev: device to read device-level counters from
- * @port: port to read port-level counters from
- * @res_lli: right shift applied to the local-link-integrity sum
- * @res_ler: right shift applied to the link-error-recovery sum
- *
- * Return: the plain (non-saturating) u64 sum of the counters read below.
- */
-static u64 get_error_counter_summary(struct ib_device *ibdev, u8 port,
-                                    u8 res_lli, u8 res_ler)
-{
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
-       u64 total;
-       u64 scratch;
-
-       total = read_port_cntr(ppd, C_SW_RCV_CSTR_ERR, CNTR_INVALID_VL);
-       /* port_rcv_switch_relay_errors is 0 for HFIs */
-       total += read_port_cntr(ppd, C_SW_XMIT_DSCD, CNTR_INVALID_VL);
-       total += read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, CNTR_INVALID_VL);
-       total += read_dev_cntr(dd, C_DC_RMT_PHY_ERR, CNTR_INVALID_VL);
-
-       /* local link integrity must be right-shifted by the lli resolution */
-       scratch = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL);
-       scratch += read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL);
-       total += scratch >> res_lli;
-
-       /* link error recovery must be right-shifted by the ler resolution */
-       scratch = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
-       scratch += read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
-                                CNTR_INVALID_VL);
-       total += scratch >> res_ler;
-
-       total += read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL);
-       total += read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL);
-       total += read_dev_cntr(dd, C_DC_FM_CFG_ERR, CNTR_INVALID_VL);
-       /* ppd->link_downed is a 32-bit value */
-       total += read_port_cntr(ppd, C_SW_LINK_DOWN, CNTR_INVALID_VL);
-
-       /* this is an 8-bit quantity: anything 0x100 or above counts as 0xff */
-       scratch = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
-       if (scratch < 0x100)
-               total += scratch & 0xff;
-       else
-               total += 0xff;
-
-       return total;
-}
-
-/*
- * a0_datacounters - cap the reported PortXmitWait by the per-VL wait sum
- * @ppd: port whose per-VL C_TX_WAIT_VL counters are read
- * @rsp: data-counters response entry; port_xmit_wait may be lowered
- * @vl_select_mask: not used by this function
- *
- * Does nothing when is_bx() is true for the device.  Otherwise C_TX_WAIT_VL
- * is summed over every VL in VL_MASK_ALL (the sum pegs at the largest u64 if
- * it wraps) and, when rsp->port_xmit_wait is larger than that sum, it is
- * replaced by the sum.
- */
-static void a0_datacounters(struct hfi1_pportdata *ppd, struct _port_dctrs *rsp,
-                           u32 vl_select_mask)
-{
-       /*
-        * The bitmap helpers operate on arrays of unsigned long, so keep the
-        * mask in an unsigned long rather than casting the address of a u32
-        * (which lets them read beyond the 4-byte object on 64-bit builds).
-        */
-       unsigned long vl_all_mask = VL_MASK_ALL;
-       unsigned long vl;
-       u64 sum_vl_xmit_wait = 0;
-
-       if (is_bx(ppd->dd))
-               return;
-
-       /* VL_MASK_ALL is a 32-bit mask, so only the low 32 bits matter */
-       for_each_set_bit(vl, &vl_all_mask, 8 * sizeof(u32)) {
-               u64 tmp = sum_vl_xmit_wait +
-                         read_port_cntr(ppd, C_TX_WAIT_VL, idx_from_vl(vl));
-
-               if (tmp < sum_vl_xmit_wait) {
-                       /* we wrapped */
-                       sum_vl_xmit_wait = (u64)~0;
-                       break;
-               }
-               sum_vl_xmit_wait = tmp;
-       }
-
-       if (be64_to_cpu(rsp->port_xmit_wait) > sum_vl_xmit_wait)
-               rsp->port_xmit_wait = cpu_to_be64(sum_vl_xmit_wait);
-}
-
-/*
- * pma_get_opa_port_dctrs - fill the six port-wide data counters
- * @ibdev: device whose counters are read
- * @rsp: destination; only the xmit/rcv data, packet and multicast packet
- *       fields are written, each stored big-endian
- */
-static void pma_get_opa_port_dctrs(struct ib_device *ibdev,
-                                  struct _port_dctrs *rsp)
-{
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       u64 val;
-
-       val = read_dev_cntr(dd, C_DC_XMIT_FLITS, CNTR_INVALID_VL);
-       rsp->port_xmit_data = cpu_to_be64(val);
-
-       val = read_dev_cntr(dd, C_DC_RCV_FLITS, CNTR_INVALID_VL);
-       rsp->port_rcv_data = cpu_to_be64(val);
-
-       val = read_dev_cntr(dd, C_DC_XMIT_PKTS, CNTR_INVALID_VL);
-       rsp->port_xmit_pkts = cpu_to_be64(val);
-
-       val = read_dev_cntr(dd, C_DC_RCV_PKTS, CNTR_INVALID_VL);
-       rsp->port_rcv_pkts = cpu_to_be64(val);
-
-       val = read_dev_cntr(dd, C_DC_MC_XMIT_PKTS, CNTR_INVALID_VL);
-       rsp->port_multicast_xmit_pkts = cpu_to_be64(val);
-
-       val = read_dev_cntr(dd, C_DC_MC_RCV_PKTS, CNTR_INVALID_VL);
-       rsp->port_multicast_rcv_pkts = cpu_to_be64(val);
-}
-
-/*
- * pma_get_opa_datacounters - build the reply to a DataPortCounters query
- * @pmp: PMA MAD; the request is parsed from pmp->data and the response is
- *       written over the same buffer
- * @ibdev: device the MAD arrived on
- * @port: port the MAD arrived on
- * @resp_len: if non-NULL, incremented by the size of the response payload
- *
- * IB_SMP_INVALID_FIELD is set in the MAD status when the attribute modifier
- * does not select exactly one port, the VL select mask names VLs outside
- * VL_MASK_ALL, the response would not fit in pmp->data, or the lowest bit
- * set in port_select_mask[3] is not @port.
- *
- * Return: whatever reply() returns for the MAD header.
- */
-static int pma_get_opa_datacounters(struct opa_pma_mad *pmp,
-                                   struct ib_device *ibdev,
-                                   u8 port, u32 *resp_len)
-{
-       struct opa_port_data_counters_msg *req =
-               (struct opa_port_data_counters_msg *)pmp->data;
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       struct _port_dctrs *rsp;
-       struct _vls_dctrs *vlinfo;
-       size_t response_data_size;
-       u32 num_ports;
-       u8 lq, num_vls;
-       u8 res_lli, res_ler;
-       u64 port_mask;
-       unsigned long port_num;
-       unsigned long vl;
-       unsigned long vl_bitmap;
-       u32 vl_select_mask;
-
-       num_ports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
-       num_vls = hweight32(be32_to_cpu(req->vl_select_mask));
-       vl_select_mask = be32_to_cpu(req->vl_select_mask);
-       res_lli = (u8)(be32_to_cpu(req->resolution) & MSK_LLI) >> MSK_LLI_SFT;
-       res_lli = res_lli ? res_lli + ADD_LLI : 0;
-       res_ler = (u8)(be32_to_cpu(req->resolution) & MSK_LER) >> MSK_LER_SFT;
-       res_ler = res_ler ? res_ler + ADD_LER : 0;
-
-       if (num_ports != 1 || (vl_select_mask & ~VL_MASK_ALL)) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-
-       /* Sanity check */
-       response_data_size = sizeof(struct opa_port_data_counters_msg) +
-                               num_vls * sizeof(struct _vls_dctrs);
-
-       if (response_data_size > sizeof(pmp->data)) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-
-       /*
-        * The bit set in the mask needs to be consistent with the
-        * port the request came in on.  find_first_bit() takes the
-        * bitmap length in bits, not bytes.
-        */
-       port_mask = be64_to_cpu(req->port_select_mask[3]);
-       port_num = find_first_bit((unsigned long *)&port_mask,
-                                 8 * sizeof(port_mask));
-
-       if ((u8)port_num != port) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-
-       rsp = &req->port[0];
-       memset(rsp, 0, sizeof(*rsp));
-
-       rsp->port_number = port;
-       /*
-        * Note that link_quality_indicator is a 32 bit quantity in
-        * 'datacounters' queries (as opposed to 'portinfo' queries,
-        * where it's a byte).
-        */
-       hfi1_read_link_quality(dd, &lq);
-       rsp->link_quality_indicator = cpu_to_be32((u32)lq);
-       pma_get_opa_port_dctrs(ibdev, rsp);
-
-       rsp->port_xmit_wait =
-               cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL));
-       rsp->port_rcv_fecn =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL));
-       rsp->port_rcv_becn =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL));
-       rsp->port_error_counter_summary =
-               cpu_to_be64(get_error_counter_summary(ibdev, port,
-                                                     res_lli, res_ler));
-
-       /* The vl_select_mask has been checked above, and we know
-        * that it contains only entries which represent valid VLs.
-        * So in the for_each_set_bit() loop below, we don't need
-        * any additional checks for vl.
-        *
-        * for_each_set_bit() walks unsigned longs, so the mask is
-        * copied into one instead of casting the address of the u32.
-        */
-       vl_bitmap = vl_select_mask;
-       vlinfo = &rsp->vls[0];
-       for_each_set_bit(vl, &vl_bitmap, 8 * sizeof(req->vl_select_mask)) {
-               /* one response entry per selected VL, in ascending VL order */
-               memset(vlinfo, 0, sizeof(*vlinfo));
-
-               vlinfo->port_vl_xmit_data =
-                       cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL,
-                                                  idx_from_vl(vl)));
-
-               vlinfo->port_vl_rcv_data =
-                       cpu_to_be64(read_dev_cntr(dd, C_DC_RX_FLIT_VL,
-                                                 idx_from_vl(vl)));
-
-               vlinfo->port_vl_xmit_pkts =
-                       cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL,
-                                                  idx_from_vl(vl)));
-
-               vlinfo->port_vl_rcv_pkts =
-                       cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL,
-                                                 idx_from_vl(vl)));
-
-               vlinfo->port_vl_xmit_wait =
-                       cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT_VL,
-                                                  idx_from_vl(vl)));
-
-               vlinfo->port_vl_rcv_fecn =
-                       cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL,
-                                                 idx_from_vl(vl)));
-               vlinfo->port_vl_rcv_becn =
-                       cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL,
-                                                 idx_from_vl(vl)));
-
-               /* rsp->port_vl_xmit_time_cong is 0 for HFIs */
-               /* rsp->port_vl_xmit_wasted_bw ??? */
-               /* port_vl_xmit_wait_data - TXE (table 13-9 HFI spec) ???
-                * does this differ from port_vl_xmit_wait
-                */
-               vlinfo++;
-       }
-
-       a0_datacounters(ppd, rsp, vl_select_mask);
-
-       if (resp_len)
-               *resp_len += response_data_size;
-
-       return reply((struct ib_mad_hdr *)pmp);
-}
-
-/*
- * pma_get_ib_portcounters_ext - answer an IB PortCountersExtended query
- * @pmp: IB PMA MAD; the response is written into pmp->data
- * @ibdev: device the MAD arrived on
- * @port: port the MAD arrived on
- *
- * A non-zero attribute modifier or a port_select other than @port sets
- * IB_SMP_INVALID_FIELD and leaves the counters untouched.  Otherwise the
- * data counters come from pma_get_opa_port_dctrs(); the two unicast packet
- * counters are reported as 0.
- *
- * Return: whatever reply() returns for the MAD header.
- */
-static int pma_get_ib_portcounters_ext(struct ib_pma_mad *pmp,
-                                      struct ib_device *ibdev, u8 port)
-{
-       struct ib_pma_portcounters_ext *ext =
-               (struct ib_pma_portcounters_ext *)pmp->data;
-       struct _port_dctrs dctrs;
-
-       if (pmp->mad_hdr.attr_mod != 0 || ext->port_select != port) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-
-       memset(&dctrs, 0, sizeof(dctrs));
-       pma_get_opa_port_dctrs(ibdev, &dctrs);
-
-       /* values are already big-endian, so they are copied as-is */
-       ext->port_xmit_data = dctrs.port_xmit_data;
-       ext->port_rcv_data = dctrs.port_rcv_data;
-       ext->port_xmit_packets = dctrs.port_xmit_pkts;
-       ext->port_rcv_packets = dctrs.port_rcv_pkts;
-       ext->port_unicast_xmit_packets = 0;
-       ext->port_unicast_rcv_packets = 0;
-       ext->port_multicast_xmit_packets = dctrs.port_multicast_xmit_pkts;
-       ext->port_multicast_rcv_packets = dctrs.port_multicast_rcv_pkts;
-
-       return reply((struct ib_mad_hdr *)pmp);
-}
-
-/*
- * pma_get_opa_port_ectrs - fill the port-wide error counters
- * @ibdev: device whose device-level counters are read
- * @rsp: destination for the big-endian counter values
- * @port: port whose port-level counters are read
- *
- * The two summed fields (link_error_recovery, local_link_integrity_errors)
- * are pegged at all-ones when the sum wraps; link_error_recovery is also
- * pegged when the sum does not fit in 32 bits.
- */
-static void pma_get_opa_port_ectrs(struct ib_device *ibdev,
-                                  struct _port_ectrs *rsp, u8 port)
-{
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
-       u64 first, sum;
-
-       /* link error recovery = sequence CRC count + reinit-from-peer count */
-       first = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
-       sum = first + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
-                                   CNTR_INVALID_VL);
-       if (sum < first || sum > (u32)UINT_MAX)
-               rsp->link_error_recovery = cpu_to_be32(~0);
-       else
-               rsp->link_error_recovery = cpu_to_be32(sum);
-
-       rsp->link_downed =
-               cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN,
-                                          CNTR_INVALID_VL));
-       rsp->port_rcv_errors =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL));
-       rsp->port_rcv_remote_physical_errors =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
-                                         CNTR_INVALID_VL));
-       rsp->port_rcv_switch_relay_errors = 0;
-       rsp->port_xmit_discards =
-               cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD,
-                                          CNTR_INVALID_VL));
-       rsp->port_xmit_constraint_errors =
-               cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR,
-                                          CNTR_INVALID_VL));
-       rsp->port_rcv_constraint_errors =
-               cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR,
-                                          CNTR_INVALID_VL));
-
-       /* local link integrity = RX replays + TX replays */
-       first = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL);
-       sum = first + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL);
-       if (sum < first)
-               rsp->local_link_integrity_errors = cpu_to_be64(~0);
-       else
-               rsp->local_link_integrity_errors = cpu_to_be64(sum);
-
-       rsp->excessive_buffer_overruns =
-               cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL));
-}
-
-/*
- * pma_get_opa_porterrors - build the reply to an ErrorPortCounters query
- * @pmp: PMA MAD; the request is parsed from pmp->data and the response is
- *       written over the same buffer
- * @ibdev: device the MAD arrived on
- * @port: port the MAD arrived on
- * @resp_len: if non-NULL, incremented by the size of the response payload
- *
- * IB_SMP_INVALID_FIELD is set in the MAD status when the attribute modifier
- * does not select exactly one port, port_select_mask[3] does not have
- * exactly one bit set, the response would not fit in pmp->data, or the
- * selected bit is not @port.  Per-VL entries are returned zeroed.
- *
- * Return: whatever reply() returns for the MAD header.
- */
-static int pma_get_opa_porterrors(struct opa_pma_mad *pmp,
-                                 struct ib_device *ibdev,
-                                 u8 port, u32 *resp_len)
-{
-       size_t response_data_size;
-       struct _port_ectrs *rsp;
-       u8 port_num;
-       struct opa_port_error_counters64_msg *req;
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       u32 num_ports;
-       u8 num_pslm;
-       u8 num_vls;
-       struct hfi1_ibport *ibp;
-       struct hfi1_pportdata *ppd;
-       struct _vls_ectrs *vlinfo;
-       unsigned long vl;
-       unsigned long vl_bitmap;
-       u64 port_mask, tmp;
-
-       req = (struct opa_port_error_counters64_msg *)pmp->data;
-
-       num_ports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
-
-       num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
-       num_vls = hweight32(be32_to_cpu(req->vl_select_mask));
-
-       if (num_ports != 1 || num_ports != num_pslm) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-
-       response_data_size = sizeof(struct opa_port_error_counters64_msg) +
-                               num_vls * sizeof(struct _vls_ectrs);
-
-       if (response_data_size > sizeof(pmp->data)) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-       /*
-        * The bit set in the mask needs to be consistent with the
-        * port the request came in on.  find_first_bit() takes the
-        * bitmap length in bits, not bytes.
-        */
-       port_mask = be64_to_cpu(req->port_select_mask[3]);
-       port_num = find_first_bit((unsigned long *)&port_mask,
-                                 8 * sizeof(port_mask));
-
-       if (port_num != port) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-
-       rsp = &req->port[0];
-
-       ibp = to_iport(ibdev, port_num);
-       ppd = ppd_from_ibp(ibp);
-
-       memset(rsp, 0, sizeof(*rsp));
-       rsp->port_number = port_num;
-
-       pma_get_opa_port_ectrs(ibdev, rsp, port_num);
-
-       rsp->port_rcv_remote_physical_errors =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
-                                         CNTR_INVALID_VL));
-       rsp->fm_config_errors =
-               cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR,
-                                         CNTR_INVALID_VL));
-
-       /* uncorrectable_errors is stored as 8 bits and pegs at 0xff */
-       tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
-       rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff;
-
-       /*
-        * for_each_set_bit() walks unsigned longs, so the mask is copied
-        * into one instead of casting the address of a u32.
-        */
-       vl_bitmap = be32_to_cpu(req->vl_select_mask);
-       vlinfo = &rsp->vls[0];
-       for_each_set_bit(vl, &vl_bitmap, 8 * sizeof(req->vl_select_mask)) {
-               /* one zeroed entry per selected VL */
-               memset(vlinfo, 0, sizeof(*vlinfo));
-               /* vlinfo->vls[vfi].port_vl_xmit_discards ??? */
-               vlinfo += 1;
-       }
-
-       if (resp_len)
-               *resp_len += response_data_size;
-
-       return reply((struct ib_mad_hdr *)pmp);
-}
-
-/*
- * pma_get_ib_portcounters - answer an IB PortCounters query
- * @pmp: IB PMA MAD; the response is written into pmp->data
- * @ibdev: device the MAD arrived on
- * @port: port the MAD arrived on
- *
- * The counters are fetched through pma_get_opa_port_ectrs() and narrowed to
- * the IB field widths, each pegging at its maximum (0xFF, 0xFFFF, or 0xF
- * per nibble) except port_rcv_switch_relay_errors, which is only truncated.
- * A non-zero attribute modifier or a port_select other than @port sets
- * IB_SMP_INVALID_FIELD and leaves the response fields untouched.
- *
- * Return: whatever reply() returns for the MAD header.
- */
-static int pma_get_ib_portcounters(struct ib_pma_mad *pmp,
-                                  struct ib_device *ibdev, u8 port)
-{
-       struct ib_pma_portcounters *pc =
-               (struct ib_pma_portcounters *)pmp->data;
-       struct _port_ectrs ectrs;
-       u64 nibbles;
-       u64 wide;
-       u32 narrow;
-
-       memset(&ectrs, 0, sizeof(ectrs));
-       pma_get_opa_port_ectrs(ibdev, &ectrs, port);
-
-       if (pmp->mad_hdr.attr_mod != 0 || pc->port_select != port) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-
-       pc->symbol_error_counter = 0; /* N/A for OPA */
-
-       narrow = be32_to_cpu(ectrs.link_error_recovery);
-       pc->link_error_recovery_counter = narrow > 0xFFUL ? 0xFF : (u8)narrow;
-
-       narrow = be32_to_cpu(ectrs.link_downed);
-       pc->link_downed_counter = narrow > 0xFFUL ? 0xFF : (u8)narrow;
-
-       wide = be64_to_cpu(ectrs.port_rcv_errors);
-       pc->port_rcv_errors =
-               cpu_to_be16(wide > 0xFFFFUL ? 0xFFFF : (u16)wide);
-
-       wide = be64_to_cpu(ectrs.port_rcv_remote_physical_errors);
-       pc->port_rcv_remphys_errors =
-               cpu_to_be16(wide > 0xFFFFUL ? 0xFFFF : (u16)wide);
-
-       /* no pegging here: the value is just truncated to 16 bits */
-       wide = be64_to_cpu(ectrs.port_rcv_switch_relay_errors);
-       pc->port_rcv_switch_relay_errors = cpu_to_be16((u16)wide);
-
-       wide = be64_to_cpu(ectrs.port_xmit_discards);
-       pc->port_xmit_discards =
-               cpu_to_be16(wide > 0xFFFFUL ? 0xFFFF : (u16)wide);
-
-       wide = be64_to_cpu(ectrs.port_xmit_constraint_errors);
-       pc->port_xmit_constraint_errors = wide > 0xFFUL ? 0xFF : (u8)wide;
-
-       wide = be64_to_cpu(ectrs.port_rcv_constraint_errors);
-       pc->port_rcv_constraint_errors = wide > 0xFFUL ? 0xFF : (u8)wide;
-
-       /* LocalLink: 7:4, BufferOverrun: 3:0 */
-       wide = be64_to_cpu(ectrs.local_link_integrity_errors);
-       nibbles = (wide > 0xFUL ? 0xFUL : wide) << 4;
-
-       wide = be64_to_cpu(ectrs.excessive_buffer_overruns);
-       nibbles |= wide > 0xFUL ? 0xFUL : wide;
-
-       pc->link_overrun_errors = (u8)nibbles;
-
-       pc->vl15_dropped = 0; /* N/A for OPA */
-
-       return reply((struct ib_mad_hdr *)pmp);
-}
-
-/*
- * pma_get_opa_errorinfo - build the reply to an ErrorInfo query
- * @pmp: PMA MAD; the request is parsed from pmp->data and the response is
- *       written over the same buffer
- * @ibdev: device the MAD arrived on
- * @port: port the MAD arrived on
- * @resp_len: if non-NULL, incremented by the size of the response payload
- *
- * The first port entry of the message is cleared before any validation.
- * IB_SMP_INVALID_FIELD is set in the MAD status when the attribute modifier
- * does not select exactly one port, port_select_mask[3] does not have
- * exactly one bit set, the response would not fit in pmp->data, or the
- * selected bit is not @port.  Otherwise the saved error-info fields from the
- * device data and the RCV_ERR_INFO CSR are copied into the response.
- *
- * Return: whatever reply() returns for the MAD header.
- */
-static int pma_get_opa_errorinfo(struct opa_pma_mad *pmp,
-                                struct ib_device *ibdev,
-                                u8 port, u32 *resp_len)
-{
-       size_t response_data_size;
-       struct _port_ei *rsp;
-       struct opa_port_error_info_msg *req;
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       u64 port_mask;
-       u32 num_ports;
-       u8 port_num;
-       u8 num_pslm;
-       u64 reg;
-
-       req = (struct opa_port_error_info_msg *)pmp->data;
-       rsp = &req->port[0];
-
-       num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod));
-       num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
-
-       memset(rsp, 0, sizeof(*rsp));
-
-       if (num_ports != 1 || num_ports != num_pslm) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-
-       /* Sanity check */
-       response_data_size = sizeof(struct opa_port_error_info_msg);
-
-       if (response_data_size > sizeof(pmp->data)) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-
-       /*
-        * The bit set in the mask needs to be consistent with the port
-        * the request came in on.  find_first_bit() takes the bitmap
-        * length in bits, not bytes.
-        */
-       port_mask = be64_to_cpu(req->port_select_mask[3]);
-       port_num = find_first_bit((unsigned long *)&port_mask,
-                                 8 * sizeof(port_mask));
-
-       if (port_num != port) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-
-       /* PortRcvErrorInfo */
-       rsp->port_rcv_ei.status_and_code =
-               dd->err_info_rcvport.status_and_code;
-       memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit1,
-              &dd->err_info_rcvport.packet_flit1, sizeof(u64));
-       memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit2,
-              &dd->err_info_rcvport.packet_flit2, sizeof(u64));
-
-       /* ExcessiveBufferOverrunInfo */
-       reg = read_csr(dd, RCV_ERR_INFO);
-       if (reg & RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK) {
-               /*
-                * if the RcvExcessBufferOverrun bit is set, save SC of
-                * first pkt that encountered an excess buffer overrun
-                */
-               u8 tmp = (u8)reg;
-
-               tmp &=  RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SC_SMASK;
-               tmp <<= 2;
-               rsp->excessive_buffer_overrun_ei.status_and_sc = tmp;
-               /* set the status bit */
-               rsp->excessive_buffer_overrun_ei.status_and_sc |= 0x80;
-       }
-
-       /* PortXmitConstraintErrorInfo */
-       rsp->port_xmit_constraint_ei.status =
-               dd->err_info_xmit_constraint.status;
-       rsp->port_xmit_constraint_ei.pkey =
-               cpu_to_be16(dd->err_info_xmit_constraint.pkey);
-       rsp->port_xmit_constraint_ei.slid =
-               cpu_to_be32(dd->err_info_xmit_constraint.slid);
-
-       /* PortRcvConstraintErrorInfo */
-       rsp->port_rcv_constraint_ei.status =
-               dd->err_info_rcv_constraint.status;
-       rsp->port_rcv_constraint_ei.pkey =
-               cpu_to_be16(dd->err_info_rcv_constraint.pkey);
-       rsp->port_rcv_constraint_ei.slid =
-               cpu_to_be32(dd->err_info_rcv_constraint.slid);
-
-       /* UncorrectableErrorInfo */
-       rsp->uncorrectable_ei.status_and_code = dd->err_info_uncorrectable;
-
-       /* FMConfigErrorInfo */
-       rsp->fm_config_ei.status_and_code = dd->err_info_fmconfig;
-
-       if (resp_len)
-               *resp_len += response_data_size;
-
-       return reply((struct ib_mad_hdr *)pmp);
-}
-
-static int pma_set_opa_portstatus(struct opa_pma_mad *pmp,
-                                 struct ib_device *ibdev,
-                                 u8 port, u32 *resp_len)
-{
-       struct opa_clear_port_status *req =
-               (struct opa_clear_port_status *)pmp->data;
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
-       u64 portn = be64_to_cpu(req->port_select_mask[3]);
-       u32 counter_select = be32_to_cpu(req->counter_select_mask);
-       u32 vl_select_mask = VL_MASK_ALL; /* clear all per-vl cnts */
-       unsigned long vl;
-
-       if ((nports != 1) || (portn != 1 << port)) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-       /*
-        * only counters returned by pma_get_opa_portstatus() are
-        * handled, so when pma_get_opa_portstatus() gets a fix,
-        * the corresponding change should be made here as well.
-        */
-
-       if (counter_select & CS_PORT_XMIT_DATA)
-               write_dev_cntr(dd, C_DC_XMIT_FLITS, CNTR_INVALID_VL, 0);
-
-       if (counter_select & CS_PORT_RCV_DATA)
-               write_dev_cntr(dd, C_DC_RCV_FLITS, CNTR_INVALID_VL, 0);
-
-       if (counter_select & CS_PORT_XMIT_PKTS)
-               write_dev_cntr(dd, C_DC_XMIT_PKTS, CNTR_INVALID_VL, 0);
-
-       if (counter_select & CS_PORT_RCV_PKTS)
-               write_dev_cntr(dd, C_DC_RCV_PKTS, CNTR_INVALID_VL, 0);
-
-       if (counter_select & CS_PORT_MCAST_XMIT_PKTS)
-               write_dev_cntr(dd, C_DC_MC_XMIT_PKTS, CNTR_INVALID_VL, 0);
-
-       if (counter_select & CS_PORT_MCAST_RCV_PKTS)
-               write_dev_cntr(dd, C_DC_MC_RCV_PKTS, CNTR_INVALID_VL, 0);
-
-       if (counter_select & CS_PORT_XMIT_WAIT)
-               write_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL, 0);
-
-       /* ignore cs_sw_portCongestion for HFIs */
-
-       if (counter_select & CS_PORT_RCV_FECN)
-               write_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL, 0);
-
-       if (counter_select & CS_PORT_RCV_BECN)
-               write_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL, 0);
-
-       /* ignore cs_port_xmit_time_cong for HFIs */
-       /* ignore cs_port_xmit_wasted_bw for now */
-       /* ignore cs_port_xmit_wait_data for now */
-       if (counter_select & CS_PORT_RCV_BUBBLE)
-               write_dev_cntr(dd, C_DC_RCV_BBL, CNTR_INVALID_VL, 0);
-
-       /* Only applicable for switch */
-       /* if (counter_select & CS_PORT_MARK_FECN)
-        *      write_csr(dd, DCC_PRF_PORT_MARK_FECN_CNT, 0);
-        */
-
-       if (counter_select & CS_PORT_RCV_CONSTRAINT_ERRORS)
-               write_port_cntr(ppd, C_SW_RCV_CSTR_ERR, CNTR_INVALID_VL, 0);
-
-       /* ignore cs_port_rcv_switch_relay_errors for HFIs */
-       if (counter_select & CS_PORT_XMIT_DISCARDS)
-               write_port_cntr(ppd, C_SW_XMIT_DSCD, CNTR_INVALID_VL, 0);
-
-       if (counter_select & CS_PORT_XMIT_CONSTRAINT_ERRORS)
-               write_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, CNTR_INVALID_VL, 0);
-
-       if (counter_select & CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS)
-               write_dev_cntr(dd, C_DC_RMT_PHY_ERR, CNTR_INVALID_VL, 0);
-
-       if (counter_select & CS_LOCAL_LINK_INTEGRITY_ERRORS) {
-               write_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL, 0);
-               write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0);
-       }
-
-       if (counter_select & CS_LINK_ERROR_RECOVERY) {
-               write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0);
-               write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
-                              CNTR_INVALID_VL, 0);
-       }
-
-       if (counter_select & CS_PORT_RCV_ERRORS)
-               write_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL, 0);
-
-       if (counter_select & CS_EXCESSIVE_BUFFER_OVERRUNS) {
-               write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0);
-               dd->rcv_ovfl_cnt = 0;
-       }
-
-       if (counter_select & CS_FM_CONFIG_ERRORS)
-               write_dev_cntr(dd, C_DC_FM_CFG_ERR, CNTR_INVALID_VL, 0);
-
-       if (counter_select & CS_LINK_DOWNED)
-               write_port_cntr(ppd, C_SW_LINK_DOWN, CNTR_INVALID_VL, 0);
-
-       if (counter_select & CS_UNCORRECTABLE_ERRORS)
-               write_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL, 0);
-
-       for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
-                        8 * sizeof(vl_select_mask)) {
-               if (counter_select & CS_PORT_XMIT_DATA)
-                       write_port_cntr(ppd, C_TX_FLIT_VL, idx_from_vl(vl), 0);
-
-               if (counter_select & CS_PORT_RCV_DATA)
-                       write_dev_cntr(dd, C_DC_RX_FLIT_VL, idx_from_vl(vl), 0);
-
-               if (counter_select & CS_PORT_XMIT_PKTS)
-                       write_port_cntr(ppd, C_TX_PKT_VL, idx_from_vl(vl), 0);
-
-               if (counter_select & CS_PORT_RCV_PKTS)
-                       write_dev_cntr(dd, C_DC_RX_PKT_VL, idx_from_vl(vl), 0);
-
-               if (counter_select & CS_PORT_XMIT_WAIT)
-                       write_port_cntr(ppd, C_TX_WAIT_VL, idx_from_vl(vl), 0);
-
-               /* sw_port_vl_congestion is 0 for HFIs */
-               if (counter_select & CS_PORT_RCV_FECN)
-                       write_dev_cntr(dd, C_DC_RCV_FCN_VL, idx_from_vl(vl), 0);
-
-               if (counter_select & CS_PORT_RCV_BECN)
-                       write_dev_cntr(dd, C_DC_RCV_BCN_VL, idx_from_vl(vl), 0);
-
-               /* port_vl_xmit_time_cong is 0 for HFIs */
-               /* port_vl_xmit_wasted_bw ??? */
-               /* port_vl_xmit_wait_data - TXE (table 13-9 HFI spec) ??? */
-               if (counter_select & CS_PORT_RCV_BUBBLE)
-                       write_dev_cntr(dd, C_DC_RCV_BBL_VL, idx_from_vl(vl), 0);
-
-               /* if (counter_select & CS_PORT_MARK_FECN)
-                *     write_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT + offset, 0);
-                */
-               /* port_vl_xmit_discards ??? */
-       }
-
-       if (resp_len)
-               *resp_len += sizeof(*req);
-
-       return reply((struct ib_mad_hdr *)pmp);
-}
-
-static int pma_set_opa_errorinfo(struct opa_pma_mad *pmp,
-                                struct ib_device *ibdev,
-                                u8 port, u32 *resp_len)
-{
-       struct _port_ei *rsp;
-       struct opa_port_error_info_msg *req;
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       u64 port_mask;
-       u32 num_ports;
-       u8 port_num;
-       u8 num_pslm;
-       u32 error_info_select;
-
-       req = (struct opa_port_error_info_msg *)pmp->data;
-       rsp = &req->port[0];
-
-       num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod));
-       num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
-
-       memset(rsp, 0, sizeof(*rsp));
-
-       if (num_ports != 1 || num_ports != num_pslm) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-
-       /*
-        * The bit set in the mask needs to be consistent with the port
-        * the request came in on.
-        */
-       port_mask = be64_to_cpu(req->port_select_mask[3]);
-       port_num = find_first_bit((unsigned long *)&port_mask,
-                                 sizeof(port_mask));
-
-       if (port_num != port) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-
-       error_info_select = be32_to_cpu(req->error_info_select_mask);
-
-       /* PortRcvErrorInfo */
-       if (error_info_select & ES_PORT_RCV_ERROR_INFO)
-               /* turn off status bit */
-               dd->err_info_rcvport.status_and_code &= ~OPA_EI_STATUS_SMASK;
-
-       /* ExcessiverBufferOverrunInfo */
-       if (error_info_select & ES_EXCESSIVE_BUFFER_OVERRUN_INFO)
-               /*
-                * status bit is essentially kept in the h/w - bit 5 of
-                * RCV_ERR_INFO
-                */
-               write_csr(dd, RCV_ERR_INFO,
-                         RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK);
-
-       if (error_info_select & ES_PORT_XMIT_CONSTRAINT_ERROR_INFO)
-               dd->err_info_xmit_constraint.status &= ~OPA_EI_STATUS_SMASK;
-
-       if (error_info_select & ES_PORT_RCV_CONSTRAINT_ERROR_INFO)
-               dd->err_info_rcv_constraint.status &= ~OPA_EI_STATUS_SMASK;
-
-       /* UncorrectableErrorInfo */
-       if (error_info_select & ES_UNCORRECTABLE_ERROR_INFO)
-               /* turn off status bit */
-               dd->err_info_uncorrectable &= ~OPA_EI_STATUS_SMASK;
-
-       /* FMConfigErrorInfo */
-       if (error_info_select & ES_FM_CONFIG_ERROR_INFO)
-               /* turn off status bit */
-               dd->err_info_fmconfig &= ~OPA_EI_STATUS_SMASK;
-
-       if (resp_len)
-               *resp_len += sizeof(*req);
-
-       return reply((struct ib_mad_hdr *)pmp);
-}
-
-struct opa_congestion_info_attr {
-       __be16 congestion_info;
-       u8 control_table_cap;   /* Multiple of 64 entry unit CCTs */
-       u8 congestion_log_length;
-} __packed;
-
-static int __subn_get_opa_cong_info(struct opa_smp *smp, u32 am, u8 *data,
-                                   struct ib_device *ibdev, u8 port,
-                                   u32 *resp_len)
-{
-       struct opa_congestion_info_attr *p =
-               (struct opa_congestion_info_attr *)data;
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-
-       p->congestion_info = 0;
-       p->control_table_cap = ppd->cc_max_table_entries;
-       p->congestion_log_length = OPA_CONG_LOG_ELEMS;
-
-       if (resp_len)
-               *resp_len += sizeof(*p);
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-static int __subn_get_opa_cong_setting(struct opa_smp *smp, u32 am,
-                                      u8 *data, struct ib_device *ibdev,
-                                      u8 port, u32 *resp_len)
-{
-       int i;
-       struct opa_congestion_setting_attr *p =
-               (struct opa_congestion_setting_attr *)data;
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       struct opa_congestion_setting_entry_shadow *entries;
-       struct cc_state *cc_state;
-
-       rcu_read_lock();
-
-       cc_state = get_cc_state(ppd);
-
-       if (!cc_state) {
-               rcu_read_unlock();
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       entries = cc_state->cong_setting.entries;
-       p->port_control = cpu_to_be16(cc_state->cong_setting.port_control);
-       p->control_map = cpu_to_be32(cc_state->cong_setting.control_map);
-       for (i = 0; i < OPA_MAX_SLS; i++) {
-               p->entries[i].ccti_increase = entries[i].ccti_increase;
-               p->entries[i].ccti_timer = cpu_to_be16(entries[i].ccti_timer);
-               p->entries[i].trigger_threshold =
-                       entries[i].trigger_threshold;
-               p->entries[i].ccti_min = entries[i].ccti_min;
-       }
-
-       rcu_read_unlock();
-
-       if (resp_len)
-               *resp_len += sizeof(*p);
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-static int __subn_set_opa_cong_setting(struct opa_smp *smp, u32 am, u8 *data,
-                                      struct ib_device *ibdev, u8 port,
-                                      u32 *resp_len)
-{
-       struct opa_congestion_setting_attr *p =
-               (struct opa_congestion_setting_attr *)data;
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       struct opa_congestion_setting_entry_shadow *entries;
-       int i;
-
-       ppd->cc_sl_control_map = be32_to_cpu(p->control_map);
-
-       entries = ppd->congestion_entries;
-       for (i = 0; i < OPA_MAX_SLS; i++) {
-               entries[i].ccti_increase = p->entries[i].ccti_increase;
-               entries[i].ccti_timer = be16_to_cpu(p->entries[i].ccti_timer);
-               entries[i].trigger_threshold =
-                       p->entries[i].trigger_threshold;
-               entries[i].ccti_min = p->entries[i].ccti_min;
-       }
-
-       return __subn_get_opa_cong_setting(smp, am, data, ibdev, port,
-                                          resp_len);
-}
-
-static int __subn_get_opa_hfi1_cong_log(struct opa_smp *smp, u32 am,
-                                       u8 *data, struct ib_device *ibdev,
-                                       u8 port, u32 *resp_len)
-{
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       struct opa_hfi1_cong_log *cong_log = (struct opa_hfi1_cong_log *)data;
-       s64 ts;
-       int i;
-
-       if (am != 0) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       spin_lock_irq(&ppd->cc_log_lock);
-
-       cong_log->log_type = OPA_CC_LOG_TYPE_HFI;
-       cong_log->congestion_flags = 0;
-       cong_log->threshold_event_counter =
-               cpu_to_be16(ppd->threshold_event_counter);
-       memcpy(cong_log->threshold_cong_event_map,
-              ppd->threshold_cong_event_map,
-              sizeof(cong_log->threshold_cong_event_map));
-       /* keep timestamp in units of 1.024 usec */
-       ts = ktime_to_ns(ktime_get()) / 1024;
-       cong_log->current_time_stamp = cpu_to_be32(ts);
-       for (i = 0; i < OPA_CONG_LOG_ELEMS; i++) {
-               struct opa_hfi1_cong_log_event_internal *cce =
-                       &ppd->cc_events[ppd->cc_mad_idx++];
-               if (ppd->cc_mad_idx == OPA_CONG_LOG_ELEMS)
-                       ppd->cc_mad_idx = 0;
-               /*
-                * Entries which are older than twice the time
-                * required to wrap the counter are supposed to
-                * be zeroed (CA10-49 IBTA, release 1.2.1, V1).
-                */
-               if ((u64)(ts - cce->timestamp) > (2 * UINT_MAX))
-                       continue;
-               memcpy(cong_log->events[i].local_qp_cn_entry, &cce->lqpn, 3);
-               memcpy(cong_log->events[i].remote_qp_number_cn_entry,
-                      &cce->rqpn, 3);
-               cong_log->events[i].sl_svc_type_cn_entry =
-                       ((cce->sl & 0x1f) << 3) | (cce->svc_type & 0x7);
-               cong_log->events[i].remote_lid_cn_entry =
-                       cpu_to_be32(cce->rlid);
-               cong_log->events[i].timestamp_cn_entry =
-                       cpu_to_be32(cce->timestamp);
-       }
-
-       /*
-        * Reset threshold_cong_event_map, and threshold_event_counter
-        * to 0 when log is read.
-        */
-       memset(ppd->threshold_cong_event_map, 0x0,
-              sizeof(ppd->threshold_cong_event_map));
-       ppd->threshold_event_counter = 0;
-
-       spin_unlock_irq(&ppd->cc_log_lock);
-
-       if (resp_len)
-               *resp_len += sizeof(struct opa_hfi1_cong_log);
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-static int __subn_get_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
-                                  u32 *resp_len)
-{
-       struct ib_cc_table_attr *cc_table_attr =
-               (struct ib_cc_table_attr *)data;
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       u32 start_block = OPA_AM_START_BLK(am);
-       u32 n_blocks = OPA_AM_NBLK(am);
-       struct ib_cc_table_entry_shadow *entries;
-       int i, j;
-       u32 sentry, eentry;
-       struct cc_state *cc_state;
-
-       /* sanity check n_blocks, start_block */
-       if (n_blocks == 0 ||
-           start_block + n_blocks > ppd->cc_max_table_entries) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       rcu_read_lock();
-
-       cc_state = get_cc_state(ppd);
-
-       if (!cc_state) {
-               rcu_read_unlock();
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       sentry = start_block * IB_CCT_ENTRIES;
-       eentry = sentry + (IB_CCT_ENTRIES * n_blocks);
-
-       cc_table_attr->ccti_limit = cpu_to_be16(cc_state->cct.ccti_limit);
-
-       entries = cc_state->cct.entries;
-
-       /* return n_blocks, though the last block may not be full */
-       for (j = 0, i = sentry; i < eentry; j++, i++)
-               cc_table_attr->ccti_entries[j].entry =
-                       cpu_to_be16(entries[i].entry);
-
-       rcu_read_unlock();
-
-       if (resp_len)
-               *resp_len += sizeof(u16) * (IB_CCT_ENTRIES * n_blocks + 1);
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-void cc_state_reclaim(struct rcu_head *rcu)
-{
-       struct cc_state *cc_state = container_of(rcu, struct cc_state, rcu);
-
-       kfree(cc_state);
-}
-
-static int __subn_set_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
-                                  u32 *resp_len)
-{
-       struct ib_cc_table_attr *p = (struct ib_cc_table_attr *)data;
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       u32 start_block = OPA_AM_START_BLK(am);
-       u32 n_blocks = OPA_AM_NBLK(am);
-       struct ib_cc_table_entry_shadow *entries;
-       int i, j;
-       u32 sentry, eentry;
-       u16 ccti_limit;
-       struct cc_state *old_cc_state, *new_cc_state;
-
-       /* sanity check n_blocks, start_block */
-       if (n_blocks == 0 ||
-           start_block + n_blocks > ppd->cc_max_table_entries) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       sentry = start_block * IB_CCT_ENTRIES;
-       eentry = sentry + ((n_blocks - 1) * IB_CCT_ENTRIES) +
-                (be16_to_cpu(p->ccti_limit)) % IB_CCT_ENTRIES + 1;
-
-       /* sanity check ccti_limit */
-       ccti_limit = be16_to_cpu(p->ccti_limit);
-       if (ccti_limit + 1 > eentry) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       new_cc_state = kzalloc(sizeof(*new_cc_state), GFP_KERNEL);
-       if (!new_cc_state)
-               goto getit;
-
-       spin_lock(&ppd->cc_state_lock);
-
-       old_cc_state = get_cc_state(ppd);
-
-       if (!old_cc_state) {
-               spin_unlock(&ppd->cc_state_lock);
-               kfree(new_cc_state);
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       *new_cc_state = *old_cc_state;
-
-       new_cc_state->cct.ccti_limit = ccti_limit;
-
-       entries = ppd->ccti_entries;
-       ppd->total_cct_entry = ccti_limit + 1;
-
-       for (j = 0, i = sentry; i < eentry; j++, i++)
-               entries[i].entry = be16_to_cpu(p->ccti_entries[j].entry);
-
-       memcpy(new_cc_state->cct.entries, entries,
-              eentry * sizeof(struct ib_cc_table_entry));
-
-       new_cc_state->cong_setting.port_control = IB_CC_CCS_PC_SL_BASED;
-       new_cc_state->cong_setting.control_map = ppd->cc_sl_control_map;
-       memcpy(new_cc_state->cong_setting.entries, ppd->congestion_entries,
-              OPA_MAX_SLS * sizeof(struct opa_congestion_setting_entry));
-
-       rcu_assign_pointer(ppd->cc_state, new_cc_state);
-
-       spin_unlock(&ppd->cc_state_lock);
-
-       call_rcu(&old_cc_state->rcu, cc_state_reclaim);
-
-getit:
-       return __subn_get_opa_cc_table(smp, am, data, ibdev, port, resp_len);
-}
-
-struct opa_led_info {
-       __be32 rsvd_led_mask;
-       __be32 rsvd;
-};
-
-#define OPA_LED_SHIFT  31
-#define OPA_LED_MASK   BIT(OPA_LED_SHIFT)
-
-static int __subn_get_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
-                                  u32 *resp_len)
-{
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       struct hfi1_pportdata *ppd = dd->pport;
-       struct opa_led_info *p = (struct opa_led_info *)data;
-       u32 nport = OPA_AM_NPORT(am);
-       u32 is_beaconing_active;
-
-       if (nport != 1) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       /*
-        * This pairs with the memory barrier in hfi1_start_led_override to
-        * ensure that we read the correct state of LED beaconing represented
-        * by led_override_timer_active
-        */
-       smp_rmb();
-       is_beaconing_active = !!atomic_read(&ppd->led_override_timer_active);
-       p->rsvd_led_mask = cpu_to_be32(is_beaconing_active << OPA_LED_SHIFT);
-
-       if (resp_len)
-               *resp_len += sizeof(struct opa_led_info);
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-static int __subn_set_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
-                                  struct ib_device *ibdev, u8 port,
-                                  u32 *resp_len)
-{
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       struct opa_led_info *p = (struct opa_led_info *)data;
-       u32 nport = OPA_AM_NPORT(am);
-       int on = !!(be32_to_cpu(p->rsvd_led_mask) & OPA_LED_MASK);
-
-       if (nport != 1) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       if (on)
-               hfi1_start_led_override(dd->pport, 2000, 1500);
-       else
-               shutdown_led_override(dd->pport);
-
-       return __subn_get_opa_led_info(smp, am, data, ibdev, port, resp_len);
-}
-
-static int subn_get_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am,
-                           u8 *data, struct ib_device *ibdev, u8 port,
-                           u32 *resp_len)
-{
-       int ret;
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-
-       switch (attr_id) {
-       case IB_SMP_ATTR_NODE_DESC:
-               ret = __subn_get_opa_nodedesc(smp, am, data, ibdev, port,
-                                             resp_len);
-               break;
-       case IB_SMP_ATTR_NODE_INFO:
-               ret = __subn_get_opa_nodeinfo(smp, am, data, ibdev, port,
-                                             resp_len);
-               break;
-       case IB_SMP_ATTR_PORT_INFO:
-               ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port,
-                                             resp_len);
-               break;
-       case IB_SMP_ATTR_PKEY_TABLE:
-               ret = __subn_get_opa_pkeytable(smp, am, data, ibdev, port,
-                                              resp_len);
-               break;
-       case OPA_ATTRIB_ID_SL_TO_SC_MAP:
-               ret = __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port,
-                                             resp_len);
-               break;
-       case OPA_ATTRIB_ID_SC_TO_SL_MAP:
-               ret = __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port,
-                                             resp_len);
-               break;
-       case OPA_ATTRIB_ID_SC_TO_VLT_MAP:
-               ret = __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port,
-                                              resp_len);
-               break;
-       case OPA_ATTRIB_ID_SC_TO_VLNT_MAP:
-               ret = __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port,
-                                               resp_len);
-               break;
-       case OPA_ATTRIB_ID_PORT_STATE_INFO:
-               ret = __subn_get_opa_psi(smp, am, data, ibdev, port,
-                                        resp_len);
-               break;
-       case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE:
-               ret = __subn_get_opa_bct(smp, am, data, ibdev, port,
-                                        resp_len);
-               break;
-       case OPA_ATTRIB_ID_CABLE_INFO:
-               ret = __subn_get_opa_cable_info(smp, am, data, ibdev, port,
-                                               resp_len);
-               break;
-       case IB_SMP_ATTR_VL_ARB_TABLE:
-               ret = __subn_get_opa_vl_arb(smp, am, data, ibdev, port,
-                                           resp_len);
-               break;
-       case OPA_ATTRIB_ID_CONGESTION_INFO:
-               ret = __subn_get_opa_cong_info(smp, am, data, ibdev, port,
-                                              resp_len);
-               break;
-       case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING:
-               ret = __subn_get_opa_cong_setting(smp, am, data, ibdev,
-                                                 port, resp_len);
-               break;
-       case OPA_ATTRIB_ID_HFI_CONGESTION_LOG:
-               ret = __subn_get_opa_hfi1_cong_log(smp, am, data, ibdev,
-                                                  port, resp_len);
-               break;
-       case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE:
-               ret = __subn_get_opa_cc_table(smp, am, data, ibdev, port,
-                                             resp_len);
-               break;
-       case IB_SMP_ATTR_LED_INFO:
-               ret = __subn_get_opa_led_info(smp, am, data, ibdev, port,
-                                             resp_len);
-               break;
-       case IB_SMP_ATTR_SM_INFO:
-               if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED)
-                       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
-               if (ibp->rvp.port_cap_flags & IB_PORT_SM)
-                       return IB_MAD_RESULT_SUCCESS;
-               /* FALLTHROUGH */
-       default:
-               smp->status |= IB_SMP_UNSUP_METH_ATTR;
-               ret = reply((struct ib_mad_hdr *)smp);
-               break;
-       }
-       return ret;
-}
-
-static int subn_set_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am,
-                           u8 *data, struct ib_device *ibdev, u8 port,
-                           u32 *resp_len)
-{
-       int ret;
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-
-       switch (attr_id) {
-       case IB_SMP_ATTR_PORT_INFO:
-               ret = __subn_set_opa_portinfo(smp, am, data, ibdev, port,
-                                             resp_len);
-               break;
-       case IB_SMP_ATTR_PKEY_TABLE:
-               ret = __subn_set_opa_pkeytable(smp, am, data, ibdev, port,
-                                              resp_len);
-               break;
-       case OPA_ATTRIB_ID_SL_TO_SC_MAP:
-               ret = __subn_set_opa_sl_to_sc(smp, am, data, ibdev, port,
-                                             resp_len);
-               break;
-       case OPA_ATTRIB_ID_SC_TO_SL_MAP:
-               ret = __subn_set_opa_sc_to_sl(smp, am, data, ibdev, port,
-                                             resp_len);
-               break;
-       case OPA_ATTRIB_ID_SC_TO_VLT_MAP:
-               ret = __subn_set_opa_sc_to_vlt(smp, am, data, ibdev, port,
-                                              resp_len);
-               break;
-       case OPA_ATTRIB_ID_SC_TO_VLNT_MAP:
-               ret = __subn_set_opa_sc_to_vlnt(smp, am, data, ibdev, port,
-                                               resp_len);
-               break;
-       case OPA_ATTRIB_ID_PORT_STATE_INFO:
-               ret = __subn_set_opa_psi(smp, am, data, ibdev, port,
-                                        resp_len);
-               break;
-       case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE:
-               ret = __subn_set_opa_bct(smp, am, data, ibdev, port,
-                                        resp_len);
-               break;
-       case IB_SMP_ATTR_VL_ARB_TABLE:
-               ret = __subn_set_opa_vl_arb(smp, am, data, ibdev, port,
-                                           resp_len);
-               break;
-       case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING:
-               ret = __subn_set_opa_cong_setting(smp, am, data, ibdev,
-                                                 port, resp_len);
-               break;
-       case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE:
-               ret = __subn_set_opa_cc_table(smp, am, data, ibdev, port,
-                                             resp_len);
-               break;
-       case IB_SMP_ATTR_LED_INFO:
-               ret = __subn_set_opa_led_info(smp, am, data, ibdev, port,
-                                             resp_len);
-               break;
-       case IB_SMP_ATTR_SM_INFO:
-               if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED)
-                       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
-               if (ibp->rvp.port_cap_flags & IB_PORT_SM)
-                       return IB_MAD_RESULT_SUCCESS;
-               /* FALLTHROUGH */
-       default:
-               smp->status |= IB_SMP_UNSUP_METH_ATTR;
-               ret = reply((struct ib_mad_hdr *)smp);
-               break;
-       }
-       return ret;
-}
-
-static inline void set_aggr_error(struct opa_aggregate *ag)
-{
-       ag->err_reqlength |= cpu_to_be16(0x8000);
-}
-
-static int subn_get_opa_aggregate(struct opa_smp *smp,
-                                 struct ib_device *ibdev, u8 port,
-                                 u32 *resp_len)
-{
-       int i;
-       u32 num_attr = be32_to_cpu(smp->attr_mod) & 0x000000ff;
-       u8 *next_smp = opa_get_smp_data(smp);
-
-       if (num_attr < 1 || num_attr > 117) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       for (i = 0; i < num_attr; i++) {
-               struct opa_aggregate *agg;
-               size_t agg_data_len;
-               size_t agg_size;
-               u32 am;
-
-               agg = (struct opa_aggregate *)next_smp;
-               agg_data_len = (be16_to_cpu(agg->err_reqlength) & 0x007f) * 8;
-               agg_size = sizeof(*agg) + agg_data_len;
-               am = be32_to_cpu(agg->attr_mod);
-
-               *resp_len += agg_size;
-
-               if (next_smp + agg_size > ((u8 *)smp) + sizeof(*smp)) {
-                       smp->status |= IB_SMP_INVALID_FIELD;
-                       return reply((struct ib_mad_hdr *)smp);
-               }
-
-               /* zero the payload for this segment */
-               memset(next_smp + sizeof(*agg), 0, agg_data_len);
-
-               (void)subn_get_opa_sma(agg->attr_id, smp, am, agg->data,
-                                       ibdev, port, NULL);
-               if (smp->status & ~IB_SMP_DIRECTION) {
-                       set_aggr_error(agg);
-                       return reply((struct ib_mad_hdr *)smp);
-               }
-               next_smp += agg_size;
-       }
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-static int subn_set_opa_aggregate(struct opa_smp *smp,
-                                 struct ib_device *ibdev, u8 port,
-                                 u32 *resp_len)
-{
-       int i;
-       u32 num_attr = be32_to_cpu(smp->attr_mod) & 0x000000ff;
-       u8 *next_smp = opa_get_smp_data(smp);
-
-       if (num_attr < 1 || num_attr > 117) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               return reply((struct ib_mad_hdr *)smp);
-       }
-
-       for (i = 0; i < num_attr; i++) {
-               struct opa_aggregate *agg;
-               size_t agg_data_len;
-               size_t agg_size;
-               u32 am;
-
-               agg = (struct opa_aggregate *)next_smp;
-               agg_data_len = (be16_to_cpu(agg->err_reqlength) & 0x007f) * 8;
-               agg_size = sizeof(*agg) + agg_data_len;
-               am = be32_to_cpu(agg->attr_mod);
-
-               *resp_len += agg_size;
-
-               if (next_smp + agg_size > ((u8 *)smp) + sizeof(*smp)) {
-                       smp->status |= IB_SMP_INVALID_FIELD;
-                       return reply((struct ib_mad_hdr *)smp);
-               }
-
-               (void)subn_set_opa_sma(agg->attr_id, smp, am, agg->data,
-                                       ibdev, port, NULL);
-               if (smp->status & ~IB_SMP_DIRECTION) {
-                       set_aggr_error(agg);
-                       return reply((struct ib_mad_hdr *)smp);
-               }
-               next_smp += agg_size;
-       }
-
-       return reply((struct ib_mad_hdr *)smp);
-}
-
-/*
- * OPAv1 specifies that, on the transition to link up, these counters
- * are cleared:
- *   PortRcvErrors [*]
- *   LinkErrorRecovery
- *   LocalLinkIntegrityErrors
- *   ExcessiveBufferOverruns [*]
- *
- * [*] Error info associated with these counters is retained, but the
- * error info status is reset to 0.
- */
-void clear_linkup_counters(struct hfi1_devdata *dd)
-{
-       /* PortRcvErrors */
-       write_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL, 0);
-       dd->err_info_rcvport.status_and_code &= ~OPA_EI_STATUS_SMASK;
-       /* LinkErrorRecovery */
-       write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0);
-       write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL, 0);
-       /* LocalLinkIntegrityErrors */
-       write_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL, 0);
-       write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0);
-       /* ExcessiveBufferOverruns */
-       write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0);
-       dd->rcv_ovfl_cnt = 0;
-       dd->err_info_xmit_constraint.status &= ~OPA_EI_STATUS_SMASK;
-}
-
-/*
- * is_local_mad() returns 1 if 'mad' is sent from, and destined to the
- * local node, 0 otherwise.
- */
-static int is_local_mad(struct hfi1_ibport *ibp, const struct opa_mad *mad,
-                       const struct ib_wc *in_wc)
-{
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       const struct opa_smp *smp = (const struct opa_smp *)mad;
-
-       if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
-               return (smp->hop_cnt == 0 &&
-                       smp->route.dr.dr_slid == OPA_LID_PERMISSIVE &&
-                       smp->route.dr.dr_dlid == OPA_LID_PERMISSIVE);
-       }
-
-       return (in_wc->slid == ppd->lid);
-}
-
-/*
- * opa_local_smp_check() should only be called on MADs for which
- * is_local_mad() returns true. It applies the SMP checks that are
- * specific to SMPs which are sent from, and destined to this node.
- * opa_local_smp_check() returns 0 if the SMP passes its checks, 1
- * otherwise.
- *
- * SMPs which arrive from other nodes are instead checked by
- * opa_smp_check().
- */
-static int opa_local_smp_check(struct hfi1_ibport *ibp,
-                              const struct ib_wc *in_wc)
-{
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       u16 slid = in_wc->slid;
-       u16 pkey;
-
-       if (in_wc->pkey_index >= ARRAY_SIZE(ppd->pkeys))
-               return 1;
-
-       pkey = ppd->pkeys[in_wc->pkey_index];
-       /*
-        * We need to do the "node-local" checks specified in OPAv1,
-        * rev 0.90, section 9.10.26, which are:
-        *   - pkey is 0x7fff, or 0xffff
-        *   - Source QPN == 0 || Destination QPN == 0
-        *   - the MAD header's management class is either
-        *     IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE or
-        *     IB_MGMT_CLASS_SUBN_LID_ROUTED
-        *   - SLID != 0
-        *
-        * However, we know (and so don't need to check again) that,
-        * for local SMPs, the MAD stack passes MADs with:
-        *   - Source QPN of 0
-        *   - MAD mgmt_class is IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
-        *   - SLID is either: OPA_LID_PERMISSIVE (0xFFFFFFFF), or
-        *     our own port's lid
-        *
-        */
-       if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY)
-               return 0;
-       ingress_pkey_table_fail(ppd, pkey, slid);
-       return 1;
-}
-
-static int process_subn_opa(struct ib_device *ibdev, int mad_flags,
-                           u8 port, const struct opa_mad *in_mad,
-                           struct opa_mad *out_mad,
-                           u32 *resp_len)
-{
-       struct opa_smp *smp = (struct opa_smp *)out_mad;
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       u8 *data;
-       u32 am;
-       __be16 attr_id;
-       int ret;
-
-       *out_mad = *in_mad;
-       data = opa_get_smp_data(smp);
-
-       am = be32_to_cpu(smp->attr_mod);
-       attr_id = smp->attr_id;
-       if (smp->class_version != OPA_SMI_CLASS_VERSION) {
-               smp->status |= IB_SMP_UNSUP_VERSION;
-               ret = reply((struct ib_mad_hdr *)smp);
-               return ret;
-       }
-       ret = check_mkey(ibp, (struct ib_mad_hdr *)smp, mad_flags, smp->mkey,
-                        smp->route.dr.dr_slid, smp->route.dr.return_path,
-                        smp->hop_cnt);
-       if (ret) {
-               u32 port_num = be32_to_cpu(smp->attr_mod);
-
-               /*
-                * If this is a get/set portinfo, we already check the
-                * M_Key if the MAD is for another port and the M_Key
-                * is OK on the receiving port. This check is needed
-                * to increment the error counters when the M_Key
-                * fails to match on *both* ports.
-                */
-               if (attr_id == IB_SMP_ATTR_PORT_INFO &&
-                   (smp->method == IB_MGMT_METHOD_GET ||
-                    smp->method == IB_MGMT_METHOD_SET) &&
-                   port_num && port_num <= ibdev->phys_port_cnt &&
-                   port != port_num)
-                       (void)check_mkey(to_iport(ibdev, port_num),
-                                         (struct ib_mad_hdr *)smp, 0,
-                                         smp->mkey, smp->route.dr.dr_slid,
-                                         smp->route.dr.return_path,
-                                         smp->hop_cnt);
-               ret = IB_MAD_RESULT_FAILURE;
-               return ret;
-       }
-
-       *resp_len = opa_get_smp_header_size(smp);
-
-       switch (smp->method) {
-       case IB_MGMT_METHOD_GET:
-               switch (attr_id) {
-               default:
-                       clear_opa_smp_data(smp);
-                       ret = subn_get_opa_sma(attr_id, smp, am, data,
-                                              ibdev, port, resp_len);
-                       break;
-               case OPA_ATTRIB_ID_AGGREGATE:
-                       ret = subn_get_opa_aggregate(smp, ibdev, port,
-                                                    resp_len);
-                       break;
-               }
-               break;
-       case IB_MGMT_METHOD_SET:
-               switch (attr_id) {
-               default:
-                       ret = subn_set_opa_sma(attr_id, smp, am, data,
-                                              ibdev, port, resp_len);
-                       break;
-               case OPA_ATTRIB_ID_AGGREGATE:
-                       ret = subn_set_opa_aggregate(smp, ibdev, port,
-                                                    resp_len);
-                       break;
-               }
-               break;
-       case IB_MGMT_METHOD_TRAP:
-       case IB_MGMT_METHOD_REPORT:
-       case IB_MGMT_METHOD_REPORT_RESP:
-       case IB_MGMT_METHOD_GET_RESP:
-               /*
-                * The ib_mad module will call us to process responses
-                * before checking for other consumers.
-                * Just tell the caller to process it normally.
-                */
-               ret = IB_MAD_RESULT_SUCCESS;
-               break;
-       default:
-               smp->status |= IB_SMP_UNSUP_METHOD;
-               ret = reply((struct ib_mad_hdr *)smp);
-               break;
-       }
-
-       return ret;
-}
-
-static int process_subn(struct ib_device *ibdev, int mad_flags,
-                       u8 port, const struct ib_mad *in_mad,
-                       struct ib_mad *out_mad)
-{
-       struct ib_smp *smp = (struct ib_smp *)out_mad;
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-       int ret;
-
-       *out_mad = *in_mad;
-       if (smp->class_version != 1) {
-               smp->status |= IB_SMP_UNSUP_VERSION;
-               ret = reply((struct ib_mad_hdr *)smp);
-               return ret;
-       }
-
-       ret = check_mkey(ibp, (struct ib_mad_hdr *)smp, mad_flags,
-                        smp->mkey, (__force __be32)smp->dr_slid,
-                        smp->return_path, smp->hop_cnt);
-       if (ret) {
-               u32 port_num = be32_to_cpu(smp->attr_mod);
-
-               /*
-                * If this is a get/set portinfo, we already check the
-                * M_Key if the MAD is for another port and the M_Key
-                * is OK on the receiving port. This check is needed
-                * to increment the error counters when the M_Key
-                * fails to match on *both* ports.
-                */
-               if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO &&
-                   (smp->method == IB_MGMT_METHOD_GET ||
-                    smp->method == IB_MGMT_METHOD_SET) &&
-                   port_num && port_num <= ibdev->phys_port_cnt &&
-                   port != port_num)
-                       (void)check_mkey(to_iport(ibdev, port_num),
-                                        (struct ib_mad_hdr *)smp, 0,
-                                        smp->mkey,
-                                        (__force __be32)smp->dr_slid,
-                                        smp->return_path, smp->hop_cnt);
-               ret = IB_MAD_RESULT_FAILURE;
-               return ret;
-       }
-
-       switch (smp->method) {
-       case IB_MGMT_METHOD_GET:
-               switch (smp->attr_id) {
-               case IB_SMP_ATTR_NODE_INFO:
-                       ret = subn_get_nodeinfo(smp, ibdev, port);
-                       break;
-               default:
-                       smp->status |= IB_SMP_UNSUP_METH_ATTR;
-                       ret = reply((struct ib_mad_hdr *)smp);
-                       break;
-               }
-               break;
-       }
-
-       return ret;
-}
-
-static int process_perf(struct ib_device *ibdev, u8 port,
-                       const struct ib_mad *in_mad,
-                       struct ib_mad *out_mad)
-{
-       struct ib_pma_mad *pmp = (struct ib_pma_mad *)out_mad;
-       struct ib_class_port_info *cpi = (struct ib_class_port_info *)
-                                               &pmp->data;
-       int ret = IB_MAD_RESULT_FAILURE;
-
-       *out_mad = *in_mad;
-       if (pmp->mad_hdr.class_version != 1) {
-               pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION;
-               ret = reply((struct ib_mad_hdr *)pmp);
-               return ret;
-       }
-
-       switch (pmp->mad_hdr.method) {
-       case IB_MGMT_METHOD_GET:
-               switch (pmp->mad_hdr.attr_id) {
-               case IB_PMA_PORT_COUNTERS:
-                       ret = pma_get_ib_portcounters(pmp, ibdev, port);
-                       break;
-               case IB_PMA_PORT_COUNTERS_EXT:
-                       ret = pma_get_ib_portcounters_ext(pmp, ibdev, port);
-                       break;
-               case IB_PMA_CLASS_PORT_INFO:
-                       cpi->capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH;
-                       ret = reply((struct ib_mad_hdr *)pmp);
-                       break;
-               default:
-                       pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
-                       ret = reply((struct ib_mad_hdr *)pmp);
-                       break;
-               }
-               break;
-
-       case IB_MGMT_METHOD_SET:
-               if (pmp->mad_hdr.attr_id) {
-                       pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
-                       ret = reply((struct ib_mad_hdr *)pmp);
-               }
-               break;
-
-       case IB_MGMT_METHOD_TRAP:
-       case IB_MGMT_METHOD_GET_RESP:
-               /*
-                * The ib_mad module will call us to process responses
-                * before checking for other consumers.
-                * Just tell the caller to process it normally.
-                */
-               ret = IB_MAD_RESULT_SUCCESS;
-               break;
-
-       default:
-               pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD;
-               ret = reply((struct ib_mad_hdr *)pmp);
-               break;
-       }
-
-       return ret;
-}
-
-static int process_perf_opa(struct ib_device *ibdev, u8 port,
-                           const struct opa_mad *in_mad,
-                           struct opa_mad *out_mad, u32 *resp_len)
-{
-       struct opa_pma_mad *pmp = (struct opa_pma_mad *)out_mad;
-       int ret;
-
-       *out_mad = *in_mad;
-
-       if (pmp->mad_hdr.class_version != OPA_SMI_CLASS_VERSION) {
-               pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION;
-               return reply((struct ib_mad_hdr *)pmp);
-       }
-
-       *resp_len = sizeof(pmp->mad_hdr);
-
-       switch (pmp->mad_hdr.method) {
-       case IB_MGMT_METHOD_GET:
-               switch (pmp->mad_hdr.attr_id) {
-               case IB_PMA_CLASS_PORT_INFO:
-                       ret = pma_get_opa_classportinfo(pmp, ibdev, resp_len);
-                       break;
-               case OPA_PM_ATTRIB_ID_PORT_STATUS:
-                       ret = pma_get_opa_portstatus(pmp, ibdev, port,
-                                                    resp_len);
-                       break;
-               case OPA_PM_ATTRIB_ID_DATA_PORT_COUNTERS:
-                       ret = pma_get_opa_datacounters(pmp, ibdev, port,
-                                                      resp_len);
-                       break;
-               case OPA_PM_ATTRIB_ID_ERROR_PORT_COUNTERS:
-                       ret = pma_get_opa_porterrors(pmp, ibdev, port,
-                                                    resp_len);
-                       break;
-               case OPA_PM_ATTRIB_ID_ERROR_INFO:
-                       ret = pma_get_opa_errorinfo(pmp, ibdev, port,
-                                                   resp_len);
-                       break;
-               default:
-                       pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
-                       ret = reply((struct ib_mad_hdr *)pmp);
-                       break;
-               }
-               break;
-
-       case IB_MGMT_METHOD_SET:
-               switch (pmp->mad_hdr.attr_id) {
-               case OPA_PM_ATTRIB_ID_CLEAR_PORT_STATUS:
-                       ret = pma_set_opa_portstatus(pmp, ibdev, port,
-                                                    resp_len);
-                       break;
-               case OPA_PM_ATTRIB_ID_ERROR_INFO:
-                       ret = pma_set_opa_errorinfo(pmp, ibdev, port,
-                                                   resp_len);
-                       break;
-               default:
-                       pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
-                       ret = reply((struct ib_mad_hdr *)pmp);
-                       break;
-               }
-               break;
-
-       case IB_MGMT_METHOD_TRAP:
-       case IB_MGMT_METHOD_GET_RESP:
-               /*
-                * The ib_mad module will call us to process responses
-                * before checking for other consumers.
-                * Just tell the caller to process it normally.
-                */
-               ret = IB_MAD_RESULT_SUCCESS;
-               break;
-
-       default:
-               pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD;
-               ret = reply((struct ib_mad_hdr *)pmp);
-               break;
-       }
-
-       return ret;
-}
-
-static int hfi1_process_opa_mad(struct ib_device *ibdev, int mad_flags,
-                               u8 port, const struct ib_wc *in_wc,
-                               const struct ib_grh *in_grh,
-                               const struct opa_mad *in_mad,
-                               struct opa_mad *out_mad, size_t *out_mad_size,
-                               u16 *out_mad_pkey_index)
-{
-       int ret;
-       int pkey_idx;
-       u32 resp_len = 0;
-       struct hfi1_ibport *ibp = to_iport(ibdev, port);
-
-       pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY);
-       if (pkey_idx < 0) {
-               pr_warn("failed to find limited mgmt pkey, defaulting 0x%x\n",
-                       hfi1_get_pkey(ibp, 1));
-               pkey_idx = 1;
-       }
-       *out_mad_pkey_index = (u16)pkey_idx;
-
-       switch (in_mad->mad_hdr.mgmt_class) {
-       case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
-       case IB_MGMT_CLASS_SUBN_LID_ROUTED:
-               if (is_local_mad(ibp, in_mad, in_wc)) {
-                       ret = opa_local_smp_check(ibp, in_wc);
-                       if (ret)
-                               return IB_MAD_RESULT_FAILURE;
-               }
-               ret = process_subn_opa(ibdev, mad_flags, port, in_mad,
-                                      out_mad, &resp_len);
-               goto bail;
-       case IB_MGMT_CLASS_PERF_MGMT:
-               ret = process_perf_opa(ibdev, port, in_mad, out_mad,
-                                      &resp_len);
-               goto bail;
-
-       default:
-               ret = IB_MAD_RESULT_SUCCESS;
-       }
-
-bail:
-       if (ret & IB_MAD_RESULT_REPLY)
-               *out_mad_size = round_up(resp_len, 8);
-       else if (ret & IB_MAD_RESULT_SUCCESS)
-               *out_mad_size = in_wc->byte_len - sizeof(struct ib_grh);
-
-       return ret;
-}
-
-static int hfi1_process_ib_mad(struct ib_device *ibdev, int mad_flags, u8 port,
-                              const struct ib_wc *in_wc,
-                              const struct ib_grh *in_grh,
-                              const struct ib_mad *in_mad,
-                              struct ib_mad *out_mad)
-{
-       int ret;
-
-       switch (in_mad->mad_hdr.mgmt_class) {
-       case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
-       case IB_MGMT_CLASS_SUBN_LID_ROUTED:
-               ret = process_subn(ibdev, mad_flags, port, in_mad, out_mad);
-               break;
-       case IB_MGMT_CLASS_PERF_MGMT:
-               ret = process_perf(ibdev, port, in_mad, out_mad);
-               break;
-       default:
-               ret = IB_MAD_RESULT_SUCCESS;
-               break;
-       }
-
-       return ret;
-}
-
-/**
- * hfi1_process_mad - process an incoming MAD packet
- * @ibdev: the infiniband device this packet came in on
- * @mad_flags: MAD flags
- * @port: the port number this packet came in on
- * @in_wc: the work completion entry for this packet
- * @in_grh: the global route header for this packet
- * @in_mad: the incoming MAD
- * @out_mad: any outgoing MAD reply
- *
- * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not
- * interested in processing.
- *
- * Note that the verbs framework has already done the MAD sanity checks,
- * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
- * MADs.
- *
- * This is called by the ib_mad module.
- */
-int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
-                    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
-                    const struct ib_mad_hdr *in_mad, size_t in_mad_size,
-                    struct ib_mad_hdr *out_mad, size_t *out_mad_size,
-                    u16 *out_mad_pkey_index)
-{
-       switch (in_mad->base_version) {
-       case OPA_MGMT_BASE_VERSION:
-               if (unlikely(in_mad_size != sizeof(struct opa_mad))) {
-                       dev_err(ibdev->dma_device, "invalid in_mad_size\n");
-                       return IB_MAD_RESULT_FAILURE;
-               }
-               return hfi1_process_opa_mad(ibdev, mad_flags, port,
-                                           in_wc, in_grh,
-                                           (struct opa_mad *)in_mad,
-                                           (struct opa_mad *)out_mad,
-                                           out_mad_size,
-                                           out_mad_pkey_index);
-       case IB_MGMT_BASE_VERSION:
-               return hfi1_process_ib_mad(ibdev, mad_flags, port,
-                                         in_wc, in_grh,
-                                         (const struct ib_mad *)in_mad,
-                                         (struct ib_mad *)out_mad);
-       default:
-               break;
-       }
-
-       return IB_MAD_RESULT_FAILURE;
-}
diff --git a/drivers/staging/rdma/hfi1/mad.h b/drivers/staging/rdma/hfi1/mad.h
deleted file mode 100644 (file)
index 55ee086..0000000
+++ /dev/null
@@ -1,437 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#ifndef _HFI1_MAD_H
-#define _HFI1_MAD_H
-
-#include <rdma/ib_pma.h>
-#define USE_PI_LED_ENABLE      1 /*
-                                  * use led enabled bit in struct
-                                  * opa_port_states, if available
-                                  */
-#include <rdma/opa_smi.h>
-#include <rdma/opa_port_info.h>
-#ifndef PI_LED_ENABLE_SUP
-#define PI_LED_ENABLE_SUP 0
-#endif
-#include "opa_compat.h"
-
-/*
- * OPA Traps
- */
-#define OPA_TRAP_GID_NOW_IN_SERVICE             cpu_to_be16(64)
-#define OPA_TRAP_GID_OUT_OF_SERVICE             cpu_to_be16(65)
-#define OPA_TRAP_ADD_MULTICAST_GROUP            cpu_to_be16(66)
-#define OPA_TRAL_DEL_MULTICAST_GROUP            cpu_to_be16(67)
-#define OPA_TRAP_UNPATH                         cpu_to_be16(68)
-#define OPA_TRAP_REPATH                         cpu_to_be16(69)
-#define OPA_TRAP_PORT_CHANGE_STATE              cpu_to_be16(128)
-#define OPA_TRAP_LINK_INTEGRITY                 cpu_to_be16(129)
-#define OPA_TRAP_EXCESSIVE_BUFFER_OVERRUN       cpu_to_be16(130)
-#define OPA_TRAP_FLOW_WATCHDOG                  cpu_to_be16(131)
-#define OPA_TRAP_CHANGE_CAPABILITY              cpu_to_be16(144)
-#define OPA_TRAP_CHANGE_SYSGUID                 cpu_to_be16(145)
-#define OPA_TRAP_BAD_M_KEY                      cpu_to_be16(256)
-#define OPA_TRAP_BAD_P_KEY                      cpu_to_be16(257)
-#define OPA_TRAP_BAD_Q_KEY                      cpu_to_be16(258)
-#define OPA_TRAP_SWITCH_BAD_PKEY                cpu_to_be16(259)
-#define OPA_SMA_TRAP_DATA_LINK_WIDTH            cpu_to_be16(2048)
-
-/*
- * Generic trap/notice other local changes flags (trap 144).
- */
-#define        OPA_NOTICE_TRAP_LWDE_CHG        0x08 /* Link Width Downgrade Enable
-                                             * changed
-                                             */
-#define OPA_NOTICE_TRAP_LSE_CHG         0x04 /* Link Speed Enable changed */
-#define OPA_NOTICE_TRAP_LWE_CHG         0x02 /* Link Width Enable changed */
-#define OPA_NOTICE_TRAP_NODE_DESC_CHG   0x01
-
-struct opa_mad_notice_attr {
-       u8 generic_type;
-       u8 prod_type_msb;
-       __be16 prod_type_lsb;
-       __be16 trap_num;
-       __be16 toggle_count;
-       __be32 issuer_lid;
-       __be32 reserved1;
-       union ib_gid issuer_gid;
-
-       union {
-               struct {
-                       u8      details[64];
-               } raw_data;
-
-               struct {
-                       union ib_gid    gid;
-               } __packed ntc_64_65_66_67;
-
-               struct {
-                       __be32  lid;
-               } __packed ntc_128;
-
-               struct {
-                       __be32  lid;            /* where violation happened */
-                       u8      port_num;       /* where violation happened */
-               } __packed ntc_129_130_131;
-
-               struct {
-                       __be32  lid;            /* LID where change occurred */
-                       __be32  new_cap_mask;   /* new capability mask */
-                       __be16  reserved2;
-                       __be16  cap_mask;
-                       __be16  change_flags;   /* low 4 bits only */
-               } __packed ntc_144;
-
-               struct {
-                       __be64  new_sys_guid;
-                       __be32  lid;            /* lid where sys guid changed */
-               } __packed ntc_145;
-
-               struct {
-                       __be32  lid;
-                       __be32  dr_slid;
-                       u8      method;
-                       u8      dr_trunc_hop;
-                       __be16  attr_id;
-                       __be32  attr_mod;
-                       __be64  mkey;
-                       u8      dr_rtn_path[30];
-               } __packed ntc_256;
-
-               struct {
-                       __be32          lid1;
-                       __be32          lid2;
-                       __be32          key;
-                       u8              sl;     /* SL: high 5 bits */
-                       u8              reserved3[3];
-                       union ib_gid    gid1;
-                       union ib_gid    gid2;
-                       __be32          qp1;    /* high 8 bits reserved */
-                       __be32          qp2;    /* high 8 bits reserved */
-               } __packed ntc_257_258;
-
-               struct {
-                       __be16          flags;  /* low 8 bits reserved */
-                       __be16          pkey;
-                       __be32          lid1;
-                       __be32          lid2;
-                       u8              sl;     /* SL: high 5 bits */
-                       u8              reserved4[3];
-                       union ib_gid    gid1;
-                       union ib_gid    gid2;
-                       __be32          qp1;    /* high 8 bits reserved */
-                       __be32          qp2;    /* high 8 bits reserved */
-               } __packed ntc_259;
-
-               struct {
-                       __be32  lid;
-               } __packed ntc_2048;
-
-       };
-       u8      class_data[0];
-};
-
-#define IB_VLARB_LOWPRI_0_31    1
-#define IB_VLARB_LOWPRI_32_63   2
-#define IB_VLARB_HIGHPRI_0_31   3
-#define IB_VLARB_HIGHPRI_32_63  4
-
-#define OPA_MAX_PREEMPT_CAP         32
-#define OPA_VLARB_LOW_ELEMENTS       0
-#define OPA_VLARB_HIGH_ELEMENTS      1
-#define OPA_VLARB_PREEMPT_ELEMENTS   2
-#define OPA_VLARB_PREEMPT_MATRIX     3
-
-#define IB_PMA_PORT_COUNTERS_CONG       cpu_to_be16(0xFF00)
-
-struct ib_pma_portcounters_cong {
-       u8 reserved;
-       u8 reserved1;
-       __be16 port_check_rate;
-       __be16 symbol_error_counter;
-       u8 link_error_recovery_counter;
-       u8 link_downed_counter;
-       __be16 port_rcv_errors;
-       __be16 port_rcv_remphys_errors;
-       __be16 port_rcv_switch_relay_errors;
-       __be16 port_xmit_discards;
-       u8 port_xmit_constraint_errors;
-       u8 port_rcv_constraint_errors;
-       u8 reserved2;
-       u8 link_overrun_errors; /* LocalLink: 7:4, BufferOverrun: 3:0 */
-       __be16 reserved3;
-       __be16 vl15_dropped;
-       __be64 port_xmit_data;
-       __be64 port_rcv_data;
-       __be64 port_xmit_packets;
-       __be64 port_rcv_packets;
-       __be64 port_xmit_wait;
-       __be64 port_adr_events;
-} __packed;
-
-#define IB_SMP_UNSUP_VERSION    cpu_to_be16(0x0004)
-#define IB_SMP_UNSUP_METHOD     cpu_to_be16(0x0008)
-#define IB_SMP_UNSUP_METH_ATTR  cpu_to_be16(0x000C)
-#define IB_SMP_INVALID_FIELD    cpu_to_be16(0x001C)
-
-#define OPA_MAX_PREEMPT_CAP         32
-#define OPA_VLARB_LOW_ELEMENTS       0
-#define OPA_VLARB_HIGH_ELEMENTS      1
-#define OPA_VLARB_PREEMPT_ELEMENTS   2
-#define OPA_VLARB_PREEMPT_MATRIX     3
-
-#define HFI1_XMIT_RATE_UNSUPPORTED               0x0
-#define HFI1_XMIT_RATE_PICO                      0x7
-/* number of 4nsec cycles equaling 2secs */
-#define HFI1_CONG_TIMER_PSINTERVAL               0x1DCD64EC
-
-#define IB_CC_SVCTYPE_RC 0x0
-#define IB_CC_SVCTYPE_UC 0x1
-#define IB_CC_SVCTYPE_RD 0x2
-#define IB_CC_SVCTYPE_UD 0x3
-
-/*
- * There should be an equivalent IB #define for the following, but
- * I cannot find it.
- */
-#define OPA_CC_LOG_TYPE_HFI    2
-
-struct opa_hfi1_cong_log_event_internal {
-       u32 lqpn;
-       u32 rqpn;
-       u8 sl;
-       u8 svc_type;
-       u32 rlid;
-       s64 timestamp; /* wider than 32 bits to detect 32 bit rollover */
-};
-
-struct opa_hfi1_cong_log_event {
-       u8 local_qp_cn_entry[3];
-       u8 remote_qp_number_cn_entry[3];
-       u8 sl_svc_type_cn_entry; /* 5 bits SL, 3 bits svc type */
-       u8 reserved;
-       __be32 remote_lid_cn_entry;
-       __be32 timestamp_cn_entry;
-} __packed;
-
-#define OPA_CONG_LOG_ELEMS     96
-
-struct opa_hfi1_cong_log {
-       u8 log_type;
-       u8 congestion_flags;
-       __be16 threshold_event_counter;
-       __be32 current_time_stamp;
-       u8 threshold_cong_event_map[OPA_MAX_SLS / 8];
-       struct opa_hfi1_cong_log_event events[OPA_CONG_LOG_ELEMS];
-} __packed;
-
-#define IB_CC_TABLE_CAP_DEFAULT 31
-
-/* Port control flags */
-#define IB_CC_CCS_PC_SL_BASED 0x01
-
-struct opa_congestion_setting_entry {
-       u8 ccti_increase;
-       u8 reserved;
-       __be16 ccti_timer;
-       u8 trigger_threshold;
-       u8 ccti_min; /* min CCTI for cc table */
-} __packed;
-
-struct opa_congestion_setting_entry_shadow {
-       u8 ccti_increase;
-       u8 reserved;
-       u16 ccti_timer;
-       u8 trigger_threshold;
-       u8 ccti_min; /* min CCTI for cc table */
-} __packed;
-
-struct opa_congestion_setting_attr {
-       __be32 control_map;
-       __be16 port_control;
-       struct opa_congestion_setting_entry entries[OPA_MAX_SLS];
-} __packed;
-
-struct opa_congestion_setting_attr_shadow {
-       u32 control_map;
-       u16 port_control;
-       struct opa_congestion_setting_entry_shadow entries[OPA_MAX_SLS];
-} __packed;
-
-#define IB_CC_TABLE_ENTRY_INCREASE_DEFAULT 1
-#define IB_CC_TABLE_ENTRY_TIMER_DEFAULT 1
-
-/* 64 Congestion Control table entries in a single MAD */
-#define IB_CCT_ENTRIES 64
-#define IB_CCT_MIN_ENTRIES (IB_CCT_ENTRIES * 2)
-
-struct ib_cc_table_entry {
-       __be16 entry; /* shift:2, multiplier:14 */
-};
-
-struct ib_cc_table_entry_shadow {
-       u16 entry; /* shift:2, multiplier:14 */
-};
-
-struct ib_cc_table_attr {
-       __be16 ccti_limit; /* max CCTI for cc table */
-       struct ib_cc_table_entry ccti_entries[IB_CCT_ENTRIES];
-} __packed;
-
-struct ib_cc_table_attr_shadow {
-       u16 ccti_limit; /* max CCTI for cc table */
-       struct ib_cc_table_entry_shadow ccti_entries[IB_CCT_ENTRIES];
-} __packed;
-
-#define CC_TABLE_SHADOW_MAX \
-       (IB_CC_TABLE_CAP_DEFAULT * IB_CCT_ENTRIES)
-
-struct cc_table_shadow {
-       u16 ccti_limit; /* max CCTI for cc table */
-       struct ib_cc_table_entry_shadow entries[CC_TABLE_SHADOW_MAX];
-} __packed;
-
-/*
- * struct cc_state combines the (active) per-port congestion control
- * table, and the (active) per-SL congestion settings. cc_state data
- * may need to be read in code paths that we want to be fast, so it
- * is an RCU protected structure.
- */
-struct cc_state {
-       struct rcu_head rcu;
-       struct cc_table_shadow cct;
-       struct opa_congestion_setting_attr_shadow cong_setting;
-};
-
-/*
- * OPA BufferControl MAD
- */
-
-/* attribute modifier macros */
-#define OPA_AM_NPORT_SHIFT     24
-#define OPA_AM_NPORT_MASK      0xff
-#define OPA_AM_NPORT_SMASK     (OPA_AM_NPORT_MASK << OPA_AM_NPORT_SHIFT)
-#define OPA_AM_NPORT(am)       (((am) >> OPA_AM_NPORT_SHIFT) & \
-                                       OPA_AM_NPORT_MASK)
-
-#define OPA_AM_NBLK_SHIFT      24
-#define OPA_AM_NBLK_MASK       0xff
-#define OPA_AM_NBLK_SMASK      (OPA_AM_NBLK_MASK << OPA_AM_NBLK_SHIFT)
-#define OPA_AM_NBLK(am)                (((am) >> OPA_AM_NBLK_SHIFT) & \
-                                       OPA_AM_NBLK_MASK)
-
-#define OPA_AM_START_BLK_SHIFT 0
-#define OPA_AM_START_BLK_MASK  0xff
-#define OPA_AM_START_BLK_SMASK (OPA_AM_START_BLK_MASK << \
-                                       OPA_AM_START_BLK_SHIFT)
-#define OPA_AM_START_BLK(am)   (((am) >> OPA_AM_START_BLK_SHIFT) & \
-                                       OPA_AM_START_BLK_MASK)
-
-#define OPA_AM_PORTNUM_SHIFT   0
-#define OPA_AM_PORTNUM_MASK    0xff
-#define OPA_AM_PORTNUM_SMASK   (OPA_AM_PORTNUM_MASK << OPA_AM_PORTNUM_SHIFT)
-#define OPA_AM_PORTNUM(am)     (((am) >> OPA_AM_PORTNUM_SHIFT) & \
-                                       OPA_AM_PORTNUM_MASK)
-
-#define OPA_AM_ASYNC_SHIFT     12
-#define OPA_AM_ASYNC_MASK      0x1
-#define OPA_AM_ASYNC_SMASK     (OPA_AM_ASYNC_MASK << OPA_AM_ASYNC_SHIFT)
-#define OPA_AM_ASYNC(am)       (((am) >> OPA_AM_ASYNC_SHIFT) & \
-                                       OPA_AM_ASYNC_MASK)
-
-#define OPA_AM_START_SM_CFG_SHIFT      9
-#define OPA_AM_START_SM_CFG_MASK       0x1
-#define OPA_AM_START_SM_CFG_SMASK      (OPA_AM_START_SM_CFG_MASK << \
-                                               OPA_AM_START_SM_CFG_SHIFT)
-#define OPA_AM_START_SM_CFG(am)                (((am) >> OPA_AM_START_SM_CFG_SHIFT) \
-                                               & OPA_AM_START_SM_CFG_MASK)
-
-#define OPA_AM_CI_ADDR_SHIFT   19
-#define OPA_AM_CI_ADDR_MASK    0xfff
-#define OPA_AM_CI_ADDR_SMASK   (OPA_AM_CI_ADDR_MASK << OPA_CI_ADDR_SHIFT)
-#define OPA_AM_CI_ADDR(am)     (((am) >> OPA_AM_CI_ADDR_SHIFT) & \
-                                       OPA_AM_CI_ADDR_MASK)
-
-#define OPA_AM_CI_LEN_SHIFT    13
-#define OPA_AM_CI_LEN_MASK     0x3f
-#define OPA_AM_CI_LEN_SMASK    (OPA_AM_CI_LEN_MASK << OPA_CI_LEN_SHIFT)
-#define OPA_AM_CI_LEN(am)      (((am) >> OPA_AM_CI_LEN_SHIFT) & \
-                                       OPA_AM_CI_LEN_MASK)
-
-/* error info macros */
-#define OPA_EI_STATUS_SMASK    0x80
-#define OPA_EI_CODE_SMASK      0x0f
-
-struct vl_limit {
-       __be16 dedicated;
-       __be16 shared;
-};
-
-struct buffer_control {
-       __be16 reserved;
-       __be16 overall_shared_limit;
-       struct vl_limit vl[OPA_MAX_VLS];
-};
-
-struct sc2vlnt {
-       u8 vlnt[32]; /* 5 bit VL, 3 bits reserved */
-};
-
-/*
- * The PortSamplesControl.CounterMasks field is an array of 3 bit fields
- * which specify the N'th counter's capabilities. See ch. 16.1.3.2.
- * We support 5 counters which only count the mandatory quantities.
- */
-#define COUNTER_MASK(q, n) (q << ((9 - n) * 3))
-#define COUNTER_MASK0_9 \
-       cpu_to_be32(COUNTER_MASK(1, 0) | \
-                   COUNTER_MASK(1, 1) | \
-                   COUNTER_MASK(1, 2) | \
-                   COUNTER_MASK(1, 3) | \
-                   COUNTER_MASK(1, 4))
-
-#endif                         /* _HFI1_MAD_H */
diff --git a/drivers/staging/rdma/hfi1/mmu_rb.c b/drivers/staging/rdma/hfi1/mmu_rb.c
deleted file mode 100644 (file)
index 2b0e91d..0000000
+++ /dev/null
@@ -1,325 +0,0 @@
-/*
- * Copyright(c) 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#include <linux/list.h>
-#include <linux/mmu_notifier.h>
-#include <linux/interval_tree_generic.h>
-
-#include "mmu_rb.h"
-#include "trace.h"
-
-struct mmu_rb_handler {
-       struct list_head list;
-       struct mmu_notifier mn;
-       struct rb_root *root;
-       spinlock_t lock;        /* protect the RB tree */
-       struct mmu_rb_ops *ops;
-};
-
-static LIST_HEAD(mmu_rb_handlers);
-static DEFINE_SPINLOCK(mmu_rb_lock); /* protect mmu_rb_handlers list */
-
-static unsigned long mmu_node_start(struct mmu_rb_node *);
-static unsigned long mmu_node_last(struct mmu_rb_node *);
-static struct mmu_rb_handler *find_mmu_handler(struct rb_root *);
-static inline void mmu_notifier_page(struct mmu_notifier *, struct mm_struct *,
-                                    unsigned long);
-static inline void mmu_notifier_range_start(struct mmu_notifier *,
-                                           struct mm_struct *,
-                                           unsigned long, unsigned long);
-static void mmu_notifier_mem_invalidate(struct mmu_notifier *,
-                                       struct mm_struct *,
-                                       unsigned long, unsigned long);
-static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *,
-                                          unsigned long, unsigned long);
-
-static struct mmu_notifier_ops mn_opts = {
-       .invalidate_page = mmu_notifier_page,
-       .invalidate_range_start = mmu_notifier_range_start,
-};
-
-INTERVAL_TREE_DEFINE(struct mmu_rb_node, node, unsigned long, __last,
-                    mmu_node_start, mmu_node_last, static, __mmu_int_rb);
-
-static unsigned long mmu_node_start(struct mmu_rb_node *node)
-{
-       return node->addr & PAGE_MASK;
-}
-
-static unsigned long mmu_node_last(struct mmu_rb_node *node)
-{
-       return PAGE_ALIGN(node->addr + node->len) - 1;
-}
-
-int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops)
-{
-       struct mmu_rb_handler *handlr;
-       unsigned long flags;
-
-       if (!ops->invalidate)
-               return -EINVAL;
-
-       handlr = kmalloc(sizeof(*handlr), GFP_KERNEL);
-       if (!handlr)
-               return -ENOMEM;
-
-       handlr->root = root;
-       handlr->ops = ops;
-       INIT_HLIST_NODE(&handlr->mn.hlist);
-       spin_lock_init(&handlr->lock);
-       handlr->mn.ops = &mn_opts;
-       spin_lock_irqsave(&mmu_rb_lock, flags);
-       list_add_tail(&handlr->list, &mmu_rb_handlers);
-       spin_unlock_irqrestore(&mmu_rb_lock, flags);
-
-       return mmu_notifier_register(&handlr->mn, current->mm);
-}
-
-void hfi1_mmu_rb_unregister(struct rb_root *root)
-{
-       struct mmu_rb_handler *handler = find_mmu_handler(root);
-       unsigned long flags;
-
-       if (!handler)
-               return;
-
-       /* Unregister first so we don't get any more notifications. */
-       if (current->mm)
-               mmu_notifier_unregister(&handler->mn, current->mm);
-
-       spin_lock_irqsave(&mmu_rb_lock, flags);
-       list_del(&handler->list);
-       spin_unlock_irqrestore(&mmu_rb_lock, flags);
-
-       spin_lock_irqsave(&handler->lock, flags);
-       if (!RB_EMPTY_ROOT(root)) {
-               struct rb_node *node;
-               struct mmu_rb_node *rbnode;
-
-               while ((node = rb_first(root))) {
-                       rbnode = rb_entry(node, struct mmu_rb_node, node);
-                       rb_erase(node, root);
-                       if (handler->ops->remove)
-                               handler->ops->remove(root, rbnode, NULL);
-               }
-       }
-       spin_unlock_irqrestore(&handler->lock, flags);
-
-       kfree(handler);
-}
-
-int hfi1_mmu_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode)
-{
-       struct mmu_rb_handler *handler = find_mmu_handler(root);
-       struct mmu_rb_node *node;
-       unsigned long flags;
-       int ret = 0;
-
-       if (!handler)
-               return -EINVAL;
-
-       spin_lock_irqsave(&handler->lock, flags);
-       hfi1_cdbg(MMU, "Inserting node addr 0x%llx, len %u", mnode->addr,
-                 mnode->len);
-       node = __mmu_rb_search(handler, mnode->addr, mnode->len);
-       if (node) {
-               ret = -EINVAL;
-               goto unlock;
-       }
-       __mmu_int_rb_insert(mnode, root);
-
-       if (handler->ops->insert) {
-               ret = handler->ops->insert(root, mnode);
-               if (ret)
-                       __mmu_int_rb_remove(mnode, root);
-       }
-unlock:
-       spin_unlock_irqrestore(&handler->lock, flags);
-       return ret;
-}
-
-/* Caller must hold handler lock */
-static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler,
-                                          unsigned long addr,
-                                          unsigned long len)
-{
-       struct mmu_rb_node *node = NULL;
-
-       hfi1_cdbg(MMU, "Searching for addr 0x%llx, len %u", addr, len);
-       if (!handler->ops->filter) {
-               node = __mmu_int_rb_iter_first(handler->root, addr,
-                                              (addr + len) - 1);
-       } else {
-               for (node = __mmu_int_rb_iter_first(handler->root, addr,
-                                                   (addr + len) - 1);
-                    node;
-                    node = __mmu_int_rb_iter_next(node, addr,
-                                                  (addr + len) - 1)) {
-                       if (handler->ops->filter(node, addr, len))
-                               return node;
-               }
-       }
-       return node;
-}
-
-/* Caller must *not* hold handler lock. */
-static void __mmu_rb_remove(struct mmu_rb_handler *handler,
-                           struct mmu_rb_node *node, struct mm_struct *mm)
-{
-       unsigned long flags;
-
-       /* Validity of handler and node pointers has been checked by caller. */
-       hfi1_cdbg(MMU, "Removing node addr 0x%llx, len %u", node->addr,
-                 node->len);
-       spin_lock_irqsave(&handler->lock, flags);
-       __mmu_int_rb_remove(node, handler->root);
-       spin_unlock_irqrestore(&handler->lock, flags);
-
-       if (handler->ops->remove)
-               handler->ops->remove(handler->root, node, mm);
-}
-
-struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *root, unsigned long addr,
-                                      unsigned long len)
-{
-       struct mmu_rb_handler *handler = find_mmu_handler(root);
-       struct mmu_rb_node *node;
-       unsigned long flags;
-
-       if (!handler)
-               return ERR_PTR(-EINVAL);
-
-       spin_lock_irqsave(&handler->lock, flags);
-       node = __mmu_rb_search(handler, addr, len);
-       spin_unlock_irqrestore(&handler->lock, flags);
-
-       return node;
-}
-
-struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *root,
-                                       unsigned long addr, unsigned long len)
-{
-       struct mmu_rb_handler *handler = find_mmu_handler(root);
-       struct mmu_rb_node *node;
-       unsigned long flags;
-
-       if (!handler)
-               return ERR_PTR(-EINVAL);
-
-       spin_lock_irqsave(&handler->lock, flags);
-       node = __mmu_rb_search(handler, addr, len);
-       if (node)
-               __mmu_int_rb_remove(node, handler->root);
-       spin_unlock_irqrestore(&handler->lock, flags);
-
-       return node;
-}
-
-void hfi1_mmu_rb_remove(struct rb_root *root, struct mmu_rb_node *node)
-{
-       struct mmu_rb_handler *handler = find_mmu_handler(root);
-
-       if (!handler || !node)
-               return;
-
-       __mmu_rb_remove(handler, node, NULL);
-}
-
-static struct mmu_rb_handler *find_mmu_handler(struct rb_root *root)
-{
-       struct mmu_rb_handler *handler;
-       unsigned long flags;
-
-       spin_lock_irqsave(&mmu_rb_lock, flags);
-       list_for_each_entry(handler, &mmu_rb_handlers, list) {
-               if (handler->root == root)
-                       goto unlock;
-       }
-       handler = NULL;
-unlock:
-       spin_unlock_irqrestore(&mmu_rb_lock, flags);
-       return handler;
-}
-
-static inline void mmu_notifier_page(struct mmu_notifier *mn,
-                                    struct mm_struct *mm, unsigned long addr)
-{
-       mmu_notifier_mem_invalidate(mn, mm, addr, addr + PAGE_SIZE);
-}
-
-static inline void mmu_notifier_range_start(struct mmu_notifier *mn,
-                                           struct mm_struct *mm,
-                                           unsigned long start,
-                                           unsigned long end)
-{
-       mmu_notifier_mem_invalidate(mn, mm, start, end);
-}
-
-static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn,
-                                       struct mm_struct *mm,
-                                       unsigned long start, unsigned long end)
-{
-       struct mmu_rb_handler *handler =
-               container_of(mn, struct mmu_rb_handler, mn);
-       struct rb_root *root = handler->root;
-       struct mmu_rb_node *node, *ptr = NULL;
-       unsigned long flags;
-
-       spin_lock_irqsave(&handler->lock, flags);
-       for (node = __mmu_int_rb_iter_first(root, start, end - 1);
-            node; node = ptr) {
-               /* Guard against node removal. */
-               ptr = __mmu_int_rb_iter_next(node, start, end - 1);
-               hfi1_cdbg(MMU, "Invalidating node addr 0x%llx, len %u",
-                         node->addr, node->len);
-               if (handler->ops->invalidate(root, node)) {
-                       __mmu_int_rb_remove(node, root);
-                       if (handler->ops->remove)
-                               handler->ops->remove(root, node, mm);
-               }
-       }
-       spin_unlock_irqrestore(&handler->lock, flags);
-}
diff --git a/drivers/staging/rdma/hfi1/mmu_rb.h b/drivers/staging/rdma/hfi1/mmu_rb.h
deleted file mode 100644 (file)
index 7a57b9c..0000000
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright(c) 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#ifndef _HFI1_MMU_RB_H
-#define _HFI1_MMU_RB_H
-
-#include "hfi.h"
-
-struct mmu_rb_node {
-       unsigned long addr;
-       unsigned long len;
-       unsigned long __last;
-       struct rb_node node;
-};
-
-struct mmu_rb_ops {
-       bool (*filter)(struct mmu_rb_node *, unsigned long, unsigned long);
-       int (*insert)(struct rb_root *, struct mmu_rb_node *);
-       void (*remove)(struct rb_root *, struct mmu_rb_node *,
-                      struct mm_struct *);
-       int (*invalidate)(struct rb_root *, struct mmu_rb_node *);
-};
-
-int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops);
-void hfi1_mmu_rb_unregister(struct rb_root *);
-int hfi1_mmu_rb_insert(struct rb_root *, struct mmu_rb_node *);
-void hfi1_mmu_rb_remove(struct rb_root *, struct mmu_rb_node *);
-struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *, unsigned long,
-                                      unsigned long);
-struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *, unsigned long,
-                                       unsigned long);
-
-#endif /* _HFI1_MMU_RB_H */
diff --git a/drivers/staging/rdma/hfi1/opa_compat.h b/drivers/staging/rdma/hfi1/opa_compat.h
deleted file mode 100644 (file)
index 6ef3c1c..0000000
+++ /dev/null
@@ -1,111 +0,0 @@
-#ifndef _LINUX_H
-#define _LINUX_H
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-/*
- * This header file is for OPA-specific definitions which are
- * required by the HFI driver, and which aren't yet in the Linux
- * IB core. We'll collect these all here, then merge them into
- * the kernel when that's convenient.
- */
-
-/* OPA SMA attribute IDs */
-#define OPA_ATTRIB_ID_CONGESTION_INFO          cpu_to_be16(0x008b)
-#define OPA_ATTRIB_ID_HFI_CONGESTION_LOG       cpu_to_be16(0x008f)
-#define OPA_ATTRIB_ID_HFI_CONGESTION_SETTING   cpu_to_be16(0x0090)
-#define OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE cpu_to_be16(0x0091)
-
-/* OPA PMA attribute IDs */
-#define OPA_PM_ATTRIB_ID_PORT_STATUS           cpu_to_be16(0x0040)
-#define OPA_PM_ATTRIB_ID_CLEAR_PORT_STATUS     cpu_to_be16(0x0041)
-#define OPA_PM_ATTRIB_ID_DATA_PORT_COUNTERS    cpu_to_be16(0x0042)
-#define OPA_PM_ATTRIB_ID_ERROR_PORT_COUNTERS   cpu_to_be16(0x0043)
-#define OPA_PM_ATTRIB_ID_ERROR_INFO            cpu_to_be16(0x0044)
-
-/* OPA status codes */
-#define OPA_PM_STATUS_REQUEST_TOO_LARGE                cpu_to_be16(0x100)
-
-static inline u8 port_states_to_logical_state(struct opa_port_states *ps)
-{
-       return ps->portphysstate_portstate & OPA_PI_MASK_PORT_STATE;
-}
-
-static inline u8 port_states_to_phys_state(struct opa_port_states *ps)
-{
-       return ((ps->portphysstate_portstate &
-                 OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4) & 0xf;
-}
-
-/*
- * OPA port physical states
- * IB Volume 1, Table 146 PortInfo/IB Volume 2 Section 5.4.2(1) PortPhysState
- * values.
- *
- * When writing, only values 0-3 are valid, other values are ignored.
- * When reading, 0 is reserved.
- *
- * Returned by the ibphys_portstate() routine.
- */
-enum opa_port_phys_state {
-       IB_PORTPHYSSTATE_NOP = 0,
-       /* 1 is reserved */
-       IB_PORTPHYSSTATE_POLLING = 2,
-       IB_PORTPHYSSTATE_DISABLED = 3,
-       IB_PORTPHYSSTATE_TRAINING = 4,
-       IB_PORTPHYSSTATE_LINKUP = 5,
-       IB_PORTPHYSSTATE_LINK_ERROR_RECOVERY = 6,
-       IB_PORTPHYSSTATE_PHY_TEST = 7,
-       /* 8 is reserved */
-       OPA_PORTPHYSSTATE_OFFLINE = 9,
-       OPA_PORTPHYSSTATE_GANGED = 10,
-       OPA_PORTPHYSSTATE_TEST = 11,
-       OPA_PORTPHYSSTATE_MAX = 11,
-       /* values 12-15 are reserved/ignored */
-};
-
-#endif /* _LINUX_H */
diff --git a/drivers/staging/rdma/hfi1/pcie.c b/drivers/staging/rdma/hfi1/pcie.c
deleted file mode 100644 (file)
index 0bac21e..0000000
+++ /dev/null
@@ -1,1338 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/pci.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-#include <linux/vmalloc.h>
-#include <linux/aer.h>
-#include <linux/module.h>
-
-#include "hfi.h"
-#include "chip_registers.h"
-#include "aspm.h"
-
-/* link speed vector for Gen3 speed - not in Linux headers */
-#define GEN1_SPEED_VECTOR 0x1
-#define GEN2_SPEED_VECTOR 0x2
-#define GEN3_SPEED_VECTOR 0x3
-
-/*
- * This file contains PCIe utility routines.
- */
-
-/*
- * Code to adjust PCIe capabilities.
- */
-static void tune_pcie_caps(struct hfi1_devdata *);
-
-/*
- * Do all the common PCIe setup and initialization.
- * devdata is not yet allocated, and is not allocated until after this
- * routine returns success.  Therefore dd_dev_err() can't be used for error
- * printing.
- */
-int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent)
-{
-       int ret;
-
-       ret = pci_enable_device(pdev);
-       if (ret) {
-               /*
-                * This can happen (in theory) iff:
-                * We did a chip reset, and then failed to reprogram the
-                * BAR, or the chip reset due to an internal error.  We then
-                * unloaded the driver and reloaded it.
-                *
-                * Both reset cases set the BAR back to initial state.  For
-                * the latter case, the AER sticky error bit at offset 0x718
-                * should be set, but the Linux kernel doesn't yet know
-                * about that, it appears.  If the original BAR was retained
-                * in the kernel data structures, this may be OK.
-                */
-               hfi1_early_err(&pdev->dev, "pci enable failed: error %d\n",
-                              -ret);
-               goto done;
-       }
-
-       ret = pci_request_regions(pdev, DRIVER_NAME);
-       if (ret) {
-               hfi1_early_err(&pdev->dev,
-                              "pci_request_regions fails: err %d\n", -ret);
-               goto bail;
-       }
-
-       ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
-       if (ret) {
-               /*
-                * If the 64 bit setup fails, try 32 bit.  Some systems
-                * do not setup 64 bit maps on systems with 2GB or less
-                * memory installed.
-                */
-               ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
-               if (ret) {
-                       hfi1_early_err(&pdev->dev,
-                                      "Unable to set DMA mask: %d\n", ret);
-                       goto bail;
-               }
-               ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
-       } else {
-               ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
-       }
-       if (ret) {
-               hfi1_early_err(&pdev->dev,
-                              "Unable to set DMA consistent mask: %d\n", ret);
-               goto bail;
-       }
-
-       pci_set_master(pdev);
-       (void)pci_enable_pcie_error_reporting(pdev);
-       goto done;
-
-bail:
-       hfi1_pcie_cleanup(pdev);
-done:
-       return ret;
-}
-
-/*
- * Clean what was done in hfi1_pcie_init()
- */
-void hfi1_pcie_cleanup(struct pci_dev *pdev)
-{
-       pci_disable_device(pdev);
-       /*
-        * Release regions should be called after the disable. OK to
-        * call if request regions has not been called or failed.
-        */
-       pci_release_regions(pdev);
-}
-
-/*
- * Do remaining PCIe setup, once dd is allocated, and save away
- * fields required to re-initialize after a chip reset, or for
- * various other purposes
- */
-int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev,
-                    const struct pci_device_id *ent)
-{
-       unsigned long len;
-       resource_size_t addr;
-
-       dd->pcidev = pdev;
-       pci_set_drvdata(pdev, dd);
-
-       addr = pci_resource_start(pdev, 0);
-       len = pci_resource_len(pdev, 0);
-
-       /*
-        * The TXE PIO buffers are at the tail end of the chip space.
-        * Cut them off and map them separately.
-        */
-
-       /* sanity check vs expectations */
-       if (len != TXE_PIO_SEND + TXE_PIO_SIZE) {
-               dd_dev_err(dd, "chip PIO range does not match\n");
-               return -EINVAL;
-       }
-
-       dd->kregbase = ioremap_nocache(addr, TXE_PIO_SEND);
-       if (!dd->kregbase)
-               return -ENOMEM;
-
-       dd->piobase = ioremap_wc(addr + TXE_PIO_SEND, TXE_PIO_SIZE);
-       if (!dd->piobase) {
-               iounmap(dd->kregbase);
-               return -ENOMEM;
-       }
-
-       dd->flags |= HFI1_PRESENT;      /* now register routines work */
-
-       dd->kregend = dd->kregbase + TXE_PIO_SEND;
-       dd->physaddr = addr;        /* used for io_remap, etc. */
-
-       /*
-        * Re-map the chip's RcvArray as write-combining to allow us
-        * to write an entire cacheline worth of entries in one shot.
-        * If this re-map fails, just continue - the RcvArray programming
-        * function will handle both cases.
-        */
-       dd->chip_rcv_array_count = read_csr(dd, RCV_ARRAY_CNT);
-       dd->rcvarray_wc = ioremap_wc(addr + RCV_ARRAY,
-                                    dd->chip_rcv_array_count * 8);
-       dd_dev_info(dd, "WC Remapped RcvArray: %p\n", dd->rcvarray_wc);
-       /*
-        * Save BARs and command to rewrite after device reset.
-        */
-       dd->pcibar0 = addr;
-       dd->pcibar1 = addr >> 32;
-       pci_read_config_dword(dd->pcidev, PCI_ROM_ADDRESS, &dd->pci_rom);
-       pci_read_config_word(dd->pcidev, PCI_COMMAND, &dd->pci_command);
-       pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &dd->pcie_devctl);
-       pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL, &dd->pcie_lnkctl);
-       pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL2,
-                                 &dd->pcie_devctl2);
-       pci_read_config_dword(dd->pcidev, PCI_CFG_MSIX0, &dd->pci_msix0);
-       pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE1, &dd->pci_lnkctl3);
-       pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2, &dd->pci_tph2);
-
-       return 0;
-}
-
-/*
- * Do PCIe cleanup related to dd, after chip-specific cleanup, etc.  Just prior
- * to releasing the dd memory.
- * Void because all of the core pcie cleanup functions are void.
- */
-void hfi1_pcie_ddcleanup(struct hfi1_devdata *dd)
-{
-       u64 __iomem *base = (void __iomem *)dd->kregbase;
-
-       dd->flags &= ~HFI1_PRESENT;
-       dd->kregbase = NULL;
-       iounmap(base);
-       if (dd->rcvarray_wc)
-               iounmap(dd->rcvarray_wc);
-       if (dd->piobase)
-               iounmap(dd->piobase);
-}
-
-/*
- * Do a Function Level Reset (FLR) on the device.
- * Based on static function drivers/pci/pci.c:pcie_flr().
- */
-void hfi1_pcie_flr(struct hfi1_devdata *dd)
-{
-       int i;
-       u16 status;
-
-       /* no need to check for the capability - we know the device has it */
-
-       /* wait for Transaction Pending bit to clear, at most a few ms */
-       for (i = 0; i < 4; i++) {
-               if (i)
-                       msleep((1 << (i - 1)) * 100);
-
-               pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVSTA, &status);
-               if (!(status & PCI_EXP_DEVSTA_TRPND))
-                       goto clear;
-       }
-
-       dd_dev_err(dd, "Transaction Pending bit is not clearing, proceeding with reset anyway\n");
-
-clear:
-       pcie_capability_set_word(dd->pcidev, PCI_EXP_DEVCTL,
-                                PCI_EXP_DEVCTL_BCR_FLR);
-       /* PCIe spec requires the function to be back within 100ms */
-       msleep(100);
-}
-
-static void msix_setup(struct hfi1_devdata *dd, int pos, u32 *msixcnt,
-                      struct hfi1_msix_entry *hfi1_msix_entry)
-{
-       int ret;
-       int nvec = *msixcnt;
-       struct msix_entry *msix_entry;
-       int i;
-
-       /*
-        * We can't pass hfi1_msix_entry array to msix_setup
-        * so use a dummy msix_entry array and copy the allocated
-        * irq back to the hfi1_msix_entry array.
-        */
-       msix_entry = kmalloc_array(nvec, sizeof(*msix_entry), GFP_KERNEL);
-       if (!msix_entry) {
-               ret = -ENOMEM;
-               goto do_intx;
-       }
-
-       for (i = 0; i < nvec; i++)
-               msix_entry[i] = hfi1_msix_entry[i].msix;
-
-       ret = pci_enable_msix_range(dd->pcidev, msix_entry, 1, nvec);
-       if (ret < 0)
-               goto free_msix_entry;
-       nvec = ret;
-
-       for (i = 0; i < nvec; i++)
-               hfi1_msix_entry[i].msix = msix_entry[i];
-
-       kfree(msix_entry);
-       *msixcnt = nvec;
-       return;
-
-free_msix_entry:
-       kfree(msix_entry);
-
-do_intx:
-       dd_dev_err(dd, "pci_enable_msix_range %d vectors failed: %d, falling back to INTx\n",
-                  nvec, ret);
-       *msixcnt = 0;
-       hfi1_enable_intx(dd->pcidev);
-}
-
-/*
- * Translate the Current Link Speed field of the given link status value
- * into a speed in MHz.  Encodings that are not recognized are reported
- * as Gen1.
- */
-static u32 extract_speed(u16 linkstat)
-{
-       u16 cls = linkstat & PCI_EXP_LNKSTA_CLS;
-
-       if (cls == PCI_EXP_LNKSTA_CLS_5_0GB)
-               return 5000; /* Gen 2, 5GHz */
-       if (cls == GEN3_SPEED_VECTOR)
-               return 8000; /* Gen 3, 8GHz */
-
-       /* Gen 1, 2.5GHz - also the answer for any undefined encoding */
-       return 2500;
-}
-
-/* return the PCIe link width from the given link status */
-static u32 extract_width(u16 linkstat)
-{
-       /* isolate the Negotiated Link Width field, then right-justify it */
-       u32 nlw = linkstat & PCI_EXP_LNKSTA_NLW;
-
-       return nlw >> PCI_EXP_LNKSTA_NLW_SHIFT;
-}
-
-/*
- * Read the PCIe link status register and refresh dd->lbus_speed,
- * dd->lbus_width and the human-readable dd->lbus_info string from it.
- */
-static void update_lbus_info(struct hfi1_devdata *dd)
-{
-       u16 lnksta;
-
-       pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKSTA, &lnksta);
-
-       dd->lbus_speed = extract_speed(lnksta);
-       dd->lbus_width = extract_width(lnksta);
-
-       snprintf(dd->lbus_info, sizeof(dd->lbus_info), "PCIe,%uMHz,x%u",
-                dd->lbus_speed, dd->lbus_width);
-}
-
-/*
- * Record the current PCIe link width and speed in dd, and work out
- * whether the link can run at Gen3 (dd->link_gen3_capable).
- *
- * Return 0 on success, -EINVAL if the device is not a PCIe device.
- */
-int pcie_speeds(struct hfi1_devdata *dd)
-{
-       struct pci_dev *pdev = dd->pcidev;
-       struct pci_dev *upstream = pdev->bus->self;
-       u32 lnkcap;
-       u32 max_speed;
-
-       if (!pci_is_pcie(pdev)) {
-               dd_dev_err(dd, "Can't find PCI Express capability!\n");
-               return -EINVAL;
-       }
-
-       /* assume Gen3 until one end of the link says otherwise */
-       dd->link_gen3_capable = 1;
-
-       /* our own end: Max Link Speed from the link capabilities */
-       pcie_capability_read_dword(pdev, PCI_EXP_LNKCAP, &lnkcap);
-       max_speed = lnkcap & PCI_EXP_LNKCAP_SLS;
-       if (max_speed != GEN3_SPEED_VECTOR) {
-               dd_dev_info(dd,
-                           "This HFI is not Gen3 capable, max speed 0x%x, need 0x3\n",
-                           max_speed);
-               dd->link_gen3_capable = 0;
-       }
-
-       /*
-        * The other end: bus->max_bus_speed is set from the bridge's
-        * linkcap Max Link Speed.
-        */
-       if (upstream && pdev->bus->max_bus_speed != PCIE_SPEED_8_0GT) {
-               dd_dev_info(dd, "Parent PCIe bridge does not support Gen3\n");
-               dd->link_gen3_capable = 0;
-       }
-
-       /* obtain the link width and current speed */
-       update_lbus_info(dd);
-
-       dd_dev_info(dd, "%s\n", dd->lbus_info);
-
-       return 0;
-}
-
-/*
- * Set up interrupts, preferring MSI-X, then tune the PCIe capabilities.
- *
- * On return *nent holds:
- *     - the actual number of MSI-X interrupts allocated, or
- *     - 0 if the device fell back to INTx.
- */
-void request_msix(struct hfi1_devdata *dd, u32 *nent,
-                 struct hfi1_msix_entry *entry)
-{
-       int msix_cap = dd->pcidev->msix_cap;
-
-       if (!*nent || !msix_cap) {
-               /* nothing requested, or no MSI-X capability: use INTx */
-               *nent = 0;
-               hfi1_enable_intx(dd->pcidev);
-       } else {
-               /* ends with either MSI-X or INTx enabled */
-               msix_setup(dd, msix_cap, nent, entry);
-       }
-
-       tune_pcie_caps(dd);
-}
-
-/*
- * Switch the device to legacy INTx interrupts: pci_intx() is called to
- * enable INTx, then pci_disable_msix() is called to turn MSI-X off.
- */
-void hfi1_enable_intx(struct pci_dev *pdev)
-{
-       /* first, turn on INTx */
-       pci_intx(pdev, 1);
-       /* then turn off MSI-X */
-       pci_disable_msix(pdev);
-}
-
-/*
- * Restore command and BARs after a reset has wiped them out.
- *
- * Every value written here comes from a field of dd (pci_command,
- * pcibar0, pcibar1, pci_rom, pcie_devctl, pcie_lnkctl, pcie_devctl2,
- * pci_msix0, pci_lnkctl3, pci_tph2).  Those fields are presumably filled
- * in by a matching save routine elsewhere in the driver before the reset
- * - TODO confirm against the caller.
- */
-void restore_pci_variables(struct hfi1_devdata *dd)
-{
-       /* standard PCI header: command register, BAR0/BAR1, expansion ROM */
-       pci_write_config_word(dd->pcidev, PCI_COMMAND, dd->pci_command);
-       pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0, dd->pcibar0);
-       pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_1, dd->pcibar1);
-       pci_write_config_dword(dd->pcidev, PCI_ROM_ADDRESS, dd->pci_rom);
-       /* PCIe capability: device control, link control, device control 2 */
-       pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, dd->pcie_devctl);
-       pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL, dd->pcie_lnkctl);
-       pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL2,
-                                  dd->pcie_devctl2);
-       /* device-specific config offsets (PCI_CFG_MSIX0, SPCIE1, TPH2) */
-       pci_write_config_dword(dd->pcidev, PCI_CFG_MSIX0, dd->pci_msix0);
-       pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE1, dd->pci_lnkctl3);
-       pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2, dd->pci_tph2);
-}
-
-/*
- * BIOS may not set PCIe bus-utilization parameters for best performance.
- * Check and optionally adjust them to maximize our throughput.
- *
- * tune_pcie_caps() uses bits 0-2 of pcie_caps as the upper limit for the
- * Max Payload Size code and bits 4-6 as the upper limit for the Max Read
- * Request Size code.  Both codes are log2(size in bytes) - 7, so the
- * default of 0 limits both to 128 bytes.
- */
-static int hfi1_pcie_caps;
-module_param_named(pcie_caps, hfi1_pcie_caps, int, S_IRUGO);
-MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)");
-
-/* ASPM policy selector; not static, so other files can reference it */
-uint aspm_mode = ASPM_MODE_DISABLED;
-module_param_named(aspm, aspm_mode, uint, S_IRUGO);
-MODULE_PARM_DESC(aspm, "PCIe ASPM: 0: disable, 1: enable, 2: dynamic");
-
-/*
- * Raise PCIe tuning parameters on this device and on its upstream port.
- *
- * Extended tags are switched on for the device if they are off.  Then,
- * only when an upstream device exists, sits on a root bus, and both ends
- * are PCIe, Max Payload Size and Max Read Request Size are raised (never
- * lowered) on both ends, up to the limits taken from the pcie_caps
- * module parameter.
- */
-static void tune_pcie_caps(struct hfi1_devdata *dd)
-{
-       struct pci_dev *parent;
-       u16 rc_mpss, rc_mps, ep_mpss, ep_mps;
-       u16 rc_mrrs, ep_mrrs, max_mrrs, ectl;
-
-       /*
-        * Turn on extended tags in DevCtl in case the BIOS has turned it off
-        * to improve WFR SDMA bandwidth
-        */
-       pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &ectl);
-       if (!(ectl & PCI_EXP_DEVCTL_EXT_TAG)) {
-               dd_dev_info(dd, "Enabling PCIe extended tags\n");
-               ectl |= PCI_EXP_DEVCTL_EXT_TAG;
-               pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, ectl);
-       }
-       /* Find out supported and configured values for parent (root) */
-       parent = dd->pcidev->bus->self;
-       /*
-        * The driver cannot perform the tuning if it does not have
-        * access to the upstream component.
-        */
-       if (!parent)
-               return;
-       if (!pci_is_root_bus(parent->bus)) {
-               dd_dev_info(dd, "Parent not root\n");
-               return;
-       }
-
-       if (!pci_is_pcie(parent) || !pci_is_pcie(dd->pcidev))
-               return;
-       rc_mpss = parent->pcie_mpss;
-       /* turn the size in bytes into its code: ffs(128) - 8 == 0 */
-       rc_mps = ffs(pcie_get_mps(parent)) - 8;
-       /* Find out supported and configured values for endpoint (us) */
-       ep_mpss = dd->pcidev->pcie_mpss;
-       ep_mps = ffs(pcie_get_mps(dd->pcidev)) - 8;
-
-       /* Find max payload supported by root, endpoint */
-       if (rc_mpss > ep_mpss)
-               rc_mpss = ep_mpss;
-
-       /* If Supported greater than limit in module param, limit it */
-       if (rc_mpss > (hfi1_pcie_caps & 7))
-               rc_mpss = hfi1_pcie_caps & 7;
-       /* If less than (allowed, supported), bump root payload */
-       if (rc_mpss > rc_mps) {
-               rc_mps = rc_mpss;
-               /* back from code to bytes */
-               pcie_set_mps(parent, 128 << rc_mps);
-       }
-       /* If less than (allowed, supported), bump endpoint payload */
-       if (rc_mpss > ep_mps) {
-               ep_mps = rc_mpss;
-               pcie_set_mps(dd->pcidev, 128 << ep_mps);
-       }
-
-       /*
-        * Now the Read Request size.
-        * No field for max supported, but PCIe spec limits it to 4096,
-        * which is code '5' (log2(4096) - 7)
-        */
-       max_mrrs = 5;
-       if (max_mrrs > ((hfi1_pcie_caps >> 4) & 7))
-               max_mrrs = (hfi1_pcie_caps >> 4) & 7;
-
-       /* from here on max_mrrs is in bytes, like the readrq values */
-       max_mrrs = 128 << max_mrrs;
-       rc_mrrs = pcie_get_readrq(parent);
-       ep_mrrs = pcie_get_readrq(dd->pcidev);
-
-       if (max_mrrs > rc_mrrs) {
-               rc_mrrs = max_mrrs;
-               pcie_set_readrq(parent, rc_mrrs);
-       }
-       if (max_mrrs > ep_mrrs) {
-               ep_mrrs = max_mrrs;
-               pcie_set_readrq(dd->pcidev, ep_mrrs);
-       }
-}
-
-/* End of PCIe capability tuning */
-
-/*
- * From here through hfi1_pci_err_handler definition is invoked via
- * PCI error infrastructure, registered via pci
- */
-/*
- * PCI error-recovery callback: react to the reported channel state.
- *
- * Returns PCI_ERS_RESULT_NEED_RESET for a frozen channel (after disabling
- * the device), PCI_ERS_RESULT_DISCONNECT for a permanent failure, and
- * PCI_ERS_RESULT_RECOVERED for every other state.
- *
- * The driver data pointer may not be set if the error arrives early, so
- * every use of dd is guarded; messages then go through the pci_dev.
- */
-static pci_ers_result_t
-pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
-{
-       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
-       pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED;
-
-       switch (state) {
-       case pci_channel_io_normal:
-               if (dd) {
-                       dd_dev_info(dd, "State Normal, ignoring\n");
-               } else {
-                       dev_info(&pdev->dev, "State Normal, ignoring\n");
-               }
-               break;
-
-       case pci_channel_io_frozen:
-               if (dd) {
-                       dd_dev_info(dd, "State Frozen, requesting reset\n");
-               } else {
-                       dev_info(&pdev->dev,
-                                "State Frozen, requesting reset\n");
-               }
-               pci_disable_device(pdev);
-               ret = PCI_ERS_RESULT_NEED_RESET;
-               break;
-
-       case pci_channel_io_perm_failure:
-               if (dd) {
-                       dd_dev_info(dd, "State Permanent Failure, disabling\n");
-                       /* no more register accesses! */
-                       dd->flags &= ~HFI1_PRESENT;
-                       hfi1_disable_after_error(dd);
-               }
-                /* else early, or other problem */
-               ret =  PCI_ERS_RESULT_DISCONNECT;
-               break;
-
-       default: /* shouldn't happen */
-               if (dd) {
-                       dd_dev_info(dd, "HFI1 PCI errors detected (state %d)\n",
-                                   state);
-               } else {
-                       dev_info(&pdev->dev,
-                                "HFI1 PCI errors detected (state %d)\n",
-                                state);
-               }
-               break;
-       }
-       return ret;
-}
-
-/*
- * PCI error-recovery callback: probe the device by reading the receive
- * word counter of the port.  A value of all ones asks for a reset;
- * anything else, or having no device/port to probe, reports recovered.
- */
-static pci_ers_result_t
-pci_mmio_enabled(struct pci_dev *pdev)
-{
-       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
-       pci_ers_result_t result = PCI_ERS_RESULT_RECOVERED;
-       u64 rx_words;
-
-       /* nothing to probe without device data and a port */
-       if (!dd || !dd->pport)
-               return result;
-
-       rx_words = read_port_cntr(dd->pport, C_RX_WORDS, CNTR_INVALID_VL);
-       if (rx_words == ~0ULL)
-               result = PCI_ERS_RESULT_NEED_RESET;
-
-       dd_dev_info(dd,
-                   "HFI1 mmio_enabled function called, read wordscntr %Lx, returning %d\n",
-                   rx_words, result);
-       return result;
-}
-
-/*
- * PCI error-recovery callback for a slot reset.  Nothing is done beyond
- * logging; always returns PCI_ERS_RESULT_CAN_RECOVER.
- */
-static pci_ers_result_t
-pci_slot_reset(struct pci_dev *pdev)
-{
-       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
-
-       /* dd may be missing if the error hits early; log via pdev then */
-       if (dd) {
-               dd_dev_info(dd, "HFI1 slot_reset function called, ignored\n");
-       } else {
-               dev_info(&pdev->dev,
-                        "HFI1 slot_reset function called, ignored\n");
-       }
-       return PCI_ERS_RESULT_CAN_RECOVER;
-}
-
-/*
- * PCI error-recovery callback for a link reset.  Nothing is done beyond
- * logging; always returns PCI_ERS_RESULT_CAN_RECOVER.
- */
-static pci_ers_result_t
-pci_link_reset(struct pci_dev *pdev)
-{
-       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
-
-       /* dd may be missing if the error hits early; log via pdev then */
-       if (dd) {
-               dd_dev_info(dd, "HFI1 link_reset function called, ignored\n");
-       } else {
-               dev_info(&pdev->dev,
-                        "HFI1 link_reset function called, ignored\n");
-       }
-       return PCI_ERS_RESULT_CAN_RECOVER;
-}
-
-/*
- * PCI error-recovery callback: clear the AER uncorrectable error status
- * and re-initialize the device.
- *
- * Without driver data there is nothing to re-initialize, so only the AER
- * status is cleared in that case.
- */
-static void
-pci_resume(struct pci_dev *pdev)
-{
-       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
-
-       if (!dd) {
-               /* early, or other problem: nothing to hand to hfi1_init() */
-               dev_info(&pdev->dev, "HFI1 resume function called\n");
-               pci_cleanup_aer_uncorrect_error_status(pdev);
-               return;
-       }
-
-       dd_dev_info(dd, "HFI1 resume function called\n");
-       pci_cleanup_aer_uncorrect_error_status(pdev);
-       /*
-        * Running jobs will fail, since it's asynchronous
-        * unlike sysfs-requested reset.   Better than
-        * doing nothing.
-        */
-       hfi1_init(dd, 1); /* same as re-init after reset */
-}
-
-/* callbacks handed to the PCI core for error recovery on this device */
-const struct pci_error_handlers hfi1_pci_err_handler = {
-       .error_detected = pci_error_detected,
-       .mmio_enabled   = pci_mmio_enabled,
-       .slot_reset     = pci_slot_reset,
-       .link_reset     = pci_link_reset,
-       .resume         = pci_resume,
-};
-
-/*============================================================================*/
-/* PCIe Gen3 support */
-
-/*
- * This code is separated out because it is expected to be removed in the
- * final shipping product.  If not, then it will be revisited and items
- * will be moved to more standard locations.
- */
-
-/* ASIC_PCI_SD_HOST_STATUS.FW_DNLD_STS field values */
-#define DL_STATUS_HFI0 0x1     /* hfi0 firmware download complete */
-#define DL_STATUS_HFI1 0x2     /* hfi1 firmware download complete */
-#define DL_STATUS_BOTH 0x3     /* hfi0 and hfi1 firmware download complete */
-
-/* ASIC_PCI_SD_HOST_STATUS.FW_DNLD_ERR field values */
-#define DL_ERR_NONE            0x0     /* no error */
-#define DL_ERR_SWAP_PARITY     0x1     /* parity error in SerDes interrupt */
-                                       /*   or response data */
-#define DL_ERR_DISABLED        0x2     /* hfi disabled */
-#define DL_ERR_SECURITY        0x3     /* security check failed */
-#define DL_ERR_SBUS            0x4     /* SBus status error */
-#define DL_ERR_XFR_PARITY      0x5     /* parity error during ROM transfer */
-
-/*
- * gasket block secondary bus reset delay; arm_gasket_logic() programs it
- * into the timer field of ASIC_PCIE_SD_HOST_CMD
- */
-#define SBR_DELAY_US 200000    /* 200ms */
-
-/* mask for PCIe capability register lnkctl2 target link speed */
-#define LNKCTL2_TARGET_LINK_SPEED_MASK 0xf
-
-/*
- * Target link speed for do_pcie_gen3_transition(): 1, 2 and 3 select
- * Gen1, Gen2 and Gen3; any other value skips the transition.
- */
-static uint pcie_target = 3;
-module_param(pcie_target, uint, S_IRUGO);
-MODULE_PARM_DESC(pcie_target, "PCIe target speed (0 skip, 1-3 Gen1-3)");
-
-/* when non-zero, the transition is run even if already at target speed */
-static uint pcie_force;
-module_param(pcie_force, uint, S_IRUGO);
-MODULE_PARM_DESC(pcie_force, "Force driver to do a PCIe firmware download even if already at target speed");
-
-/* retry budget for reaching the requested speed (see description below) */
-static uint pcie_retry = 5;
-module_param(pcie_retry, uint, S_IRUGO);
-MODULE_PARM_DESC(pcie_retry, "Driver will try this many times to reach requested speed");
-
-/*
- * EQ preset selection.  pcie_pset starts out as UNSET_PSET; in that case,
- * or when the value is above 10, do_pcie_gen3_transition() replaces it
- * with the default for the device type (discrete or MCP/integrated).
- */
-#define UNSET_PSET 255
-#define DEFAULT_DISCRETE_PSET 2        /* discrete HFI */
-#define DEFAULT_MCP_PSET 4     /* MCP HFI */
-static uint pcie_pset = UNSET_PSET;
-module_param(pcie_pset, uint, S_IRUGO);
-MODULE_PARM_DESC(pcie_pset, "PCIe Eq Pset value to use, range is 0-10");
-
-/*
- * equalization columns
- *
- * load_eq_table() derives the hardware coefficients from the PREC and
- * POST columns only; the ATTN column is just echoed in its violation
- * report.
- */
-#define PREC 0
-#define ATTN 1
-#define POST 2
-
-/*
- * discrete silicon preliminary equalization values
- * One row per preset p0-p10; loaded with fs = 24 and div = 3.
- */
-static const u8 discrete_preliminary_eq[11][3] = {
-       /* prec   attn   post */
-       {  0x00,  0x00,  0x12 },        /* p0 */
-       {  0x00,  0x00,  0x0c },        /* p1 */
-       {  0x00,  0x00,  0x0f },        /* p2 */
-       {  0x00,  0x00,  0x09 },        /* p3 */
-       {  0x00,  0x00,  0x00 },        /* p4 */
-       {  0x06,  0x00,  0x00 },        /* p5 */
-       {  0x09,  0x00,  0x00 },        /* p6 */
-       {  0x06,  0x00,  0x0f },        /* p7 */
-       {  0x09,  0x00,  0x09 },        /* p8 */
-       {  0x0c,  0x00,  0x00 },        /* p9 */
-       {  0x00,  0x00,  0x18 },        /* p10 */
-};
-
-/*
- * integrated silicon preliminary equalization values
- * One row per preset p0-p10; loaded with fs = 29 and div = 1.
- */
-static const u8 integrated_preliminary_eq[11][3] = {
-       /* prec   attn   post */
-       {  0x00,  0x1e,  0x07 },        /* p0 */
-       {  0x00,  0x1e,  0x05 },        /* p1 */
-       {  0x00,  0x1e,  0x06 },        /* p2 */
-       {  0x00,  0x1e,  0x04 },        /* p3 */
-       {  0x00,  0x1e,  0x00 },        /* p4 */
-       {  0x03,  0x1e,  0x00 },        /* p5 */
-       {  0x04,  0x1e,  0x00 },        /* p6 */
-       {  0x03,  0x1e,  0x06 },        /* p7 */
-       {  0x03,  0x1e,  0x04 },        /* p8 */
-       {  0x05,  0x1e,  0x00 },        /* p9 */
-       {  0x00,  0x1e,  0x0a },        /* p10 */
-};
-
-/* pack pre-cursor, cursor and post-cursor values into one PL102 word */
-#define eq_value(pre, curr, post) \
-       (((u32)(pre) << PCIE_CFG_REG_PL102_GEN3_EQ_PRE_CURSOR_PSET_SHIFT) | \
-        ((u32)(curr) << PCIE_CFG_REG_PL102_GEN3_EQ_CURSOR_PSET_SHIFT) | \
-        ((u32)(post) << PCIE_CFG_REG_PL102_GEN3_EQ_POST_CURSOR_PSET_SHIFT))
-
-/*
- * Program all 11 Gen3 EQ presets from the given table into the PCIe
- * port logic registers.
- *
- * For each preset the pre-cursor and post-cursor table entries are
- * divided by div, and the cursor is what remains of fs after subtracting
- * both.  After each write the EQ status register is read back; presets
- * flagged as violating the coefficient rules are reported.
- *
- * Return 0 on success, -EINVAL if any preset was flagged.
- */
-static int load_eq_table(struct hfi1_devdata *dd, const u8 eq[11][3], u8 fs,
-                        u8 div)
-{
-       struct pci_dev *pdev = dd->pcidev;
-       int rc = 0;
-       u32 status;
-       u32 pset;
-       u8 pre, cursor, post;
-
-       for (pset = 0; pset < 11; pset++) {
-               /* select the preset to program */
-               pci_write_config_dword(pdev, PCIE_CFG_REG_PL103, pset);
-
-               /* derive the three coefficients and write them */
-               pre = eq[pset][PREC] / div;
-               post = eq[pset][POST] / div;
-               cursor = fs - pre - post;
-               pci_write_config_dword(pdev, PCIE_CFG_REG_PL102,
-                                      eq_value(pre, cursor, post));
-
-               /* check if these coefficients violate EQ rules */
-               pci_read_config_dword(pdev, PCIE_CFG_REG_PL105, &status);
-               if (!(status &
-                     PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK))
-                       continue;
-
-               /* the report header goes out once, on the first violation */
-               if (!rc) {
-                       dd_dev_err(dd,
-                                  "Gen3 EQ Table Coefficient rule violations\n");
-                       dd_dev_err(dd, "         prec   attn   post\n");
-               }
-               /* first the table row, then the coefficients derived from it */
-               dd_dev_err(dd, "   p%02d:   %02x     %02x     %02x\n",
-                          pset, (u32)eq[pset][0], (u32)eq[pset][1],
-                          (u32)eq[pset][2]);
-               dd_dev_err(dd, "            %02x     %02x     %02x\n",
-                          (u32)pre, (u32)cursor, (u32)post);
-               rc = -EINVAL;
-       }
-
-       return rc;
-}
-
-/*
- * Steps to be done after the PCIe firmware is downloaded and
- * before the SBR for the Pcie Gen3.
- * The SBus resource is already being held.
- *
- * One SBus write request is issued per PCIe SerDes of this HFI
- * (NUM_PCIE_SERDES of them), bracketed by switching SBus fast mode on
- * and off.
- */
-static void pcie_post_steps(struct hfi1_devdata *dd)
-{
-       int i;
-
-       set_sbus_fast_mode(dd);
-       /*
-        * Write to the PCIe PCSes to set the G3_LOCKED_NEXT bits to 1.
-        * This avoids a spurious framing error that can otherwise be
-        * generated by the MAC layer.
-        *
-        * Use individual addresses since no broadcast is set up.
-        */
-       for (i = 0; i < NUM_PCIE_SERDES; i++) {
-               /* receiver addresses are looked up per HFI (dd->hfi1_id) */
-               sbus_request(dd, pcie_pcs_addrs[dd->hfi1_id][i],
-                            0x03, WRITE_SBUS_RECEIVER, 0x00022132);
-       }
-
-       clear_sbus_fast_mode(dd);
-}
-
-/*
- * Trigger a secondary bus reset (SBR) on ourselves using our parent.
- *
- * Based on pci_parent_bus_reset() which is not exported by the
- * kernel core.
- */
-static int trigger_sbr(struct hfi1_devdata *dd)
-{
-       struct pci_dev *self = dd->pcidev;
-       struct pci_bus *bus = self->bus;
-       struct pci_dev *other;
-
-       /* the reset is issued through the upstream bridge, so need one */
-       if (!bus->self) {
-               dd_dev_err(dd, "%s: no parent device\n", __func__);
-               return -ENOTTY;
-       }
-
-       /* refuse to reset a bus that is shared with any other device */
-       list_for_each_entry(other, &bus->devices, bus_list) {
-               if (other == self)
-                       continue;
-
-               dd_dev_err(dd,
-                          "%s: another device is on the same bus\n",
-                          __func__);
-               return -ENOTTY;
-       }
-
-       /*
-        * A secondary bus reset (SBR) issues a hot reset to our device.
-        * The routine called here waits 1s after the reset is dropped,
-        * per PCI Trhfa (recovery time).  PCIe 3.0 section 6.6.1 -
-        * Conventional Reset, paragraph 3, line 35 also requires a 1s
-        * delay after a reset.  Per spec requirements, the link is
-        * either working or not after that point.
-        */
-       pci_reset_bridge_secondary_bus(bus->self);
-
-       return 0;
-}
-
-/*
- * Write the given gasket interrupt register.
- */
-static void write_gasket_interrupt(struct hfi1_devdata *dd, int index,
-                                  u16 code, u16 data)
-{
-       /* one list entry: interrupt code and data in their own fields */
-       u64 entry = ((u64)code << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT) |
-                   ((u64)data << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT);
-
-       /* list entries are 8 bytes apart */
-       write_csr(dd, ASIC_PCIE_SD_INTRPT_LIST + (index * 8), entry);
-}
-
-/*
- * Tell the gasket logic how to react to the reset.
- */
-static void arm_gasket_logic(struct hfi1_devdata *dd)
-{
-       u64 cmd;
-
-       /* interrupt command: one bit per HFI, selected by hfi1_id */
-       cmd = ((u64)1 << dd->hfi1_id) <<
-             ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT;
-       /* SBus receiver: the PCIe SerDes broadcast address of this HFI */
-       cmd |= (u64)pcie_serdes_broadcast[dd->hfi1_id] <<
-              ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT;
-       /* SBR mode */
-       cmd |= ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK;
-       /* timer field: the SBR delay */
-       cmd |= ((u64)SBR_DELAY_US & ASIC_PCIE_SD_HOST_CMD_TIMER_MASK) <<
-              ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT;
-
-       write_csr(dd, ASIC_PCIE_SD_HOST_CMD, cmd);
-       /* read back to push the write */
-       read_csr(dd, ASIC_PCIE_SD_HOST_CMD);
-}
-
-/*
- * CCE_PCIE_CTRL long name helpers
- * We redefine these shorter macros to use in the code while leaving
- * chip_registers.h to be autogenerated from the hardware spec.
- *
- * All of these are consumed by write_xmt_margin().
- */
-#define LANE_BUNDLE_MASK              CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_MASK
-#define LANE_BUNDLE_SHIFT             CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_SHIFT
-#define LANE_DELAY_MASK               CCE_PCIE_CTRL_PCIE_LANE_DELAY_MASK
-#define LANE_DELAY_SHIFT              CCE_PCIE_CTRL_PCIE_LANE_DELAY_SHIFT
-#define MARGIN_OVERWRITE_ENABLE_SHIFT CCE_PCIE_CTRL_XMT_MARGIN_OVERWRITE_ENABLE_SHIFT
-#define MARGIN_SHIFT                  CCE_PCIE_CTRL_XMT_MARGIN_SHIFT
-#define MARGIN_G1_G2_OVERWRITE_MASK   CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_MASK
-#define MARGIN_G1_G2_OVERWRITE_SHIFT  CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_SHIFT
-#define MARGIN_GEN1_GEN2_MASK         CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_MASK
-#define MARGIN_GEN1_GEN2_SHIFT        CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_SHIFT
-
-/*
- * Write xmt_margin for full-swing (WFR-B) or half-swing (WFR-C).
- *
- * Only the integrated device (PCI_DEVICE_ID_INTEL1) has CCE_PCIE_CTRL
- * rewritten; for any other device the register is only read.  The final
- * register value is logged at debug level either way, tagged with fname.
- */
-static void write_xmt_margin(struct hfi1_devdata *dd, const char *fname)
-{
-       u64 pcie_ctrl;
-       u64 xmt_margin;
-       u64 xmt_margin_oe;
-       u64 lane_delay;
-       u64 lane_bundle;
-
-       pcie_ctrl = read_csr(dd, CCE_PCIE_CTRL);
-
-       /*
-        * For Discrete, use full-swing.
-        *  - PCIe TX defaults to full-swing.
-        *    Leave this register as default.
-        * For Integrated, use half-swing
-        *  - Copy xmt_margin and xmt_margin_oe
-        *    from Gen1/Gen2 to Gen3.
-        */
-       if (dd->pcidev->device == PCI_DEVICE_ID_INTEL1) { /* integrated */
-               /* extract initial fields */
-               xmt_margin = (pcie_ctrl >> MARGIN_GEN1_GEN2_SHIFT)
-                             & MARGIN_GEN1_GEN2_MASK;
-               xmt_margin_oe = (pcie_ctrl >> MARGIN_G1_G2_OVERWRITE_SHIFT)
-                                & MARGIN_G1_G2_OVERWRITE_MASK;
-               lane_delay = (pcie_ctrl >> LANE_DELAY_SHIFT) & LANE_DELAY_MASK;
-               lane_bundle = (pcie_ctrl >> LANE_BUNDLE_SHIFT)
-                              & LANE_BUNDLE_MASK;
-
-               /*
-                * For A0, EFUSE values are not set.  Override with the
-                * correct values.
-                */
-               if (is_ax(dd)) {
-                       /*
-                        * xmt_margin and OverwriteEnable should be the
-                        * same for Gen1/Gen2 and Gen3
-                        */
-                       xmt_margin = 0x5;
-                       xmt_margin_oe = 0x1;
-                       lane_delay = 0xF; /* Delay 240ns. */
-                       lane_bundle = 0x0; /* Set to 1 lane. */
-               }
-
-               /*
-                * overwrite existing values
-                *
-                * The register is rebuilt from these fields alone: the
-                * margin and its overwrite enable go into both the
-                * Gen1/Gen2 and the Gen3 positions, and any bit outside
-                * the six fields below is written as zero.
-                */
-               pcie_ctrl = (xmt_margin << MARGIN_GEN1_GEN2_SHIFT)
-                       | (xmt_margin_oe << MARGIN_G1_G2_OVERWRITE_SHIFT)
-                       | (xmt_margin << MARGIN_SHIFT)
-                       | (xmt_margin_oe << MARGIN_OVERWRITE_ENABLE_SHIFT)
-                       | (lane_delay << LANE_DELAY_SHIFT)
-                       | (lane_bundle << LANE_BUNDLE_SHIFT);
-
-               write_csr(dd, CCE_PCIE_CTRL, pcie_ctrl);
-       }
-
-       dd_dev_dbg(dd, "%s: program XMT margin, CcePcieCtrl 0x%llx\n",
-                  fname, pcie_ctrl);
-}
-
-/*
- * Do all the steps needed to transition the PCIe link to Gen3 speed.
- */
-int do_pcie_gen3_transition(struct hfi1_devdata *dd)
-{
-       struct pci_dev *parent = dd->pcidev->bus->self;
-       u64 fw_ctrl;
-       u64 reg, therm;
-       u32 reg32, fs, lf;
-       u32 status, err;
-       int ret;
-       int do_retry, retry_count = 0;
-       uint default_pset;
-       u16 target_vector, target_speed;
-       u16 lnkctl2, vendor;
-       u8 div;
-       const u8 (*eq)[3];
-       int return_error = 0;
-
-       /* PCIe Gen3 is for the ASIC only */
-       if (dd->icode != ICODE_RTL_SILICON)
-               return 0;
-
-       if (pcie_target == 1) {                 /* target Gen1 */
-               target_vector = GEN1_SPEED_VECTOR;
-               target_speed = 2500;
-       } else if (pcie_target == 2) {          /* target Gen2 */
-               target_vector = GEN2_SPEED_VECTOR;
-               target_speed = 5000;
-       } else if (pcie_target == 3) {          /* target Gen3 */
-               target_vector = GEN3_SPEED_VECTOR;
-               target_speed = 8000;
-       } else {
-               /* off or invalid target - skip */
-               dd_dev_info(dd, "%s: Skipping PCIe transition\n", __func__);
-               return 0;
-       }
-
-       /* if already at target speed, done (unless forced) */
-       if (dd->lbus_speed == target_speed) {
-               dd_dev_info(dd, "%s: PCIe already at gen%d, %s\n", __func__,
-                           pcie_target,
-                           pcie_force ? "re-doing anyway" : "skipping");
-               if (!pcie_force)
-                       return 0;
-       }
-
-       /*
-        * The driver cannot do the transition if it has no access to the
-        * upstream component
-        */
-       if (!parent) {
-               dd_dev_info(dd, "%s: No upstream, Can't do gen3 transition\n",
-                           __func__);
-               return 0;
-       }
-
-       /*
-        * Do the Gen3 transition.  Steps are those of the PCIe Gen3
-        * recipe.
-        */
-
-       /* step 1: pcie link working in gen1/gen2 */
-
-       /* step 2: if either side is not capable of Gen3, done */
-       if (pcie_target == 3 && !dd->link_gen3_capable) {
-               dd_dev_err(dd, "The PCIe link is not Gen3 capable\n");
-               ret = -ENOSYS;
-               goto done_no_mutex;
-       }
-
-       /* hold the SBus resource across the firmware download and SBR */
-       ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT);
-       if (ret) {
-               dd_dev_err(dd, "%s: unable to acquire SBus resource\n",
-                          __func__);
-               return ret;
-       }
-
-       /* make sure thermal polling is not causing interrupts */
-       therm = read_csr(dd, ASIC_CFG_THERM_POLL_EN);
-       if (therm) {
-               write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x0);
-               msleep(100);
-               dd_dev_info(dd, "%s: Disabled therm polling\n",
-                           __func__);
-       }
-
-retry:
-       /* the SBus download will reset the spico for thermal */
-
-       /* step 3: download SBus Master firmware */
-       /* step 4: download PCIe Gen3 SerDes firmware */
-       dd_dev_info(dd, "%s: downloading firmware\n", __func__);
-       ret = load_pcie_firmware(dd);
-       if (ret) {
-               /* do not proceed if the firmware cannot be downloaded */
-               return_error = 1;
-               goto done;
-       }
-
-       /* step 5: set up device parameter settings */
-       dd_dev_info(dd, "%s: setting PCIe registers\n", __func__);
-
-       /*
-        * PcieCfgSpcie1 - Link Control 3
-        * Leave at reset value.  No need to set PerfEq - link equalization
-        * will be performed automatically after the SBR when the target
-        * speed is 8GT/s.
-        */
-
-       /* clear all 16 per-lane error bits (PCIe: Lane Error Status) */
-       pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, 0xffff);
-
-       /* step 5a: Set Synopsys Port Logic registers */
-
-       /*
-        * PcieCfgRegPl2 - Port Force Link
-        *
-        * Set the low power field to 0x10 to avoid unnecessary power
-        * management messages.  All other fields are zero.
-        */
-       reg32 = 0x10ul << PCIE_CFG_REG_PL2_LOW_PWR_ENT_CNT_SHIFT;
-       pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL2, reg32);
-
-       /*
-        * PcieCfgRegPl100 - Gen3 Control
-        *
-        * turn off PcieCfgRegPl100.Gen3ZRxDcNonCompl
-        * turn on PcieCfgRegPl100.EqEieosCnt
-        * Everything else zero.
-        */
-       reg32 = PCIE_CFG_REG_PL100_EQ_EIEOS_CNT_SMASK;
-       pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL100, reg32);
-
-       /*
-        * PcieCfgRegPl101 - Gen3 EQ FS and LF
-        * PcieCfgRegPl102 - Gen3 EQ Presets to Coefficients Mapping
-        * PcieCfgRegPl103 - Gen3 EQ Preset Index
-        * PcieCfgRegPl105 - Gen3 EQ Status
-        *
-        * Give initial EQ settings.
-        */
-       if (dd->pcidev->device == PCI_DEVICE_ID_INTEL0) { /* discrete */
-               /* 1000mV, FS=24, LF = 8 */
-               fs = 24;
-               lf = 8;
-               div = 3;
-               eq = discrete_preliminary_eq;
-               default_pset = DEFAULT_DISCRETE_PSET;
-       } else {
-               /* 400mV, FS=29, LF = 9 */
-               fs = 29;
-               lf = 9;
-               div = 1;
-               eq = integrated_preliminary_eq;
-               default_pset = DEFAULT_MCP_PSET;
-       }
-       pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL101,
-                              (fs <<
-                               PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT) |
-                              (lf <<
-                               PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT));
-       ret = load_eq_table(dd, eq, fs, div);
-       if (ret)
-               goto done;
-
-       /*
-        * PcieCfgRegPl106 - Gen3 EQ Control
-        *
-        * Set Gen3EqPsetReqVec, leave other fields 0.
-        */
-       if (pcie_pset == UNSET_PSET)
-               pcie_pset = default_pset;
-       if (pcie_pset > 10) {   /* valid range is 0-10, inclusive */
-               dd_dev_err(dd, "%s: Invalid Eq Pset %u, setting to %d\n",
-                          __func__, pcie_pset, default_pset);
-               pcie_pset = default_pset;
-       }
-       dd_dev_info(dd, "%s: using EQ Pset %u\n", __func__, pcie_pset);
-       pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL106,
-                              ((1 << pcie_pset) <<
-                       PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT) |
-                       PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK |
-                       PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK);
-
-       /*
-        * step 5b: Do post firmware download steps via SBus
-        */
-       dd_dev_info(dd, "%s: doing pcie post steps\n", __func__);
-       pcie_post_steps(dd);
-
-       /*
-        * step 5c: Program gasket interrupts
-        */
-       /* set the Rx Bit Rate to REFCLK ratio */
-       write_gasket_interrupt(dd, 0, 0x0006, 0x0050);
-       /* disable pCal for PCIe Gen3 RX equalization */
-       write_gasket_interrupt(dd, 1, 0x0026, 0x5b01);
-       /*
-        * Enable iCal for PCIe Gen3 RX equalization, and set which
-        * evaluation of RX_EQ_EVAL will launch the iCal procedure.
-        */
-       write_gasket_interrupt(dd, 2, 0x0026, 0x5202);
-       /* terminate list */
-       write_gasket_interrupt(dd, 3, 0x0000, 0x0000);
-
-       /*
-        * step 5d: program XMT margin
-        */
-       write_xmt_margin(dd, __func__);
-
-       /*
-        * step 5e: disable active state power management (ASPM). It
-        * will be enabled if required later
-        */
-       dd_dev_info(dd, "%s: clearing ASPM\n", __func__);
-       aspm_hw_disable_l1(dd);
-
-       /*
-        * step 5f: clear DirectSpeedChange
-        * PcieCfgRegPl67.DirectSpeedChange must be zero to prevent the
-        * change in the speed target from starting before we are ready.
-        * This field defaults to 0 and we are not changing it, so nothing
-        * needs to be done.
-        */
-
-       /* step 5g: Set target link speed */
-       /*
-        * Set target link speed to be target on both device and parent.
-        * On setting the parent: Some system BIOSs "helpfully" set the
-        * parent target speed to Gen2 to match the ASIC's initial speed.
-        * We can set the target Gen3 because we have already checked
-        * that it is Gen3 capable earlier.
-        */
-       dd_dev_info(dd, "%s: setting parent target link speed\n", __func__);
-       pcie_capability_read_word(parent, PCI_EXP_LNKCTL2, &lnkctl2);
-       dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__,
-                   (u32)lnkctl2);
-       /* only write to parent if target is not as high as ours */
-       if ((lnkctl2 & LNKCTL2_TARGET_LINK_SPEED_MASK) < target_vector) {
-               lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK;
-               lnkctl2 |= target_vector;
-               dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__,
-                           (u32)lnkctl2);
-               pcie_capability_write_word(parent, PCI_EXP_LNKCTL2, lnkctl2);
-       } else {
-               dd_dev_info(dd, "%s: ..target speed is OK\n", __func__);
-       }
-
-       dd_dev_info(dd, "%s: setting target link speed\n", __func__);
-       pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL2, &lnkctl2);
-       dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__,
-                   (u32)lnkctl2);
-       lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK;
-       lnkctl2 |= target_vector;
-       dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__,
-                   (u32)lnkctl2);
-       pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL2, lnkctl2);
-
-       /* step 5h: arm gasket logic */
-       /* hold DC in reset across the SBR */
-       write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK);
-       (void)read_csr(dd, CCE_DC_CTRL); /* DC reset hold */
-       /* save firmware control across the SBR */
-       fw_ctrl = read_csr(dd, MISC_CFG_FW_CTRL);
-
-       dd_dev_info(dd, "%s: arming gasket logic\n", __func__);
-       arm_gasket_logic(dd);
-
-       /*
-        * step 6: quiesce PCIe link
-        * The chip has already been reset, so there will be no traffic
-        * from the chip.  Linux has no easy way to enforce that it will
-        * not try to access the device, so we just need to hope it doesn't
-        * do it while we are doing the reset.
-        */
-
-       /*
-        * step 7: initiate the secondary bus reset (SBR)
-        * step 8: hardware brings the links back up
-        * step 9: wait for link speed transition to be complete
-        */
-       dd_dev_info(dd, "%s: calling trigger_sbr\n", __func__);
-       ret = trigger_sbr(dd);
-       if (ret)
-               goto done;
-
-       /* step 10: decide what to do next */
-
-       /* check if we can read PCI space */
-       ret = pci_read_config_word(dd->pcidev, PCI_VENDOR_ID, &vendor);
-       if (ret) {
-               dd_dev_info(dd,
-                           "%s: read of VendorID failed after SBR, err %d\n",
-                           __func__, ret);
-               return_error = 1;
-               goto done;
-       }
-       if (vendor == 0xffff) {
-               dd_dev_info(dd, "%s: VendorID is all 1s after SBR\n", __func__);
-               return_error = 1;
-               ret = -EIO;
-               goto done;
-       }
-
-       /* restore PCI space registers we know were reset */
-       dd_dev_info(dd, "%s: calling restore_pci_variables\n", __func__);
-       restore_pci_variables(dd);
-       /* restore firmware control */
-       write_csr(dd, MISC_CFG_FW_CTRL, fw_ctrl);
-
-       /*
-        * Check the gasket block status.
-        *
-        * This is the first CSR read after the SBR.  If the read returns
-        * all 1s (fails), the link did not make it back.
-        *
-        * Once we're sure we can read and write, clear the DC reset after
-        * the SBR.  Then check for any per-lane errors. Then look over
-        * the status.
-        */
-       reg = read_csr(dd, ASIC_PCIE_SD_HOST_STATUS);
-       dd_dev_info(dd, "%s: gasket block status: 0x%llx\n", __func__, reg);
-       if (reg == ~0ull) {     /* PCIe read failed/timeout */
-               dd_dev_err(dd, "SBR failed - unable to read from device\n");
-               return_error = 1;
-               ret = -ENOSYS;
-               goto done;
-       }
-
-       /* clear the DC reset */
-       write_csr(dd, CCE_DC_CTRL, 0);
-
-       /* Set the LED off */
-       setextled(dd, 0);
-
-       /* check for any per-lane errors */
-       pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, &reg32);
-       dd_dev_info(dd, "%s: per-lane errors: 0x%x\n", __func__, reg32);
-
-       /* extract status, look for our HFI */
-       status = (reg >> ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_SHIFT)
-                       & ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_MASK;
-       if ((status & (1 << dd->hfi1_id)) == 0) {
-               dd_dev_err(dd,
-                          "%s: gasket status 0x%x, expecting 0x%x\n",
-                          __func__, status, 1 << dd->hfi1_id);
-               ret = -EIO;
-               goto done;
-       }
-
-       /* extract error */
-       err = (reg >> ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_SHIFT)
-               & ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_MASK;
-       if (err) {
-               dd_dev_err(dd, "%s: gasket error %d\n", __func__, err);
-               ret = -EIO;
-               goto done;
-       }
-
-       /* update our link information cache */
-       update_lbus_info(dd);
-       dd_dev_info(dd, "%s: new speed and width: %s\n", __func__,
-                   dd->lbus_info);
-
-       if (dd->lbus_speed != target_speed) { /* not target */
-               /* maybe retry */
-               do_retry = retry_count < pcie_retry;
-               dd_dev_err(dd, "PCIe link speed did not switch to Gen%d%s\n",
-                          pcie_target, do_retry ? ", retrying" : "");
-               retry_count++;
-               if (do_retry) {
-                       msleep(100); /* allow time to settle */
-                       goto retry;
-               }
-               ret = -EIO;
-       }
-
-done:
-       if (therm) {
-               write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1);
-               msleep(100);
-               dd_dev_info(dd, "%s: Re-enable therm polling\n",
-                           __func__);
-       }
-       release_chip_resource(dd, CR_SBUS);
-done_no_mutex:
-       /* return no error if it is OK to be at current speed */
-       if (ret && !return_error) {
-               dd_dev_err(dd, "Proceeding at current speed PCIe speed\n");
-               ret = 0;
-       }
-
-       dd_dev_info(dd, "%s: done\n", __func__);
-       return ret;
-}
diff --git a/drivers/staging/rdma/hfi1/pio.c b/drivers/staging/rdma/hfi1/pio.c
deleted file mode 100644 (file)
index c67b9ad..0000000
+++ /dev/null
@@ -1,2073 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/delay.h>
-#include "hfi.h"
-#include "qp.h"
-#include "trace.h"
-
-#define SC_CTXT_PACKET_EGRESS_TIMEOUT 350 /* in chip cycles */
-
-#define SC(name) SEND_CTXT_##name
-/*
- * Send Context functions
- */
-static void sc_wait_for_packet_egress(struct send_context *sc, int pause);
-
-/*
- * Set the CM reset bit and wait for it to clear.  Use the provided
- * sendctrl register.  This routine has no locking.
- */
-void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl)
-{
-       write_csr(dd, SEND_CTRL, sendctrl | SEND_CTRL_CM_RESET_SMASK);
-       while (1) {
-               udelay(1);
-               sendctrl = read_csr(dd, SEND_CTRL);
-               if ((sendctrl & SEND_CTRL_CM_RESET_SMASK) == 0)
-                       break;
-       }
-}
-
-/* defined in header release 48 and higher */
-#ifndef SEND_CTRL_UNSUPPORTED_VL_SHIFT
-#define SEND_CTRL_UNSUPPORTED_VL_SHIFT 3
-#define SEND_CTRL_UNSUPPORTED_VL_MASK 0xffull
-#define SEND_CTRL_UNSUPPORTED_VL_SMASK (SEND_CTRL_UNSUPPORTED_VL_MASK \
-               << SEND_CTRL_UNSUPPORTED_VL_SHIFT)
-#endif
-
-/* global control of PIO send */
-void pio_send_control(struct hfi1_devdata *dd, int op)
-{
-       u64 reg, mask;
-       unsigned long flags;
-       int write = 1;  /* write sendctrl back */
-       int flush = 0;  /* re-read sendctrl to make sure it is flushed */
-
-       spin_lock_irqsave(&dd->sendctrl_lock, flags);
-
-       reg = read_csr(dd, SEND_CTRL);
-       switch (op) {
-       case PSC_GLOBAL_ENABLE:
-               reg |= SEND_CTRL_SEND_ENABLE_SMASK;
-       /* Fall through */
-       case PSC_DATA_VL_ENABLE:
-               /* Disallow sending on VLs not enabled */
-               mask = (((~0ull) << num_vls) & SEND_CTRL_UNSUPPORTED_VL_MASK) <<
-                               SEND_CTRL_UNSUPPORTED_VL_SHIFT;
-               reg = (reg & ~SEND_CTRL_UNSUPPORTED_VL_SMASK) | mask;
-               break;
-       case PSC_GLOBAL_DISABLE:
-               reg &= ~SEND_CTRL_SEND_ENABLE_SMASK;
-               break;
-       case PSC_GLOBAL_VLARB_ENABLE:
-               reg |= SEND_CTRL_VL_ARBITER_ENABLE_SMASK;
-               break;
-       case PSC_GLOBAL_VLARB_DISABLE:
-               reg &= ~SEND_CTRL_VL_ARBITER_ENABLE_SMASK;
-               break;
-       case PSC_CM_RESET:
-               __cm_reset(dd, reg);
-               write = 0; /* CSR already written (and flushed) */
-               break;
-       case PSC_DATA_VL_DISABLE:
-               reg |= SEND_CTRL_UNSUPPORTED_VL_SMASK;
-               flush = 1;
-               break;
-       default:
-               dd_dev_err(dd, "%s: invalid control %d\n", __func__, op);
-               break;
-       }
-
-       if (write) {
-               write_csr(dd, SEND_CTRL, reg);
-               if (flush)
-                       (void)read_csr(dd, SEND_CTRL); /* flush write */
-       }
-
-       spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
-}
-
-/* number of send context memory pools */
-#define NUM_SC_POOLS 2
-
-/* Send Context Size (SCS) wildcards */
-#define SCS_POOL_0 -1
-#define SCS_POOL_1 -2
-
-/* Send Context Count (SCC) wildcards */
-#define SCC_PER_VL -1
-#define SCC_PER_CPU  -2
-#define SCC_PER_KRCVQ  -3
-
-/* Send Context Size (SCS) constants */
-#define SCS_ACK_CREDITS  32
-#define SCS_VL15_CREDITS 102   /* 3 pkts of 2048B data + 128B header */
-
-#define PIO_THRESHOLD_CEILING 4096
-
-#define PIO_WAIT_BATCH_SIZE 5
-
-/* default send context sizes */
-static struct sc_config_sizes sc_config_sizes[SC_MAX] = {
-       [SC_KERNEL] = { .size  = SCS_POOL_0,    /* even divide, pool 0 */
-                       .count = SCC_PER_VL },  /* one per NUMA */
-       [SC_ACK]    = { .size  = SCS_ACK_CREDITS,
-                       .count = SCC_PER_KRCVQ },
-       [SC_USER]   = { .size  = SCS_POOL_0,    /* even divide, pool 0 */
-                       .count = SCC_PER_CPU }, /* one per CPU */
-       [SC_VL15]   = { .size  = SCS_VL15_CREDITS,
-                       .count = 1 },
-
-};
-
-/* send context memory pool configuration */
-struct mem_pool_config {
-       int centipercent;       /* % of memory, in 100ths of 1% */
-       int absolute_blocks;    /* absolute block count */
-};
-
-/* default memory pool configuration: 100% in pool 0 */
-static struct mem_pool_config sc_mem_pool_config[NUM_SC_POOLS] = {
-       /* centi%, abs blocks */
-       {  10000,     -1 },             /* pool 0 */
-       {      0,     -1 },             /* pool 1 */
-};
-
-/* memory pool information, used when calculating final sizes */
-struct mem_pool_info {
-       int centipercent;       /*
-                                * 100th of 1% of memory to use, -1 if blocks
-                                * already set
-                                */
-       int count;              /* count of contexts in the pool */
-       int blocks;             /* block size of the pool */
-       int size;               /* context size, in blocks */
-};
-
-/*
- * Convert a pool wildcard to a valid pool index.  The wildcards
- * start at -1 and increase negatively.  Map them as:
- *     -1 => 0
- *     -2 => 1
- *     etc.
- *
- * Return -1 on non-wildcard input, otherwise convert to a pool number.
- */
-static int wildcard_to_pool(int wc)
-{
-       if (wc >= 0)
-               return -1;      /* non-wildcard */
-       return -wc - 1;
-}
-
-static const char *sc_type_names[SC_MAX] = {
-       "kernel",
-       "ack",
-       "user",
-       "vl15"
-};
-
-static const char *sc_type_name(int index)
-{
-       if (index < 0 || index >= SC_MAX)
-               return "unknown";
-       return sc_type_names[index];
-}
-
-/*
- * Read the send context memory pool configuration and send context
- * size configuration.  Replace any wildcards and come up with final
- * counts and sizes for the send context types.
- */
-int init_sc_pools_and_sizes(struct hfi1_devdata *dd)
-{
-       struct mem_pool_info mem_pool_info[NUM_SC_POOLS] = { { 0 } };
-       int total_blocks = (dd->chip_pio_mem_size / PIO_BLOCK_SIZE) - 1;
-       int total_contexts = 0;
-       int fixed_blocks;
-       int pool_blocks;
-       int used_blocks;
-       int cp_total;           /* centipercent total */
-       int ab_total;           /* absolute block total */
-       int extra;
-       int i;
-
-       /*
-        * When SDMA is enabled, kernel context pio packet size is capped by
-        * "piothreshold". Reduce pio buffer allocation for kernel context by
-        * setting it to a fixed size. The allocation allows 3-deep buffering
-        * of the largest pio packets plus up to 128 bytes header, sufficient
-        * to maintain verbs performance.
-        *
-        * When SDMA is disabled, keep the default pooling allocation.
-        */
-       if (HFI1_CAP_IS_KSET(SDMA)) {
-               u16 max_pkt_size = (piothreshold < PIO_THRESHOLD_CEILING) ?
-                                        piothreshold : PIO_THRESHOLD_CEILING;
-               sc_config_sizes[SC_KERNEL].size =
-                       3 * (max_pkt_size + 128) / PIO_BLOCK_SIZE;
-       }
-
-       /*
-        * Step 0:
-        *      - copy the centipercents/absolute sizes from the pool config
-        *      - sanity check these values
-        *      - add up centipercents, then later check for full value
-        *      - add up absolute blocks, then later check for over-commit
-        */
-       cp_total = 0;
-       ab_total = 0;
-       for (i = 0; i < NUM_SC_POOLS; i++) {
-               int cp = sc_mem_pool_config[i].centipercent;
-               int ab = sc_mem_pool_config[i].absolute_blocks;
-
-               /*
-                * A negative value is "unused" or "invalid".  Both *can*
-                * be valid, but centipercent wins, so check that first
-                */
-               if (cp >= 0) {                  /* centipercent valid */
-                       cp_total += cp;
-               } else if (ab >= 0) {           /* absolute blocks valid */
-                       ab_total += ab;
-               } else {                        /* neither valid */
-                       dd_dev_err(
-                               dd,
-                               "Send context memory pool %d: both the block count and centipercent are invalid\n",
-                               i);
-                       return -EINVAL;
-               }
-
-               mem_pool_info[i].centipercent = cp;
-               mem_pool_info[i].blocks = ab;
-       }
-
-       /* do not use both % and absolute blocks for different pools */
-       if (cp_total != 0 && ab_total != 0) {
-               dd_dev_err(
-                       dd,
-                       "All send context memory pools must be described as either centipercent or blocks, no mixing between pools\n");
-               return -EINVAL;
-       }
-
-       /* if any percentages are present, they must add up to 100% x 100 */
-       if (cp_total != 0 && cp_total != 10000) {
-               dd_dev_err(
-                       dd,
-                       "Send context memory pool centipercent is %d, expecting 10000\n",
-                       cp_total);
-               return -EINVAL;
-       }
-
-       /* the absolute pool total cannot be more than the mem total */
-       if (ab_total > total_blocks) {
-               dd_dev_err(
-                       dd,
-                       "Send context memory pool absolute block count %d is larger than the memory size %d\n",
-                       ab_total, total_blocks);
-               return -EINVAL;
-       }
-
-       /*
-        * Step 2:
-        *      - copy from the context size config
-        *      - replace context type wildcard counts with real values
-        *      - add up non-memory pool block sizes
-        *      - add up memory pool user counts
-        */
-       fixed_blocks = 0;
-       for (i = 0; i < SC_MAX; i++) {
-               int count = sc_config_sizes[i].count;
-               int size = sc_config_sizes[i].size;
-               int pool;
-
-               /*
-                * Sanity check count: Either a positive value or
-                * one of the expected wildcards is valid.  The positive
-                * value is checked later when we compare against total
-                * memory available.
-                */
-               if (i == SC_ACK) {
-                       count = dd->n_krcv_queues;
-               } else if (i == SC_KERNEL) {
-                       count = INIT_SC_PER_VL * num_vls;
-               } else if (count == SCC_PER_CPU) {
-                       count = dd->num_rcv_contexts - dd->n_krcv_queues;
-               } else if (count < 0) {
-                       dd_dev_err(
-                               dd,
-                               "%s send context invalid count wildcard %d\n",
-                               sc_type_name(i), count);
-                       return -EINVAL;
-               }
-               if (total_contexts + count > dd->chip_send_contexts)
-                       count = dd->chip_send_contexts - total_contexts;
-
-               total_contexts += count;
-
-               /*
-                * Sanity check pool: The conversion will return a pool
-                * number or -1 if a fixed (non-negative) value.  The fixed
-                * value is checked later when we compare against
-                * total memory available.
-                */
-               pool = wildcard_to_pool(size);
-               if (pool == -1) {                       /* non-wildcard */
-                       fixed_blocks += size * count;
-               } else if (pool < NUM_SC_POOLS) {       /* valid wildcard */
-                       mem_pool_info[pool].count += count;
-               } else {                                /* invalid wildcard */
-                       dd_dev_err(
-                               dd,
-                               "%s send context invalid pool wildcard %d\n",
-                               sc_type_name(i), size);
-                       return -EINVAL;
-               }
-
-               dd->sc_sizes[i].count = count;
-               dd->sc_sizes[i].size = size;
-       }
-       if (fixed_blocks > total_blocks) {
-               dd_dev_err(
-                       dd,
-                       "Send context fixed block count, %u, larger than total block count %u\n",
-                       fixed_blocks, total_blocks);
-               return -EINVAL;
-       }
-
-       /* step 3: calculate the blocks in the pools, and pool context sizes */
-       pool_blocks = total_blocks - fixed_blocks;
-       if (ab_total > pool_blocks) {
-               dd_dev_err(
-                       dd,
-                       "Send context fixed pool sizes, %u, larger than pool block count %u\n",
-                       ab_total, pool_blocks);
-               return -EINVAL;
-       }
-       /* subtract off the fixed pool blocks */
-       pool_blocks -= ab_total;
-
-       for (i = 0; i < NUM_SC_POOLS; i++) {
-               struct mem_pool_info *pi = &mem_pool_info[i];
-
-               /* % beats absolute blocks */
-               if (pi->centipercent >= 0)
-                       pi->blocks = (pool_blocks * pi->centipercent) / 10000;
-
-               if (pi->blocks == 0 && pi->count != 0) {
-                       dd_dev_err(
-                               dd,
-                               "Send context memory pool %d has %u contexts, but no blocks\n",
-                               i, pi->count);
-                       return -EINVAL;
-               }
-               if (pi->count == 0) {
-                       /* warn about wasted blocks */
-                       if (pi->blocks != 0)
-                               dd_dev_err(
-                                       dd,
-                                       "Send context memory pool %d has %u blocks, but zero contexts\n",
-                                       i, pi->blocks);
-                       pi->size = 0;
-               } else {
-                       pi->size = pi->blocks / pi->count;
-               }
-       }
-
-       /* step 4: fill in the context type sizes from the pool sizes */
-       used_blocks = 0;
-       for (i = 0; i < SC_MAX; i++) {
-               if (dd->sc_sizes[i].size < 0) {
-                       unsigned pool = wildcard_to_pool(dd->sc_sizes[i].size);
-
-                       WARN_ON_ONCE(pool >= NUM_SC_POOLS);
-                       dd->sc_sizes[i].size = mem_pool_info[pool].size;
-               }
-               /* make sure we are not larger than what is allowed by the HW */
-#define PIO_MAX_BLOCKS 1024
-               if (dd->sc_sizes[i].size > PIO_MAX_BLOCKS)
-                       dd->sc_sizes[i].size = PIO_MAX_BLOCKS;
-
-               /* calculate our total usage */
-               used_blocks += dd->sc_sizes[i].size * dd->sc_sizes[i].count;
-       }
-       extra = total_blocks - used_blocks;
-       if (extra != 0)
-               dd_dev_info(dd, "unused send context blocks: %d\n", extra);
-
-       return total_contexts;
-}
-
-int init_send_contexts(struct hfi1_devdata *dd)
-{
-       u16 base;
-       int ret, i, j, context;
-
-       ret = init_credit_return(dd);
-       if (ret)
-               return ret;
-
-       dd->hw_to_sw = kmalloc_array(TXE_NUM_CONTEXTS, sizeof(u8),
-                                       GFP_KERNEL);
-       dd->send_contexts = kcalloc(dd->num_send_contexts,
-                                       sizeof(struct send_context_info),
-                                       GFP_KERNEL);
-       if (!dd->send_contexts || !dd->hw_to_sw) {
-               kfree(dd->hw_to_sw);
-               kfree(dd->send_contexts);
-               free_credit_return(dd);
-               return -ENOMEM;
-       }
-
-       /* hardware context map starts with invalid send context indices */
-       for (i = 0; i < TXE_NUM_CONTEXTS; i++)
-               dd->hw_to_sw[i] = INVALID_SCI;
-
-       /*
-        * All send contexts have their credit sizes.  Allocate credits
-        * for each context one after another from the global space.
-        */
-       context = 0;
-       base = 1;
-       for (i = 0; i < SC_MAX; i++) {
-               struct sc_config_sizes *scs = &dd->sc_sizes[i];
-
-               for (j = 0; j < scs->count; j++) {
-                       struct send_context_info *sci =
-                                               &dd->send_contexts[context];
-                       sci->type = i;
-                       sci->base = base;
-                       sci->credits = scs->size;
-
-                       context++;
-                       base += scs->size;
-               }
-       }
-
-       return 0;
-}
-
-/*
- * Allocate a software index and hardware context of the given type.
- *
- * Must be called with dd->sc_lock held.
- */
-static int sc_hw_alloc(struct hfi1_devdata *dd, int type, u32 *sw_index,
-                      u32 *hw_context)
-{
-       struct send_context_info *sci;
-       u32 index;
-       u32 context;
-
-       for (index = 0, sci = &dd->send_contexts[0];
-                       index < dd->num_send_contexts; index++, sci++) {
-               if (sci->type == type && sci->allocated == 0) {
-                       sci->allocated = 1;
-                       /* use a 1:1 mapping, but make them non-equal */
-                       context = dd->chip_send_contexts - index - 1;
-                       dd->hw_to_sw[context] = index;
-                       *sw_index = index;
-                       *hw_context = context;
-                       return 0; /* success */
-               }
-       }
-       dd_dev_err(dd, "Unable to locate a free type %d send context\n", type);
-       return -ENOSPC;
-}
-
-/*
- * Free the send context given by its software index.
- *
- * Must be called with dd->sc_lock held.
- */
-static void sc_hw_free(struct hfi1_devdata *dd, u32 sw_index, u32 hw_context)
-{
-       struct send_context_info *sci;
-
-       sci = &dd->send_contexts[sw_index];
-       if (!sci->allocated) {
-               dd_dev_err(dd, "%s: sw_index %u not allocated? hw_context %u\n",
-                          __func__, sw_index, hw_context);
-       }
-       sci->allocated = 0;
-       dd->hw_to_sw[hw_context] = INVALID_SCI;
-}
-
-/* return the base context of a context in a group */
-static inline u32 group_context(u32 context, u32 group)
-{
-       return (context >> group) << group;
-}
-
-/* return the size of a group */
-static inline u32 group_size(u32 group)
-{
-       return 1 << group;
-}
-
-/*
- * Obtain the credit return addresses, kernel virtual and physical, for the
- * given sc.
- *
- * To understand this routine:
- * o va and pa are arrays of struct credit_return.  One for each physical
- *   send context, per NUMA.
- * o Each send context always looks in its relative location in a struct
- *   credit_return for its credit return.
- * o Each send context in a group must have its return address CSR programmed
- *   with the same value.  Use the address of the first send context in the
- *   group.
- */
-static void cr_group_addresses(struct send_context *sc, dma_addr_t *pa)
-{
-       u32 gc = group_context(sc->hw_context, sc->group);
-       u32 index = sc->hw_context & 0x7;
-
-       sc->hw_free = &sc->dd->cr_base[sc->node].va[gc].cr[index];
-       *pa = (unsigned long)
-              &((struct credit_return *)sc->dd->cr_base[sc->node].pa)[gc];
-}
-
-/*
- * Work queue function triggered in error interrupt routine for
- * kernel contexts.
- */
-static void sc_halted(struct work_struct *work)
-{
-       struct send_context *sc;
-
-       sc = container_of(work, struct send_context, halt_work);
-       sc_restart(sc);
-}
-
-/*
- * Calculate PIO block threshold for this send context using the given MTU.
- * Trigger a return when one MTU plus optional header of credits remain.
- *
- * Parameter mtu is in bytes.
- * Parameter hdrqentsize is in DWORDs.
- *
- * Return value is what to write into the CSR: trigger return when
- * unreturned credits pass this count.
- */
-u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize)
-{
-       u32 release_credits;
-       u32 threshold;
-
-       /* add in the header size, then divide by the PIO block size */
-       mtu += hdrqentsize << 2;
-       release_credits = DIV_ROUND_UP(mtu, PIO_BLOCK_SIZE);
-
-       /* check against this context's credits */
-       if (sc->credits <= release_credits)
-               threshold = 1;
-       else
-               threshold = sc->credits - release_credits;
-
-       return threshold;
-}
-
-/*
- * Calculate credit threshold in terms of percent of the allocated credits.
- * Trigger when unreturned credits equal or exceed the percentage of the whole.
- *
- * Return value is what to write into the CSR: trigger return when
- * unreturned credits pass this count.
- */
-u32 sc_percent_to_threshold(struct send_context *sc, u32 percent)
-{
-       return (sc->credits * percent) / 100;
-}
-
-/*
- * Set the credit return threshold.
- */
-void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold)
-{
-       unsigned long flags;
-       u32 old_threshold;
-       int force_return = 0;
-
-       spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
-
-       old_threshold = (sc->credit_ctrl >>
-                               SC(CREDIT_CTRL_THRESHOLD_SHIFT))
-                        & SC(CREDIT_CTRL_THRESHOLD_MASK);
-
-       if (new_threshold != old_threshold) {
-               sc->credit_ctrl =
-                       (sc->credit_ctrl
-                               & ~SC(CREDIT_CTRL_THRESHOLD_SMASK))
-                       | ((new_threshold
-                               & SC(CREDIT_CTRL_THRESHOLD_MASK))
-                          << SC(CREDIT_CTRL_THRESHOLD_SHIFT));
-               write_kctxt_csr(sc->dd, sc->hw_context,
-                               SC(CREDIT_CTRL), sc->credit_ctrl);
-
-               /* force a credit return on change to avoid a possible stall */
-               force_return = 1;
-       }
-
-       spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
-
-       if (force_return)
-               sc_return_credits(sc);
-}
-
-/*
- * set_pio_integrity
- *
- * Set the CHECK_ENABLE register for the send context 'sc'.
- */
-void set_pio_integrity(struct send_context *sc)
-{
-       struct hfi1_devdata *dd = sc->dd;
-       u64 reg = 0;
-       u32 hw_context = sc->hw_context;
-       int type = sc->type;
-
-       /*
-        * No integrity checks if HFI1_CAP_NO_INTEGRITY is set, or if
-        * we're snooping.
-        */
-       if (likely(!HFI1_CAP_IS_KSET(NO_INTEGRITY)) &&
-           dd->hfi1_snoop.mode_flag != HFI1_PORT_SNOOP_MODE)
-               reg = hfi1_pkt_default_send_ctxt_mask(dd, type);
-
-       write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), reg);
-}
-
-static u32 get_buffers_allocated(struct send_context *sc)
-{
-       int cpu;
-       u32 ret = 0;
-
-       for_each_possible_cpu(cpu)
-               ret += *per_cpu_ptr(sc->buffers_allocated, cpu);
-       return ret;
-}
-
-static void reset_buffers_allocated(struct send_context *sc)
-{
-       int cpu;
-
-       for_each_possible_cpu(cpu)
-               (*per_cpu_ptr(sc->buffers_allocated, cpu)) = 0;
-}
-
-/*
- * Allocate a NUMA relative send context structure of the given type along
- * with a HW context.
- *
- * @dd: device the context belongs to
- * @type: send context type; SC_USER and SC_ACK get their own handling
- *        below, every other type takes the kernel path
- * @hdrqentsize: passed to sc_mtu_to_threshold() when computing the kernel
- *               credit return threshold
- * @numa: NUMA node used for the kzalloc_node() allocations
- *
- * Returns the new send context, or NULL if the device is frozen, a memory
- * allocation fails, or sc_hw_alloc() fails.  The per-context CSRs are
- * programmed while holding dd->sc_lock.  sc_enable() is not called here.
- */
-struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
-                             uint hdrqentsize, int numa)
-{
-       struct send_context_info *sci;
-       struct send_context *sc = NULL;
-       dma_addr_t pa;
-       unsigned long flags;
-       u64 reg;
-       u32 thresh;
-       u32 sw_index;
-       u32 hw_context;
-       int ret;
-       u8 opval, opmask;
-
-       /* do not allocate while frozen */
-       if (dd->flags & HFI1_FROZEN)
-               return NULL;
-
-       sc = kzalloc_node(sizeof(*sc), GFP_KERNEL, numa);
-       if (!sc)
-               return NULL;
-
-       /* per-CPU count of outstanding buffers, summed in sc_restart() */
-       sc->buffers_allocated = alloc_percpu(u32);
-       if (!sc->buffers_allocated) {
-               kfree(sc);
-               dd_dev_err(dd,
-                          "Cannot allocate buffers_allocated per cpu counters\n"
-                         );
-               return NULL;
-       }
-
-       /* sc_lock is held from the HW allocation through all CSR setup */
-       spin_lock_irqsave(&dd->sc_lock, flags);
-       ret = sc_hw_alloc(dd, type, &sw_index, &hw_context);
-       if (ret) {
-               spin_unlock_irqrestore(&dd->sc_lock, flags);
-               free_percpu(sc->buffers_allocated);
-               kfree(sc);
-               return NULL;
-       }
-
-       /* publish the context in the device's send context table */
-       sci = &dd->send_contexts[sw_index];
-       sci->sc = sc;
-
-       sc->dd = dd;
-       sc->node = numa;
-       sc->type = type;
-       spin_lock_init(&sc->alloc_lock);
-       spin_lock_init(&sc->release_lock);
-       spin_lock_init(&sc->credit_ctrl_lock);
-       INIT_LIST_HEAD(&sc->piowait);
-       INIT_WORK(&sc->halt_work, sc_halted);
-       init_waitqueue_head(&sc->halt_wait);
-
-       /* grouping is always single context for now */
-       sc->group = 0;
-
-       sc->sw_index = sw_index;
-       sc->hw_context = hw_context;
-       /* 'pa' is written to the CREDIT_RETURN_ADDR CSR further down */
-       cr_group_addresses(sc, &pa);
-       sc->credits = sci->credits;
-
-/* PIO Send Memory Address details */
-#define PIO_ADDR_CONTEXT_MASK 0xfful
-#define PIO_ADDR_CONTEXT_SHIFT 16
-       sc->base_addr = dd->piobase + ((hw_context & PIO_ADDR_CONTEXT_MASK)
-                                       << PIO_ADDR_CONTEXT_SHIFT);
-
-       /* set base and credits */
-       reg = ((sci->credits & SC(CTRL_CTXT_DEPTH_MASK))
-                                       << SC(CTRL_CTXT_DEPTH_SHIFT))
-               | ((sci->base & SC(CTRL_CTXT_BASE_MASK))
-                                       << SC(CTRL_CTXT_BASE_SHIFT));
-       write_kctxt_csr(dd, hw_context, SC(CTRL), reg);
-
-       set_pio_integrity(sc);
-
-       /* unmask all errors */
-       write_kctxt_csr(dd, hw_context, SC(ERR_MASK), (u64)-1);
-
-       /* set the default partition key */
-       write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY),
-                       (SC(CHECK_PARTITION_KEY_VALUE_MASK) &
-                        DEFAULT_PKEY) <<
-                       SC(CHECK_PARTITION_KEY_VALUE_SHIFT));
-
-       /* per context type checks */
-       if (type == SC_USER) {
-               opval = USER_OPCODE_CHECK_VAL;
-               opmask = USER_OPCODE_CHECK_MASK;
-       } else {
-               opval = OPCODE_CHECK_VAL_DISABLED;
-               opmask = OPCODE_CHECK_MASK_DISABLED;
-       }
-
-       /* set the send context check opcode mask and value */
-       write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE),
-                       ((u64)opmask << SC(CHECK_OPCODE_MASK_SHIFT)) |
-                       ((u64)opval << SC(CHECK_OPCODE_VALUE_SHIFT)));
-
-       /* set up credit return */
-       reg = pa & SC(CREDIT_RETURN_ADDR_ADDRESS_SMASK);
-       write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), reg);
-
-       /*
-        * Calculate the initial credit return threshold.
-        *
-        * For Ack contexts, set a threshold for half the credits.
-        * For User contexts use the given percentage.  This has been
-        * sanitized on driver start-up.
-        * For Kernel contexts, use the default MTU plus a header
-        * or half the credits, whichever is smaller. This should
-        * work for both the 3-deep buffering allocation and the
-        * pooling allocation.
-        */
-       if (type == SC_ACK) {
-               thresh = sc_percent_to_threshold(sc, 50);
-       } else if (type == SC_USER) {
-               thresh = sc_percent_to_threshold(sc,
-                                                user_credit_return_threshold);
-       } else { /* kernel */
-               thresh = min(sc_percent_to_threshold(sc, 50),
-                            sc_mtu_to_threshold(sc, hfi1_max_mtu,
-                                                hdrqentsize));
-       }
-       reg = thresh << SC(CREDIT_CTRL_THRESHOLD_SHIFT);
-       /* add in early return */
-       if (type == SC_USER && HFI1_CAP_IS_USET(EARLY_CREDIT_RETURN))
-               reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK);
-       else if (HFI1_CAP_IS_KSET(EARLY_CREDIT_RETURN)) /* kernel, ack */
-               reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK);
-
-       /* set up write-through credit_ctrl */
-       sc->credit_ctrl = reg;
-       write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), reg);
-
-       /* User send contexts should not allow sending on VL15 */
-       if (type == SC_USER) {
-               reg = 1ULL << 15;
-               write_kctxt_csr(dd, hw_context, SC(CHECK_VL), reg);
-       }
-
-       spin_unlock_irqrestore(&dd->sc_lock, flags);
-
-       /*
-        * Allocate shadow ring to track outstanding PIO buffers _after_
-        * unlocking.  We don't know the size until the lock is held and
-        * we can't allocate while the lock is held.  No one is using
-        * the context yet, so allocate it now.
-        *
-        * User contexts do not get a shadow ring.
-        */
-       if (type != SC_USER) {
-               /*
-                * Size the shadow ring 1 larger than the number of credits
-                * so head == tail can mean empty.
-                */
-               sc->sr_size = sci->credits + 1;
-               sc->sr = kzalloc_node(sizeof(union pio_shadow_ring) *
-                               sc->sr_size, GFP_KERNEL, numa);
-               if (!sc->sr) {
-                       /* sc_free() undoes the HW allocation and CSR setup */
-                       sc_free(sc);
-                       return NULL;
-               }
-       }
-
-       hfi1_cdbg(PIO,
-                 "Send context %u(%u) %s group %u credits %u credit_ctrl 0x%llx threshold %u\n",
-                 sw_index,
-                 hw_context,
-                 sc_type_name(type),
-                 sc->group,
-                 sc->credits,
-                 sc->credit_ctrl,
-                 thresh);
-
-       return sc;
-}
-
-/*
- * Free a per-NUMA send context structure.
- *
- * Marks the context SCF_IN_FREE, disables it and waits for any queued
- * halt_work to finish.  Then, under dd->sc_lock, removes the context from
- * dd->send_contexts[], zeroes the CSRs that sc_alloc programmed and
- * releases the sw index / hw context.  Finally frees the shadow ring,
- * the per-CPU counters and the structure itself.  A NULL 'sc' is ignored.
- */
-void sc_free(struct send_context *sc)
-{
-       struct hfi1_devdata *dd;
-       unsigned long flags;
-       u32 sw_index;
-       u32 hw_context;
-
-       if (!sc)
-               return;
-
-       /* sc_restart() bails out when it sees SCF_IN_FREE */
-       sc->flags |= SCF_IN_FREE;       /* ensure no restarts */
-       dd = sc->dd;
-       if (!list_empty(&sc->piowait))
-               dd_dev_err(dd, "piowait list not empty!\n");
-       sw_index = sc->sw_index;
-       hw_context = sc->hw_context;
-       sc_disable(sc); /* make sure the HW is disabled */
-       flush_work(&sc->halt_work);
-
-       spin_lock_irqsave(&dd->sc_lock, flags);
-       dd->send_contexts[sw_index].sc = NULL;
-
-       /* clear/disable all registers set in sc_alloc */
-       write_kctxt_csr(dd, hw_context, SC(CTRL), 0);
-       write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), 0);
-       write_kctxt_csr(dd, hw_context, SC(ERR_MASK), 0);
-       write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY), 0);
-       write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE), 0);
-       write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), 0);
-       write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), 0);
-
-       /* release the index and context for re-use */
-       sc_hw_free(dd, sw_index, hw_context);
-       spin_unlock_irqrestore(&dd->sc_lock, flags);
-
-       /* sc->sr is NULL for user contexts; kfree(NULL) is a no-op */
-       kfree(sc->sr);
-       free_percpu(sc->buffers_allocated);
-       kfree(sc);
-}
-
-/*
- * Disable the context.
- *
- * Under alloc_lock: clears SCF_ENABLED, waits for packet egress (with the
- * credit return pause) and writes SC(CTRL) with the enable bit cleared.
- * Then, under release_lock, walks the shadow ring from tail to head and
- * calls each buffer's callback, if any, with PRC_SC_DISABLE.
- * A NULL 'sc' is ignored.
- */
-void sc_disable(struct send_context *sc)
-{
-       u64 reg;
-       unsigned long flags;
-       struct pio_buf *pbuf;
-
-       if (!sc)
-               return;
-
-       /* do all steps, even if already disabled */
-       spin_lock_irqsave(&sc->alloc_lock, flags);
-       reg = read_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL));
-       reg &= ~SC(CTRL_CTXT_ENABLE_SMASK);
-       /* stop new allocations before waiting for the egress to drain */
-       sc->flags &= ~SCF_ENABLED;
-       sc_wait_for_packet_egress(sc, 1);
-       write_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL), reg);
-       spin_unlock_irqrestore(&sc->alloc_lock, flags);
-
-       /*
-        * Flush any waiters.  Once the context is disabled,
-        * credit return interrupts are stopped (although there
-        * could be one in-process when the context is disabled).
-        * Wait one microsecond for any lingering interrupts, then
-        * proceed with the flush.
-        */
-       udelay(1);
-       spin_lock_irqsave(&sc->release_lock, flags);
-       if (sc->sr) {   /* this context has a shadow ring */
-               while (sc->sr_tail != sc->sr_head) {
-                       pbuf = &sc->sr[sc->sr_tail].pbuf;
-                       if (pbuf->cb)
-                               (*pbuf->cb)(pbuf->arg, PRC_SC_DISABLE);
-                       /* advance the tail, wrapping at the ring size */
-                       sc->sr_tail++;
-                       if (sc->sr_tail >= sc->sr_size)
-                               sc->sr_tail = 0;
-               }
-       }
-       spin_unlock_irqrestore(&sc->release_lock, flags);
-}
-
-/* return SendEgressCtxtStatus.PacketOccupancy */
-#define packet_occupancy(r) \
-       (((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK)\
-       >> SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT)
-
-/* is egress halted on the context? */
-#define egress_halted(r) \
-       ((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK)
-
-/*
- * Wait for packet egress, optionally pause for credit return.
- *
- * Polls the context's SEND_EGRESS_CTXT_STATUS register, one microsecond
- * apart, until egress is halted or the packet occupancy reads zero.  If
- * the occupancy stays unchanged for more than 500 consecutive polls, an
- * error is logged, link_bounce_work is queued and the wait is abandoned.
- *
- * @sc: the send context to wait on
- * @pause: when non-zero, call pause_for_credit_return() before returning
- */
-static void sc_wait_for_packet_egress(struct send_context *sc, int pause)
-{
-       struct hfi1_devdata *dd = sc->dd;
-       u64 reg = 0;
-       u64 reg_prev;
-       u32 loop = 0;
-
-       while (1) {
-               /* after the first pass 'reg' holds the previous occupancy */
-               reg_prev = reg;
-               reg = read_csr(dd, sc->hw_context * 8 +
-                              SEND_EGRESS_CTXT_STATUS);
-               /* done if egress is stopped */
-               if (egress_halted(reg))
-                       break;
-               reg = packet_occupancy(reg);
-               if (reg == 0)
-                       break;
-               /* counter is reset if occupancy count changes */
-               if (reg != reg_prev)
-                       loop = 0;
-               if (loop > 500) {
-                       /* timed out - bounce the link */
-                       dd_dev_err(dd,
-                                  "%s: context %u(%u) timeout waiting for packets to egress, remaining count %u, bouncing link\n",
-                                  __func__, sc->sw_index,
-                                  sc->hw_context, (u32)reg);
-                       queue_work(dd->pport->hfi1_wq,
-                                  &dd->pport->link_bounce_work);
-                       break;
-               }
-               loop++;
-               udelay(1);
-       }
-
-       if (pause)
-               /* Add additional delay to ensure chip returns all credits */
-               pause_for_credit_return(dd);
-}
-
-/*
- * Wait for packet egress on every allocated send context of 'dd',
- * without the additional credit return pause.
- */
-void sc_wait(struct hfi1_devdata *dd)
-{
-       struct send_context *sc;
-       int idx;
-
-       for (idx = 0; idx < dd->num_send_contexts; idx++) {
-               sc = dd->send_contexts[idx].sc;
-               if (sc)
-                       sc_wait_for_packet_egress(sc, 0);
-       }
-}
-
-/*
- * Restart a context after it has been halted due to error.
- *
- * If the first step fails - wait for the halt to be asserted, return early.
- * Otherwise complain about timeouts but keep going.
- *
- * It is expected that allocations (enabled flag bit) have been shut off
- * already (only applies to kernel contexts).
- *
- * Returns -EINVAL if the context is not marked halted or is being freed,
- * -ETIME if the hardware never reports the halt, otherwise the result of
- * sc_enable().
- */
-int sc_restart(struct send_context *sc)
-{
-       struct hfi1_devdata *dd = sc->dd;
-       u64 reg;
-       u32 loop;
-       int count;
-       bool timeout_reported = false;
-
-       /* bounce off if not halted, or being free'd */
-       if (!(sc->flags & SCF_HALTED) || (sc->flags & SCF_IN_FREE))
-               return -EINVAL;
-
-       dd_dev_info(dd, "restarting send context %u(%u)\n", sc->sw_index,
-                   sc->hw_context);
-
-       /*
-        * Step 1: Wait for the context to actually halt.
-        *
-        * The error interrupt is asynchronous to actually setting halt
-        * on the context.
-        */
-       loop = 0;
-       while (1) {
-               reg = read_kctxt_csr(dd, sc->hw_context, SC(STATUS));
-               if (reg & SC(STATUS_CTXT_HALTED_SMASK))
-                       break;
-               if (loop > 100) {
-                       dd_dev_err(dd, "%s: context %u(%u) not halting, skipping\n",
-                                  __func__, sc->sw_index, sc->hw_context);
-                       return -ETIME;
-               }
-               loop++;
-               udelay(1);
-       }
-
-       /*
-        * Step 2: Ensure no users are still trying to write to PIO.
-        *
-        * For kernel contexts, we have already turned off buffer allocation.
-        * Now wait for the buffer count to go to zero.
-        *
-        * For user contexts, the user handling code has cut off write access
-        * to the context's PIO pages before calling this routine and will
-        * restore write access after this routine returns.
-        */
-       if (sc->type != SC_USER) {
-               /* kernel context */
-               loop = 0;
-               while (1) {
-                       count = get_buffers_allocated(sc);
-                       if (count == 0)
-                               break;
-                       /*
-                        * Complain once when the wait exceeds the expected
-                        * time, then keep waiting.  Reporting on every
-                        * pass would emit a message each microsecond.
-                        */
-                       if (loop > 100 && !timeout_reported) {
-                               dd_dev_err(dd,
-                                          "%s: context %u(%u) timeout waiting for PIO buffers to zero, remaining %d\n",
-                                          __func__, sc->sw_index,
-                                          sc->hw_context, count);
-                               timeout_reported = true;
-                       }
-                       loop++;
-                       udelay(1);
-               }
-       }
-
-       /*
-        * Step 3: Wait for all packets to egress.
-        * This is done while disabling the send context
-        *
-        * Step 4: Disable the context
-        *
-        * This is a superset of the halt.  After the disable, the
-        * errors can be cleared.
-        */
-       sc_disable(sc);
-
-       /*
-        * Step 5: Enable the context
-        *
-        * This enable will clear the halted flag and per-send context
-        * error flags.
-        */
-       return sc_enable(sc);
-}
-
-/*
- * PIO freeze processing; call once the TXE block is fully frozen.
- * Every allocated, frozen, non-user send context is disabled.  The freeze
- * itself has already stopped those contexts.
- */
-void pio_freeze(struct hfi1_devdata *dd)
-{
-       int idx;
-
-       for (idx = 0; idx < dd->num_send_contexts; idx++) {
-               struct send_context *sc = dd->send_contexts[idx].sc;
-
-               /* nothing allocated in this slot */
-               if (!sc)
-                       continue;
-               /*
-                * User send contexts will be disabled when the process
-                * calls into the driver to reset its context.
-                */
-               if (sc->type == SC_USER)
-                       continue;
-               /* already stopped by the freeze, so a disable is enough */
-               if (sc->flags & SCF_FROZEN)
-                       sc_disable(sc);
-       }
-}
-
-/*
- * Unfreeze PIO for kernel send contexts.  Callers must first have disabled
- * all PIO send contexts and cleared the SPC freeze; this routine does the
- * final step of re-enabling each frozen kernel context.  User (PSM)
- * contexts are handled when PSM calls into the kernel to acknowledge the
- * freeze.
- */
-void pio_kernel_unfreeze(struct hfi1_devdata *dd)
-{
-       int idx;
-
-       for (idx = 0; idx < dd->num_send_contexts; idx++) {
-               struct send_context *sc = dd->send_contexts[idx].sc;
-
-               if (!sc)
-                       continue;
-               if (sc->type == SC_USER)
-                       continue;
-               /* the enable also clears the sc frozen flag */
-               if (sc->flags & SCF_FROZEN)
-                       sc_enable(sc);
-       }
-}
-
-/*
- * Poll until the SendPioInitCtxt.PioInitInProgress bit clears.
- * Returns:
- *     0          - the init finished without the error bit set
- *     -ETIMEDOUT - the bit was still set after the maximum number of polls
- *     -EIO       - the init finished with the error bit set
- */
-static int pio_init_wait_progress(struct hfi1_devdata *dd)
-{
-       u64 reg;
-       int polls, max_polls;
-
-       /* max_polls is the longest possible HW init time / delay */
-       if (dd->icode == ICODE_FPGA_EMULATION)
-               max_polls = 120;
-       else
-               max_polls = 5;
-
-       for (polls = 0; ; polls++) {
-               reg = read_csr(dd, SEND_PIO_INIT_CTXT);
-               if (!(reg & SEND_PIO_INIT_CTXT_PIO_INIT_IN_PROGRESS_SMASK))
-                       break;
-               if (polls >= max_polls)
-                       return -ETIMEDOUT;
-               udelay(5);
-       }
-
-       if (reg & SEND_PIO_INIT_CTXT_PIO_INIT_ERR_SMASK)
-               return -EIO;
-       return 0;
-}
-
-/*
- * Put every send context back into its power-on state.  This is only
- * used during manual init, so no lock against sc_enable is taken.
- */
-void pio_reset_all(struct hfi1_devdata *dd)
-{
-       int ret;
-
-       /*
-        * Make sure the init engine is not busy.  A timeout is ignored;
-        * a reported error is cleared before continuing.
-        */
-       if (pio_init_wait_progress(dd) == -EIO)
-               write_csr(dd, SEND_PIO_ERR_CLEAR,
-                         SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK);
-
-       /* request an init of all contexts and wait for it to finish */
-       write_csr(dd, SEND_PIO_INIT_CTXT,
-                 SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK);
-       udelay(2);
-       ret = pio_init_wait_progress(dd);
-       if (ret >= 0)
-               return;
-
-       dd_dev_err(dd,
-                  "PIO send context init %s while initializing all PIO blocks\n",
-                  ret == -ETIMEDOUT ? "is stuck" : "had an error");
-}
-
-/*
- * Enable the context.
- *
- * If the enable bit in SC(CTRL) is clear, resets the software accounting
- * (free/fill counters, shadow ring indices, flags, per-CPU buffer counts),
- * clears pending per-context errors, runs the HW PIO init engine for this
- * context and then sets the enable bit and SCF_ENABLED.
- *
- * Returns -EINVAL for a NULL 'sc', 0 if the context was already enabled
- * or has now been enabled, otherwise the error from
- * pio_init_wait_progress().
- */
-int sc_enable(struct send_context *sc)
-{
-       u64 sc_ctrl, reg, pio;
-       struct hfi1_devdata *dd;
-       unsigned long flags;
-       int ret = 0;
-
-       if (!sc)
-               return -EINVAL;
-       dd = sc->dd;
-
-       /*
-        * Obtain the allocator lock to guard against any allocation
-        * attempts (which should not happen prior to context being
-        * enabled). On the release/disable side we don't need to
-        * worry about locking since the releaser will not do anything
-        * if the context accounting values have not changed.
-        */
-       spin_lock_irqsave(&sc->alloc_lock, flags);
-       sc_ctrl = read_kctxt_csr(dd, sc->hw_context, SC(CTRL));
-       if ((sc_ctrl & SC(CTRL_CTXT_ENABLE_SMASK)))
-               goto unlock; /* already enabled */
-
-       /* IMPORTANT: only clear free and fill if transitioning 0 -> 1 */
-
-       *sc->hw_free = 0;
-       sc->free = 0;
-       sc->alloc_free = 0;
-       sc->fill = 0;
-       sc->sr_head = 0;
-       sc->sr_tail = 0;
-       /* this drops every flag, including halted and frozen */
-       sc->flags = 0;
-       /* the alloc lock ensures no fast path allocation */
-       reset_buffers_allocated(sc);
-
-       /*
-        * Clear all per-context errors.  Some of these will be set when
-        * we are re-enabling after a context halt.  Now that the context
-        * is disabled, the halt will not clear until after the PIO init
-        * engine runs below.
-        */
-       reg = read_kctxt_csr(dd, sc->hw_context, SC(ERR_STATUS));
-       if (reg)
-               write_kctxt_csr(dd, sc->hw_context, SC(ERR_CLEAR), reg);
-
-       /*
-        * The HW PIO initialization engine can handle only one init
-        * request at a time. Serialize access to each device's engine.
-        */
-       spin_lock(&dd->sc_init_lock);
-       /*
-        * Since access to this code block is serialized and
-        * each access waits for the initialization to complete
-        * before releasing the lock, the PIO initialization engine
-        * should not be in use, so we don't have to wait for the
-        * InProgress bit to go down.
-        */
-       pio = ((sc->hw_context & SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_MASK) <<
-              SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_SHIFT) |
-               SEND_PIO_INIT_CTXT_PIO_SINGLE_CTXT_INIT_SMASK;
-       write_csr(dd, SEND_PIO_INIT_CTXT, pio);
-       /*
-        * Wait until the engine is done.  Give the chip the required time
-        * so, hopefully, we read the register just once.
-        */
-       udelay(2);
-       ret = pio_init_wait_progress(dd);
-       spin_unlock(&dd->sc_init_lock);
-       if (ret) {
-               /* the context stays disabled with its accounting reset */
-               dd_dev_err(dd,
-                          "sctxt%u(%u): Context not enabled due to init failure %d\n",
-                          sc->sw_index, sc->hw_context, ret);
-               goto unlock;
-       }
-
-       /*
-        * All is well. Enable the context.
-        */
-       sc_ctrl |= SC(CTRL_CTXT_ENABLE_SMASK);
-       write_kctxt_csr(dd, sc->hw_context, SC(CTRL), sc_ctrl);
-       /*
-        * Read SendCtxtCtrl to force the write out and prevent a timing
-        * hazard where a PIO write may reach the context before the enable.
-        */
-       read_kctxt_csr(dd, sc->hw_context, SC(CTRL));
-       sc->flags |= SCF_ENABLED;
-
-unlock:
-       spin_unlock_irqrestore(&sc->alloc_lock, flags);
-
-       return ret;
-}
-
-/* force a credit return on the context; a NULL context is ignored */
-void sc_return_credits(struct send_context *sc)
-{
-       struct hfi1_devdata *dd;
-       u32 ctxt;
-
-       if (!sc)
-               return;
-       dd = sc->dd;
-       ctxt = sc->hw_context;
-
-       /* the credit return is scheduled by the 0 -> 1 transition */
-       write_kctxt_csr(dd, ctxt, SC(CREDIT_FORCE),
-                       SC(CREDIT_FORCE_FORCE_RETURN_SMASK));
-       /*
-        * Read the register back so the write is flushed and the credit
-        * return is scheduled.  The 0 -> 1 transition is the one we care
-        * about more.
-        */
-       read_kctxt_csr(dd, ctxt, SC(CREDIT_FORCE));
-       /* back to 0, ready for the next forced return */
-       write_kctxt_csr(dd, ctxt, SC(CREDIT_FORCE), 0);
-}
-
-/*
- * Let all in-flight packets drain on the context, including the pause
- * for credit return.  A NULL context is ignored.
- */
-void sc_flush(struct send_context *sc)
-{
-       if (sc)
-               sc_wait_for_packet_egress(sc, 1);
-}
-
-/*
- * Drop all packets on the context without waiting for them to be sent.
- * Not implemented: only logs a message.  A NULL context is ignored.
- */
-void sc_drop(struct send_context *sc)
-{
-       if (sc) {
-               dd_dev_info(sc->dd, "%s: context %u(%u) - not implemented\n",
-                           __func__, sc->sw_index, sc->hw_context);
-       }
-}
-
-/*
- * Start the software reaction to a context halt or SPC freeze:
- *     - mark the context as halted or frozen
- *     - stop buffer allocations
- *
- * @sc: the send context to stop
- * @flag: flag bit(s) to set in sc->flags
- *
- * Called from the error interrupt.  Other work is deferred until
- * out of the interrupt.
- */
-void sc_stop(struct send_context *sc, int flag)
-{
-       unsigned long flags;
-
-       /*
-        * sc->flags is modified under alloc_lock in sc_enable() and
-        * sc_disable().  Do both updates under the same lock so the
-        * mark cannot be lost to a concurrent read-modify-write.
-        */
-       spin_lock_irqsave(&sc->alloc_lock, flags);
-       /* mark the context */
-       sc->flags |= flag;
-       /* stop buffer allocations */
-       sc->flags &= ~SCF_ENABLED;
-       spin_unlock_irqrestore(&sc->alloc_lock, flags);
-       wake_up(&sc->halt_wait);
-}
-
-/* number of dwords in one PIO block, and dwords -> blocks rounding up */
-#define BLOCK_DWORDS (PIO_BLOCK_SIZE / sizeof(u32))
-#define dwords_to_blocks(x) DIV_ROUND_UP(x, BLOCK_DWORDS)
-
-/*
- * The send context buffer "allocator".
- *
- * @sc: the PIO send context we are allocating from
- * @dw_len: length of whole packet - including PBC - in dwords
- * @cb: optional callback to call when the buffer is finished sending
- * @arg: argument for cb
- *
- * Return a pointer to a PIO buffer if successful, NULL if the context is
- * not enabled or there is not enough room.
- *
- * NOTE(review): sc->sr is dereferenced unconditionally, and sc_alloc gives
- * user contexts no shadow ring - presumably this is only called for
- * kernel contexts; confirm against callers.
- */
-struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,
-                               pio_release_cb cb, void *arg)
-{
-       struct pio_buf *pbuf = NULL;
-       unsigned long flags;
-       unsigned long avail;
-       unsigned long blocks = dwords_to_blocks(dw_len);
-       unsigned long start_fill;
-       int trycount = 0;
-       u32 head, next;
-
-       spin_lock_irqsave(&sc->alloc_lock, flags);
-       if (!(sc->flags & SCF_ENABLED)) {
-               spin_unlock_irqrestore(&sc->alloc_lock, flags);
-               goto done;
-       }
-
-retry:
-       /* credits not currently accounted as filled-but-not-yet-freed */
-       avail = (unsigned long)sc->credits - (sc->fill - sc->alloc_free);
-       if (blocks > avail) {
-               /* not enough room */
-               if (unlikely(trycount)) { /* already tried to get more room */
-                       spin_unlock_irqrestore(&sc->alloc_lock, flags);
-                       goto done;
-               }
-               /* copy from receiver cache line and recalculate */
-               sc->alloc_free = ACCESS_ONCE(sc->free);
-               avail =
-                       (unsigned long)sc->credits -
-                       (sc->fill - sc->alloc_free);
-               if (blocks > avail) {
-                       /* still no room, actively update */
-                       /* sc_release_update() runs without alloc_lock held */
-                       spin_unlock_irqrestore(&sc->alloc_lock, flags);
-                       sc_release_update(sc);
-                       spin_lock_irqsave(&sc->alloc_lock, flags);
-                       sc->alloc_free = ACCESS_ONCE(sc->free);
-                       trycount++;
-                       goto retry;
-               }
-       }
-
-       /* there is enough room */
-
-       /*
-        * NOTE(review): there is no matching preempt_enable() in this
-        * function - presumably it is issued when the buffer is
-        * completed; confirm against the PIO copy code.
-        */
-       preempt_disable();
-       this_cpu_inc(*sc->buffers_allocated);
-
-       /* read this once */
-       head = sc->sr_head;
-
-       /* "allocate" the buffer */
-       start_fill = sc->fill;
-       sc->fill += blocks;
-
-       /*
-        * Fill the parts that the releaser looks at before moving the head.
-        * The only necessary piece is the sent_at field.  The credits
-        * we have just allocated cannot have been returned yet, so the
-        * cb and arg will not be looked at for a "while".  Put them
-        * on this side of the memory barrier anyway.
-        */
-       pbuf = &sc->sr[head].pbuf;
-       pbuf->sent_at = sc->fill;
-       pbuf->cb = cb;
-       pbuf->arg = arg;
-       pbuf->sc = sc;  /* could be filled in at sc->sr init time */
-       /* make sure this is in memory before updating the head */
-
-       /* calculate next head index, do not store */
-       next = head + 1;
-       if (next >= sc->sr_size)
-               next = 0;
-       /*
-        * update the head - must be last! - the releaser can look at fields
-        * in pbuf once we move the head
-        */
-       smp_wmb();
-       sc->sr_head = next;
-       spin_unlock_irqrestore(&sc->alloc_lock, flags);
-
-       /* finish filling in the buffer outside the lock */
-       /* the start offset wraps within the context's credits */
-       pbuf->start = sc->base_addr + ((start_fill % sc->credits)
-                                                       * PIO_BLOCK_SIZE);
-       pbuf->size = sc->credits * PIO_BLOCK_SIZE;
-       pbuf->end = sc->base_addr + pbuf->size;
-       pbuf->block_count = blocks;
-       pbuf->qw_written = 0;
-       pbuf->carry_bytes = 0;
-       pbuf->carry.val64 = 0;
-done:
-       return pbuf;
-}
-
-/*
- * There are at least two entities that can turn on credit return
- * interrupts and they can overlap.  Avoid problems by implementing
- * a count scheme that is enforced by a lock.  The lock is needed because
- * the count and CSR write must be paired.
- */
-
-/*
- * Start credit return interrupts.  Users are counted: only the first one
- * turns the interrupt on, later ones just bump the count.
- */
-void sc_add_credit_return_intr(struct send_context *sc)
-{
-       unsigned long irq_flags;
-
-       /* the count change and the CSR update must be under one lock hold */
-       spin_lock_irqsave(&sc->credit_ctrl_lock, irq_flags);
-       if (!sc->credit_intr_count) {
-               sc->credit_ctrl |= SC(CREDIT_CTRL_CREDIT_INTR_SMASK);
-               write_kctxt_csr(sc->dd, sc->hw_context,
-                               SC(CREDIT_CTRL), sc->credit_ctrl);
-       }
-       ++sc->credit_intr_count;
-       spin_unlock_irqrestore(&sc->credit_ctrl_lock, irq_flags);
-}
-
-/*
- * Stop credit return interrupts.  Users are counted: the count is
- * decremented and the last user turns the credit interrupt off.
- */
-void sc_del_credit_return_intr(struct send_context *sc)
-{
-       unsigned long irq_flags;
-
-       WARN_ON(sc->credit_intr_count == 0);
-
-       /* the count change and the CSR update must be under one lock hold */
-       spin_lock_irqsave(&sc->credit_ctrl_lock, irq_flags);
-       if (--sc->credit_intr_count == 0) {
-               sc->credit_ctrl &= ~SC(CREDIT_CTRL_CREDIT_INTR_SMASK);
-               write_kctxt_csr(sc->dd, sc->hw_context,
-                               SC(CREDIT_CTRL), sc->credit_ctrl);
-       }
-       spin_unlock_irqrestore(&sc->credit_ctrl_lock, irq_flags);
-}
-
-/*
- * Add (needint != 0) or drop (needint == 0) one user of the credit return
- * interrupt.  Callers must pair every needint call with a !needint call.
- * When adding a user, a credit return is also forced.
- */
-void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint)
-{
-       if (!needint) {
-               sc_del_credit_return_intr(sc);
-               trace_hfi1_wantpiointr(sc, needint, sc->credit_ctrl);
-               return;
-       }
-
-       sc_add_credit_return_intr(sc);
-       trace_hfi1_wantpiointr(sc, needint, sc->credit_ctrl);
-       mmiowb();
-       sc_return_credits(sc);
-}
-
-/**
- * sc_piobufavail - callback when a PIO buffer is available
- * @sc: the send context
- *
- * This is called from the interrupt handler when a PIO buffer is
- * available after hfi1_verbs_send() returned an error that no buffers were
- * available. Disable the interrupt if there are no more QPs waiting.
- *
- * Does nothing unless the context type is SC_KERNEL or SC_VL15.  At most
- * PIO_WAIT_BATCH_SIZE waiting QPs are taken off the list and woken per
- * call.
- */
-static void sc_piobufavail(struct send_context *sc)
-{
-       struct hfi1_devdata *dd = sc->dd;
-       struct hfi1_ibdev *dev = &dd->verbs_dev;
-       struct list_head *list;
-       struct rvt_qp *qps[PIO_WAIT_BATCH_SIZE];
-       struct rvt_qp *qp;
-       struct hfi1_qp_priv *priv;
-       unsigned long flags;
-       unsigned i, n = 0;
-
-       if (dd->send_contexts[sc->sw_index].type != SC_KERNEL &&
-           dd->send_contexts[sc->sw_index].type != SC_VL15)
-               return;
-       list = &sc->piowait;
-       /*
-        * Note: checking that the piowait list is empty and clearing
-        * the buffer available interrupt needs to be atomic or we
-        * could end up with QPs on the wait list with the interrupt
-        * disabled.
-        */
-       write_seqlock_irqsave(&dev->iowait_lock, flags);
-       while (!list_empty(list)) {
-               struct iowait *wait;
-
-               /* the local batch is full; leave the rest on the list */
-               if (n == ARRAY_SIZE(qps))
-                       break;
-               wait = list_first_entry(list, struct iowait, list);
-               qp = iowait_to_qp(wait);
-               priv = qp->priv;
-               list_del_init(&priv->s_iowait.list);
-               /* refcount held until actual wake up */
-               qps[n++] = qp;
-       }
-       /*
-        * If there had been waiters and there are more
-        * ensure that we redo the force to avoid a potential hang.
-        */
-       if (n) {
-               hfi1_sc_wantpiobuf_intr(sc, 0);
-               if (!list_empty(list))
-                       hfi1_sc_wantpiobuf_intr(sc, 1);
-       }
-       write_sequnlock_irqrestore(&dev->iowait_lock, flags);
-
-       /* wake the collected QPs after the iowait lock is dropped */
-       for (i = 0; i < n; i++)
-               hfi1_qp_wakeup(qps[i],
-                              RVT_S_WAIT_PIO | RVT_S_WAIT_PIO_DRAIN);
-}
-
-/*
- * Translate a send credit update into a bit code of reasons: each status
- * or "return due to" bit set in hw_free contributes its PRC_* flag.
- */
-static inline int fill_code(u64 hw_free)
-{
-       int reasons = 0;
-
-       reasons |= (hw_free & CR_STATUS_SMASK) ? PRC_STATUS_ERR : 0;
-       reasons |= (hw_free & CR_CREDIT_RETURN_DUE_TO_PBC_SMASK) ?
-                       PRC_PBC : 0;
-       reasons |= (hw_free & CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SMASK) ?
-                       PRC_THRESHOLD : 0;
-       reasons |= (hw_free & CR_CREDIT_RETURN_DUE_TO_ERR_SMASK) ?
-                       PRC_FILL_ERR : 0;
-       reasons |= (hw_free & CR_CREDIT_RETURN_DUE_TO_FORCE_SMASK) ?
-                       PRC_SC_DISABLE : 0;
-       return reasons;
-}
-
-/* use the jiffies compare to get the wrap right */
-#define sent_before(a, b) time_before(a, b)    /* a < b */
-
-/*
- * The send context buffer "releaser".
- *
- * Under release_lock: reads the credit return value in *sc->hw_free,
- * advances the software free count by the number of newly returned
- * credits, and walks the shadow ring from the tail, calling the callback
- * of every buffer whose sent_at is not after the new free count.  The new
- * tail and free count are then stored and sc_piobufavail() is called
- * after the lock is dropped.  A NULL 'sc' is ignored.
- */
-void sc_release_update(struct send_context *sc)
-{
-       struct pio_buf *pbuf;
-       u64 hw_free;
-       u32 head, tail;
-       unsigned long old_free;
-       unsigned long free;
-       unsigned long extra;
-       unsigned long flags;
-       int code;
-
-       if (!sc)
-               return;
-
-       spin_lock_irqsave(&sc->release_lock, flags);
-       /* update free */
-       hw_free = le64_to_cpu(*sc->hw_free);            /* volatile read */
-       old_free = sc->free;
-       /*
-        * Newly returned credits: the HW counter minus the low bits of
-        * the software count, masked so a counter wrap still yields the
-        * right difference.
-        */
-       extra = (((hw_free & CR_COUNTER_SMASK) >> CR_COUNTER_SHIFT)
-                       - (old_free & CR_COUNTER_MASK))
-                               & CR_COUNTER_MASK;
-       free = old_free + extra;
-       trace_hfi1_piofree(sc, extra);
-
-       /* call sent buffer callbacks */
-       code = -1;                              /* code not yet set */
-       head = ACCESS_ONCE(sc->sr_head);        /* snapshot the head */
-       tail = sc->sr_tail;
-       while (head != tail) {
-               pbuf = &sc->sr[tail].pbuf;
-
-               if (sent_before(free, pbuf->sent_at)) {
-                       /* not sent yet */
-                       break;
-               }
-               if (pbuf->cb) {
-                       if (code < 0) /* fill in code on first user */
-                               code = fill_code(hw_free);
-                       (*pbuf->cb)(pbuf->arg, code);
-               }
-
-               /* advance the tail, wrapping at the ring size */
-               tail++;
-               if (tail >= sc->sr_size)
-                       tail = 0;
-       }
-       sc->sr_tail = tail;
-       /* make sure tail is updated before free */
-       smp_wmb();
-       sc->free = free;
-       spin_unlock_irqrestore(&sc->release_lock, flags);
-       sc_piobufavail(sc);
-}
-
-/*
- * Send context group releaser.  Argument is the send context that caused
- * the interrupt.  Called from the send context interrupt handler.
- *
- * @dd: device data
- * @hw_context: hardware index of the send context that interrupted
- *
- * Call release on all contexts in the group.  A group member whose
- * hardware to software mapping is invalid is reported and skipped.
- *
- * This routine takes the sc_lock without an irqsave because it is only
- * called from an interrupt handler.  Adjust if that changes.
- */
-void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context)
-{
-       struct send_context *sc;
-       u32 sw_index;
-       u32 gc, gc_end;
-
-       spin_lock(&dd->sc_lock);
-       sw_index = dd->hw_to_sw[hw_context];
-       if (unlikely(sw_index >= dd->num_send_contexts)) {
-               dd_dev_err(dd, "%s: invalid hw (%u) to sw (%u) mapping\n",
-                          __func__, hw_context, sw_index);
-               goto done;
-       }
-       sc = dd->send_contexts[sw_index].sc;
-       if (unlikely(!sc))
-               goto done;
-
-       /* walk every hardware context of the interrupting context's group */
-       gc = group_context(hw_context, sc->group);
-       gc_end = gc + group_size(sc->group);
-       for (; gc < gc_end; gc++) {
-               sw_index = dd->hw_to_sw[gc];
-               if (unlikely(sw_index >= dd->num_send_contexts)) {
-                       /* report the group member whose lookup failed */
-                       dd_dev_err(dd,
-                                  "%s: invalid hw (%u) to sw (%u) mapping\n",
-                                  __func__, gc, sw_index);
-                       continue;
-               }
-               sc_release_update(dd->send_contexts[sw_index].sc);
-       }
-done:
-       spin_unlock(&dd->sc_lock);
-}
-
-/*
- * pio_select_send_context_vl() - select send context
- * @dd: devdata
- * @selector: a spreading factor
- * @vl: this vl
- *
- * Look up a send context for @vl in the RCU protected PIO map, using
- * @selector to pick among the contexts mapped to that vl.
- *
- * VL0's send context is returned when @vl is out of range (this should
- * only happen if SC->VL changed after the initial checks on the QP/AH),
- * when no map is installed, or when the selected map slot is empty.
- */
-struct send_context *pio_select_send_context_vl(struct hfi1_devdata *dd,
-                                               u32 selector, u8 vl)
-{
-       struct pio_vl_map *map;
-       struct pio_map_elem *elem;
-       struct send_context *sc;
-
-       if (unlikely(vl >= num_vls))
-               return dd->vld[0].sc;
-
-       rcu_read_lock();
-       map = rcu_dereference(dd->pio_map);
-       if (unlikely(!map)) {
-               rcu_read_unlock();
-               return dd->vld[0].sc;
-       }
-       /* both masks reduce the index into the power of 2 sized arrays */
-       elem = map->map[vl & map->mask];
-       sc = elem->ksc[selector & elem->mask];
-       rcu_read_unlock();
-
-       if (!sc)
-               sc = dd->vld[0].sc;
-       return sc;
-}
-
-/*
- * pio_select_send_context_sc() - select send context
- * @dd: devdata
- * @selector: a spreading factor
- * @sc5: the 5 bit sc
- *
- * Map @sc5 to a vl with sc_to_vlt() and return the send context that
- * pio_select_send_context_vl() picks for that vl and @selector.
- */
-struct send_context *pio_select_send_context_sc(struct hfi1_devdata *dd,
-                                               u32 selector, u8 sc5)
-{
-       return pio_select_send_context_vl(dd, selector, sc_to_vlt(dd, sc5));
-}
-
-/*
- * Free the indicated map struct: the first actual_vls per-vl elements,
- * then the map itself.  A NULL map is accepted.
- */
-static void pio_map_free(struct pio_vl_map *m)
-{
-       int vl;
-
-       if (!m)
-               return;
-
-       for (vl = 0; vl < m->actual_vls; vl++)
-               kfree(m->map[vl]);
-       kfree(m);
-}
-
-/*
- * RCU callback: recover the pio_vl_map that embeds @list and free it.
- */
-static void pio_map_rcu_callback(struct rcu_head *list)
-{
-       pio_map_free(container_of(list, struct pio_vl_map, list));
-}
-
-/*
- * pio_map_init - called when #vls change
- * @dd: hfi1_devdata
- * @port: port number (not referenced by this routine)
- * @num_vls: number of vls
- * @vl_scontexts: per vl send context mapping (optional)
- *
- * This routine changes the mapping based on the number of vls.
- *
- * vl_scontexts is used to specify a non-uniform vl/send context
- * loading. NULL implies auto computing the loading and giving each
- * VL a uniform distribution of send contexts per VL.
- *
- * The auto algorithm computes the sc_per_vl and the number of extra
- * send contexts. Any extra send contexts are added from the last VL
- * on down.
- *
- * rcu locking is used here to control access to the mapping fields.
- *
- * If either the num_vls or num_send_contexts are non-power of 2, the
- * array sizes in the struct pio_vl_map and the struct pio_map_elem are
- * rounded up to the next highest power of 2 and the first entry is
- * reused in a round robin fashion.
- *
- * If an error occurs the map change is not done and the mapping is not
- * changed.
- *
- * Return: 0 on success, -ENOMEM if an allocation fails.
- */
-int pio_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_scontexts)
-{
-       int i, j;
-       int extra, sc_per_vl;
-       int scontext = 1;
-       int num_kernel_send_contexts = 0;
-       u8 lvl_scontexts[OPA_MAX_VLS];
-       struct pio_vl_map *oldmap, *newmap;
-
-       if (!vl_scontexts) {
-               /* send context 0 reserved for VL15 */
-               for (i = 1; i < dd->num_send_contexts; i++)
-                       if (dd->send_contexts[i].type == SC_KERNEL)
-                               num_kernel_send_contexts++;
-               /* truncate divide */
-               sc_per_vl = num_kernel_send_contexts / num_vls;
-               /* extras */
-               extra = num_kernel_send_contexts % num_vls;
-               vl_scontexts = lvl_scontexts;
-               /* add extras from last vl down */
-               for (i = num_vls - 1; i >= 0; i--, extra--)
-                       vl_scontexts[i] = sc_per_vl + (extra > 0 ? 1 : 0);
-       }
-       /* build new map */
-       newmap = kzalloc(sizeof(*newmap) +
-                        roundup_pow_of_two(num_vls) *
-                        sizeof(struct pio_map_elem *),
-                        GFP_KERNEL);
-       if (!newmap)
-               goto bail;
-       newmap->actual_vls = num_vls;
-       newmap->vls = roundup_pow_of_two(num_vls);
-       newmap->mask = (1 << ilog2(newmap->vls)) - 1;
-       for (i = 0; i < newmap->vls; i++) {
-               if (i < newmap->actual_vls) {
-                       /* save for wrap around */
-                       int first_scontext = scontext;
-                       int sz = roundup_pow_of_two(vl_scontexts[i]);
-
-                       /* only allocate once */
-                       newmap->map[i] = kzalloc(sizeof(*newmap->map[i]) +
-                                                sz * sizeof(struct
-                                                            send_context *),
-                                                GFP_KERNEL);
-                       if (!newmap->map[i])
-                               goto bail;
-                       newmap->map[i]->mask = (1 << ilog2(sz)) - 1;
-                       /* assign send contexts */
-                       for (j = 0; j < sz; j++) {
-                               if (dd->kernel_send_context[scontext])
-                                       newmap->map[i]->ksc[j] =
-                                       dd->kernel_send_context[scontext];
-                               if (++scontext >= first_scontext +
-                                                 vl_scontexts[i])
-                                       /* wrap back to first send context */
-                                       scontext = first_scontext;
-                       }
-                       /*
-                        * The next vl starts after this vl's contexts.
-                        * Only done for real vls: vl_scontexts[] holds
-                        * no entries for the power of 2 padding slots.
-                        */
-                       scontext = first_scontext + vl_scontexts[i];
-               } else {
-                       /* just re-use entry without allocating */
-                       newmap->map[i] = newmap->map[i % num_vls];
-               }
-       }
-       /* newmap in hand, save old map */
-       spin_lock_irq(&dd->pio_map_lock);
-       oldmap = rcu_dereference_protected(dd->pio_map,
-                                          lockdep_is_held(&dd->pio_map_lock));
-
-       /* publish newmap */
-       rcu_assign_pointer(dd->pio_map, newmap);
-
-       spin_unlock_irq(&dd->pio_map_lock);
-       /* success, free any old map after grace period */
-       if (oldmap)
-               call_rcu(&oldmap->list, pio_map_rcu_callback);
-       return 0;
-bail:
-       /* free any partial allocation */
-       pio_map_free(newmap);
-       return -ENOMEM;
-}
-
-/*
- * free_pio_map - release the PIO map and the kernel send context array
- * @dd: hfi1_devdata
- *
- * The map is unpublished under pio_map_lock and only freed after an RCU
- * grace period, so a reader that picked up the pointer inside
- * rcu_read_lock() never sees freed memory.
- */
-void free_pio_map(struct hfi1_devdata *dd)
-{
-       struct pio_vl_map *oldmap;
-
-       /* Free PIO map if allocated */
-       if (rcu_access_pointer(dd->pio_map)) {
-               spin_lock_irq(&dd->pio_map_lock);
-               oldmap = rcu_dereference_protected(dd->pio_map,
-                                       lockdep_is_held(&dd->pio_map_lock));
-               RCU_INIT_POINTER(dd->pio_map, NULL);
-               spin_unlock_irq(&dd->pio_map_lock);
-               /* wait for readers that may still hold the old pointer */
-               synchronize_rcu();
-               pio_map_free(oldmap);
-       }
-       kfree(dd->kernel_send_context);
-       dd->kernel_send_context = NULL;
-}
-
-/*
- * init_pervl_scs - allocate and enable the per vl kernel send contexts
- * @dd: hfi1_devdata
- *
- * Allocates the VL15 send context, one send context per data vl and the
- * additional kernel send contexts up to INIT_SC_PER_VL * num_vls, records
- * them in dd->kernel_send_context[], enables them, programs each
- * context's CHECK_VL CSR and builds the initial PIO map.
- *
- * Return: 0 on success, -ENOMEM on any allocation failure.  On failure
- * the send contexts and the kernel_send_context array allocated here are
- * released again.
- */
-int init_pervl_scs(struct hfi1_devdata *dd)
-{
-       int i;
-       u64 mask, all_vl_mask = (u64)0x80ff; /* VLs 0-7, 15 */
-       u64 data_vls_mask = (u64)0x00ff; /* VLs 0-7 */
-       u32 ctxt;
-       struct hfi1_pportdata *ppd = dd->pport;
-
-       dd->vld[15].sc = sc_alloc(dd, SC_VL15,
-                                 dd->rcd[0]->rcvhdrqentsize, dd->node);
-       if (!dd->vld[15].sc)
-               return -ENOMEM;
-       hfi1_init_ctxt(dd->vld[15].sc);
-       dd->vld[15].mtu = enum_to_mtu(OPA_MTU_2048);
-
-       /* zeroed so the error path only sees NULL or allocated entries */
-       dd->kernel_send_context = kzalloc_node(dd->num_send_contexts *
-                                       sizeof(struct send_context *),
-                                       GFP_KERNEL, dd->node);
-       if (!dd->kernel_send_context)
-               goto freesc15;
-       dd->kernel_send_context[0] = dd->vld[15].sc;
-
-       for (i = 0; i < num_vls; i++) {
-               /*
-                * Since this function does not deal with a specific
-                * receive context but we need the RcvHdrQ entry size,
-                * use the size from rcd[0]. It is guaranteed to be
-                * valid at this point and will remain the same for all
-                * receive contexts.
-                */
-               dd->vld[i].sc = sc_alloc(dd, SC_KERNEL,
-                                        dd->rcd[0]->rcvhdrqentsize, dd->node);
-               if (!dd->vld[i].sc)
-                       goto nomem;
-               dd->kernel_send_context[i + 1] = dd->vld[i].sc;
-               hfi1_init_ctxt(dd->vld[i].sc);
-               /* non VL15 start with the max MTU */
-               dd->vld[i].mtu = hfi1_max_mtu;
-       }
-       for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++) {
-               dd->kernel_send_context[i + 1] =
-               sc_alloc(dd, SC_KERNEL, dd->rcd[0]->rcvhdrqentsize, dd->node);
-               if (!dd->kernel_send_context[i + 1])
-                       goto nomem;
-               hfi1_init_ctxt(dd->kernel_send_context[i + 1]);
-       }
-
-       sc_enable(dd->vld[15].sc);
-       ctxt = dd->vld[15].sc->hw_context;
-       mask = all_vl_mask & ~(1LL << 15);
-       write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
-       dd_dev_info(dd,
-                   "Using send context %u(%u) for VL15\n",
-                   dd->vld[15].sc->sw_index, ctxt);
-
-       for (i = 0; i < num_vls; i++) {
-               sc_enable(dd->vld[i].sc);
-               ctxt = dd->vld[i].sc->hw_context;
-               mask = all_vl_mask & ~(data_vls_mask);
-               write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
-       }
-       for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++) {
-               sc_enable(dd->kernel_send_context[i + 1]);
-               ctxt = dd->kernel_send_context[i + 1]->hw_context;
-               mask = all_vl_mask & ~(data_vls_mask);
-               write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
-       }
-
-       if (pio_map_init(dd, ppd->port - 1, num_vls, NULL))
-               goto nomem;
-       return 0;
-
-nomem:
-       /*
-        * NOTE(review): as in the original error path, this relies on
-        * sc_free() accepting NULL for entries that were never allocated.
-        */
-       for (i = 0; i < num_vls; i++) {
-               sc_free(dd->vld[i].sc);
-               dd->vld[i].sc = NULL;
-       }
-       for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++)
-               sc_free(dd->kernel_send_context[i + 1]);
-       kfree(dd->kernel_send_context);
-       dd->kernel_send_context = NULL;
-freesc15:
-       sc_free(dd->vld[15].sc);
-       return -ENOMEM;
-}
-
-/*
- * init_credit_return - allocate the credit return areas
- * @dd: hfi1_devdata
- *
- * Allocates dd->cr_base[] with one entry per online NUMA node and, for
- * each node, a zeroed coherent DMA area of TXE_NUM_CONTEXTS credit_return
- * entries.  The device's node is switched to each node for its allocation
- * and set back to dd->node afterwards.
- *
- * Return: 0 on success, -EINVAL if the online node ids are not compact,
- * -ENOMEM if an allocation fails.  Areas allocated before a failure are
- * left attached to dd->cr_base.
- */
-int init_credit_return(struct hfi1_devdata *dd)
-{
-       int nodes = num_online_nodes();
-       int node;
-
-       /* enforce the expectation that the numas are compact */
-       for (node = 0; node < nodes; node++) {
-               if (node_online(node))
-                       continue;
-               dd_dev_err(dd, "NUMA nodes are not compact\n");
-               return -EINVAL;
-       }
-
-       dd->cr_base = kcalloc(nodes, sizeof(struct credit_return_base),
-                             GFP_KERNEL);
-       if (!dd->cr_base) {
-               dd_dev_err(dd, "Unable to allocate credit return base\n");
-               return -ENOMEM;
-       }
-
-       for (node = 0; node < nodes; node++) {
-               int bytes = TXE_NUM_CONTEXTS * sizeof(struct credit_return);
-
-               set_dev_node(&dd->pcidev->dev, node);
-               dd->cr_base[node].va = dma_zalloc_coherent(&dd->pcidev->dev,
-                                                          bytes,
-                                                          &dd->cr_base[node].pa,
-                                                          GFP_KERNEL);
-               if (dd->cr_base[node].va)
-                       continue;
-
-               /* put the device back on its home node before bailing out */
-               set_dev_node(&dd->pcidev->dev, dd->node);
-               dd_dev_err(dd,
-                          "Unable to allocate credit return DMA range for NUMA %d\n",
-                          node);
-               return -ENOMEM;
-       }
-       set_dev_node(&dd->pcidev->dev, dd->node);
-
-       return 0;
-}
-
-/*
- * free_credit_return - release the credit return areas
- * @dd: hfi1_devdata
- *
- * Frees every per node DMA area recorded in dd->cr_base[] and then the
- * array itself.  Does nothing when dd->cr_base is NULL.
- */
-void free_credit_return(struct hfi1_devdata *dd)
-{
-       int nodes;
-       int node;
-
-       if (!dd->cr_base)
-               return;
-
-       nodes = num_online_nodes();
-       for (node = 0; node < nodes; node++) {
-               struct credit_return_base *crb = &dd->cr_base[node];
-
-               /* nothing was allocated for this node */
-               if (!crb->va)
-                       continue;
-               dma_free_coherent(&dd->pcidev->dev,
-                                 TXE_NUM_CONTEXTS *
-                                 sizeof(struct credit_return),
-                                 crb->va, crb->pa);
-       }
-       kfree(dd->cr_base);
-       dd->cr_base = NULL;
-}
diff --git a/drivers/staging/rdma/hfi1/pio.h b/drivers/staging/rdma/hfi1/pio.h
deleted file mode 100644 (file)
index 53a08ed..0000000
+++ /dev/null
@@ -1,328 +0,0 @@
-#ifndef _PIO_H
-#define _PIO_H
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-/*
- * send context types
- * NOTE(review): SC_MAX looks like the number of types rather than a type
- * of its own - confirm against its users.
- */
-#define SC_KERNEL 0
-#define SC_ACK    1
-#define SC_USER   2
-#define SC_VL15   3
-#define SC_MAX    4
-
-/* invalid send context index */
-#define INVALID_SCI 0xff
-
-/*
- * PIO buffer release callback function
- * @arg: the argument stored with the buffer (pio_buf.arg)
- * @code: PRC_* release code bits
- */
-typedef void (*pio_release_cb)(void *arg, int code);
-
-/* PIO release codes - bit flags, since more than one may apply at once */
-#define PRC_OK         0       /* no known error */
-#define PRC_STATUS_ERR 0x01    /* credit return due to status error */
-#define PRC_PBC                0x02    /* credit return due to PBC */
-#define PRC_THRESHOLD  0x04    /* credit return due to threshold */
-#define PRC_FILL_ERR   0x08    /* credit return due to fill error */
-#define PRC_FORCE      0x10    /* credit return due to credit force */
-#define PRC_SC_DISABLE 0x20    /* clean-up after a context disable */
-/*
- * byte helper: the same 64 bits viewed as one u64, two u32 or eight u8
- */
-union mix {
-       u64 val64;
-       u32 val32[2];
-       u8  val8[8];
-};
-
-/*
- * an allocated PIO buffer
- *
- * The cb/arg pair is the release callback, invoked as cb(arg, code).
- * carry and carry_bytes hold bytes that have not been written yet.
- */
-struct pio_buf {
-       struct send_context *sc;/* back pointer to owning send context */
-       pio_release_cb cb;      /* called when the buffer is released */
-       void *arg;              /* argument for cb */
-       void __iomem *start;    /* buffer start address */
-       void __iomem *end;      /* context end address */
-       unsigned long size;     /* context size, in bytes */
-       unsigned long sent_at;  /* buffer is sent when <= free */
-       u32 block_count;        /* size of buffer, in blocks */
-       u32 qw_written;         /* QW written so far */
-       u32 carry_bytes;        /* number of valid bytes in carry */
-       union mix carry;        /* pending unwritten bytes */
-};
-
-/*
- * cache line aligned pio buffer array element
- *
- * The unused[] member makes every element at least 16 * 8 = 128 bytes.
- */
-union pio_shadow_ring {
-       struct pio_buf pbuf;
-       u64 unused[16];         /* cache line spacer */
-} ____cacheline_aligned;
-
-/*
- * per-NUMA send context
- *
- * The allocator fields, the releaser fields, the PIO waiter list and the
- * credit control lock each start on their own cache line on SMP builds
- * (____cacheline_aligned_in_smp).
- */
-struct send_context {
-       /* read-only after init */
-       struct hfi1_devdata *dd;                /* device */
-       void __iomem *base_addr;        /* start of PIO memory */
-       union pio_shadow_ring *sr;      /* shadow ring */
-
-       volatile __le64 *hw_free;       /* HW free counter */
-       struct work_struct halt_work;   /* halted context work queue entry */
-       unsigned long flags;            /* flags */
-       int node;                       /* context home node */
-       int type;                       /* context type */
-       u32 sw_index;                   /* software index number */
-       u32 hw_context;                 /* hardware context number */
-       u32 credits;                    /* number of blocks in context */
-       u32 sr_size;                    /* size of the shadow ring */
-       u32 group;                      /* credit return group */
-       /* allocator fields */
-       spinlock_t alloc_lock ____cacheline_aligned_in_smp;
-       unsigned long fill;             /* official alloc count */
-       unsigned long alloc_free;       /* copy of free (less cache thrash) */
-       u32 sr_head;                    /* shadow ring head */
-       /* releaser fields */
-       spinlock_t release_lock ____cacheline_aligned_in_smp;
-       unsigned long free;             /* official free count */
-       u32 sr_tail;                    /* shadow ring tail */
-       /* list for PIO waiters */
-       struct list_head piowait  ____cacheline_aligned_in_smp;
-       spinlock_t credit_ctrl_lock ____cacheline_aligned_in_smp;
-       u64 credit_ctrl;                /* cache for credit control */
-       u32 credit_intr_count;          /* count of credit intr users */
-       u32 __percpu *buffers_allocated;/* count of buffers allocated */
-       wait_queue_head_t halt_wait;    /* wait until kernel sees interrupt */
-};
-
-/*
- * send context flags (bit values)
- * NOTE(review): presumably kept in send_context.flags - confirm.
- */
-#define SCF_ENABLED 0x01
-#define SCF_IN_FREE 0x02
-#define SCF_HALTED  0x04
-#define SCF_FROZEN  0x08
-
-/*
- * Bookkeeping for one entry of dd->send_contexts[]: the working context
- * plus its type and its place in the PIO array.
- */
-struct send_context_info {
-       struct send_context *sc;        /* allocated working context */
-       u16 allocated;                  /* has this been allocated? */
-       u16 type;                       /* context type */
-       u16 base;                       /* base in PIO array */
-       u16 credits;                    /* size in PIO array */
-};
-
-/*
- * DMA credit return, index is always (context & 0x7)
- *
- * Eight little-endian 64-bit credit return words.
- */
-struct credit_return {
-       volatile __le64 cr[8];
-};
-
-/*
- * NUMA indexed credit return array
- *
- * @va: address returned by the coherent DMA allocation
- * @pa: DMA handle that goes with @va
- */
-struct credit_return_base {
-       struct credit_return *va;
-       dma_addr_t pa;
-};
-
-/*
- * send context configuration sizes (one per type)
- * NOTE(review): presumably @size is the size of each context and @count
- * the number of contexts of that type - confirm against the users.
- */
-struct sc_config_sizes {
-       short int size;
-       short int count;
-};
-
-/*
- * The diagram below details the relationship of the mapping structures
- *
- * Since the mapping now allows for non-uniform send contexts per vl, the
- * number of send contexts for a vl is either the vl_scontexts[vl] or
- * a computation based on num_kernel_send_contexts/num_vls:
- *
- * For example:
- * nactual = vl_scontexts ? vl_scontexts[vl] : num_kernel_send_contexts/num_vls
- *
- * n = roundup to next highest power of 2 using nactual
- *
- * In the case where num_kernel_send_contexts/num_vls doesn't divide
- * evenly, the extras are added from the last vl downward.
- *
- * For the case where n > nactual, the send contexts are assigned
- * in a round robin fashion wrapping back to the first send context
- * for a particular vl.
- *
- *               dd->pio_map
- *                    |                                   pio_map_elem[0]
- *                    |                                +--------------------+
- *                    v                                |       mask         |
- *               pio_vl_map                            |--------------------|
- *      +--------------------------+                   | ksc[0] -> sc 1     |
- *      |    list (RCU)            |                   |--------------------|
- *      |--------------------------|                 ->| ksc[1] -> sc 2     |
- *      |    mask                  |              --/  |--------------------|
- *      |--------------------------|            -/     |        *           |
- *      |    actual_vls (max 8)    |          -/       |--------------------|
- *      |--------------------------|       --/         | ksc[n] -> sc n     |
- *      |    vls (max 8)           |     -/            +--------------------+
- *      |--------------------------|  --/
- *      |    map[0]                |-/
- *      |--------------------------|                   +--------------------+
- *      |    map[1]                |---                |       mask         |
- *      |--------------------------|   \----           |--------------------|
- *      |           *              |        \--        | ksc[0] -> sc 1+n   |
- *      |           *              |           \----   |--------------------|
- *      |           *              |                \->| ksc[1] -> sc 2+n   |
- *      |--------------------------|                   |--------------------|
- *      |   map[vls - 1]           |-                  |         *          |
- *      +--------------------------+ \-                |--------------------|
- *                                     \-              | ksc[m] -> sc m+n   |
- *                                       \             +--------------------+
- *                                        \-
- *                                          \
- *                                           \-        +--------------------+
- *                                             \-      |       mask         |
- *                                               \     |--------------------|
- *                                                \-   | ksc[0] -> sc 1+m+n |
- *                                                  \- |--------------------|
- *                                                    >| ksc[1] -> sc 2+m+n |
- *                                                     |--------------------|
- *                                                     |         *          |
- *                                                     |--------------------|
- *                                                     | ksc[o] -> sc o+m+n |
- *                                                     +--------------------+
- *
- */
-
-/*
- * Initial number of send contexts per VL: init_pervl_scs() allocates
- * INIT_SC_PER_VL * num_vls kernel send contexts for the data VLs.
- */
-#define INIT_SC_PER_VL 2
-
-/*
- * struct pio_map_elem - mapping for a vl
- * @mask - selector mask
- * @ksc - array of kernel send contexts for this vl
- *
- * The mask is used to "mod" the selector to
- * produce index into the trailing array of
- * kscs.  pio_map_init() sizes the array to a power of 2 and
- * sets the mask to that size minus one.
- */
-struct pio_map_elem {
-       u32 mask;
-       struct send_context *ksc[0];
-};
-
-/*
- * struct pio_vl_map - vl to send context mapping for all vls
- * @list - rcu head for free callback
- * @mask - vl mask to "mod" the vl to produce an index to map array
- * @actual_vls - number of vls
- * @vls - numbers of vls rounded to next power of 2
- * @map - array of pio_map_elem entries
- *
- * This is the parent mapping structure. The trailing members of the
- * struct point to pio_map_elem entries, which in turn point to an
- * array of kscs for that vl.  Only the first @actual_vls entries of
- * @map own their element; the remaining entries up to @vls re-use
- * those elements.
- */
-struct pio_vl_map {
-       struct rcu_head list;
-       u32 mask;
-       u8 actual_vls;
-       u8 vls;
-       struct pio_map_elem *map[0];
-};
-
-/* PIO vl to send context map: build, free and look up */
-int pio_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls,
-                u8 *vl_scontexts);
-void free_pio_map(struct hfi1_devdata *dd);
-/* both selectors fall back to VL0's send context */
-struct send_context *pio_select_send_context_vl(struct hfi1_devdata *dd,
-                                               u32 selector, u8 vl);
-struct send_context *pio_select_send_context_sc(struct hfi1_devdata *dd,
-                                               u32 selector, u8 sc5);
-
-/* send context functions (each function is declared exactly once) */
-int init_credit_return(struct hfi1_devdata *dd);
-void free_credit_return(struct hfi1_devdata *dd);
-int init_sc_pools_and_sizes(struct hfi1_devdata *dd);
-int init_send_contexts(struct hfi1_devdata *dd);
-int init_pervl_scs(struct hfi1_devdata *dd);
-struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
-                             uint hdrqentsize, int numa);
-void sc_free(struct send_context *sc);
-int sc_enable(struct send_context *sc);
-void sc_disable(struct send_context *sc);
-int sc_restart(struct send_context *sc);
-void sc_return_credits(struct send_context *sc);
-void sc_flush(struct send_context *sc);
-void sc_drop(struct send_context *sc);
-void sc_stop(struct send_context *sc, int bit);
-struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,
-                               pio_release_cb cb, void *arg);
-void sc_release_update(struct send_context *sc);
-void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context);
-void sc_add_credit_return_intr(struct send_context *sc);
-void sc_del_credit_return_intr(struct send_context *sc);
-void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold);
-u32 sc_percent_to_threshold(struct send_context *sc, u32 percent);
-u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize);
-void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint);
-void sc_wait(struct hfi1_devdata *dd);
-void set_pio_integrity(struct send_context *sc);
-
-/* support functions */
-void pio_reset_all(struct hfi1_devdata *dd);
-void pio_freeze(struct hfi1_devdata *dd);
-void pio_kernel_unfreeze(struct hfi1_devdata *dd);
-
-/*
- * global PIO send control operations
- * NOTE(review): presumably the values accepted as the op argument of
- * pio_send_control() - confirm.
- */
-#define PSC_GLOBAL_ENABLE 0
-#define PSC_GLOBAL_DISABLE 1
-#define PSC_GLOBAL_VLARB_ENABLE 2
-#define PSC_GLOBAL_VLARB_DISABLE 3
-#define PSC_CM_RESET 4
-#define PSC_DATA_VL_ENABLE 5
-#define PSC_DATA_VL_DISABLE 6
-
-void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl);
-void pio_send_control(struct hfi1_devdata *dd, int op);
-
-/*
- * PIO copy routines: pio_copy() takes the whole source in one call, the
- * seg_pio_copy_*() routines take it in start/mid/end pieces.
- */
-void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc,
-             const void *from, size_t count);
-void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc,
-                       const void *from, size_t nbytes);
-void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes);
-void seg_pio_copy_end(struct pio_buf *pbuf);
-
-#endif /* _PIO_H */
diff --git a/drivers/staging/rdma/hfi1/pio_copy.c b/drivers/staging/rdma/hfi1/pio_copy.c
deleted file mode 100644 (file)
index 8c25e1b..0000000
+++ /dev/null
@@ -1,867 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "hfi.h"
-
-/* additive distance between non-SOP and SOP space */
-#define SOP_DISTANCE (TXE_PIO_SIZE / 2)
-#define PIO_BLOCK_MASK (PIO_BLOCK_SIZE - 1)
-/* number of QUADWORDs in a block */
-#define PIO_BLOCK_QWS (PIO_BLOCK_SIZE / sizeof(u64))
-
-/**
- * pio_copy - copy data block to MMIO space
- * @pbuf: a number of blocks allocated within a PIO send context
- * @pbc: PBC to send
- * @from: source, must be 8 byte aligned
- * @count: number of DWORD (32-bit) quantities to copy from source
- *
- * Copy data from source to PIO Send Buffer memory, 8 bytes at a time.
- * Must always write full BLOCK_SIZE bytes blocks.  The first block must
- * be written to the corresponding SOP=1 address.
- *
- * Known:
- * o pbuf->start always starts on a block boundary
- * o pbuf can wrap only at a block boundary
- */
-void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc,
-             const void *from, size_t count)
-{
-       void __iomem *dest = pbuf->start + SOP_DISTANCE;
-       void __iomem *send = dest + PIO_BLOCK_SIZE;
-       void __iomem *dend;                     /* 8-byte data end */
-
-       /* write the PBC */
-       writeq(pbc, dest);
-       dest += sizeof(u64);
-
-       /* calculate where the QWORD data ends - in SOP=1 space */
-       dend = dest + ((count >> 1) * sizeof(u64));
-
-       if (dend < send) {
-               /*
-                * all QWORD data is within the SOP block, does *not*
-                * reach the end of the SOP block
-                */
-
-               while (dest < dend) {
-                       writeq(*(u64 *)from, dest);
-                       from += sizeof(u64);
-                       dest += sizeof(u64);
-               }
-               /*
-                * No boundary checks are needed here:
-                * 0. We're not on the SOP block boundary
-                * 1. The possible DWORD dangle will still be within
-                *    the SOP block
-                * 2. We cannot wrap except on a block boundary.
-                */
-       } else {
-               /* QWORD data extends _to_ or beyond the SOP block */
-
-               /* write 8-byte SOP chunk data */
-               while (dest < send) {
-                       writeq(*(u64 *)from, dest);
-                       from += sizeof(u64);
-                       dest += sizeof(u64);
-               }
-               /* drop out of the SOP range */
-               dest -= SOP_DISTANCE;
-               dend -= SOP_DISTANCE;
-
-               /*
-                * If the wrap comes before or matches the data end,
-                * copy until until the wrap, then wrap.
-                *
-                * If the data ends at the end of the SOP above and
-                * the buffer wraps, then pbuf->end == dend == dest
-                * and nothing will get written, but we will wrap in
-                * case there is a dangling DWORD.
-                */
-               if (pbuf->end <= dend) {
-                       while (dest < pbuf->end) {
-                               writeq(*(u64 *)from, dest);
-                               from += sizeof(u64);
-                               dest += sizeof(u64);
-                       }
-
-                       dest -= pbuf->size;
-                       dend -= pbuf->size;
-               }
-
-               /* write 8-byte non-SOP, non-wrap chunk data */
-               while (dest < dend) {
-                       writeq(*(u64 *)from, dest);
-                       from += sizeof(u64);
-                       dest += sizeof(u64);
-               }
-       }
-       /* at this point we have wrapped if we are going to wrap */
-
-       /* write dangling u32, if any */
-       if (count & 1) {
-               union mix val;
-
-               val.val64 = 0;
-               val.val32[0] = *(u32 *)from;
-               writeq(val.val64, dest);
-               dest += sizeof(u64);
-       }
-       /*
-        * fill in rest of block, no need to check pbuf->end
-        * as we only wrap on a block boundary
-        */
-       while (((unsigned long)dest & PIO_BLOCK_MASK) != 0) {
-               writeq(0, dest);
-               dest += sizeof(u64);
-       }
-
-       /* finished with this buffer */
-       this_cpu_dec(*pbuf->sc->buffers_allocated);
-       preempt_enable();
-}
-
-/* USE_SHIFTS is faster in user-space tests on a Xeon X5570 @ 2.93GHz */
-#define USE_SHIFTS 1
-#ifdef USE_SHIFTS
-/*
- * Handle carry bytes using shifts and masks.
- *
- * NOTE: the value the unused portion of carry is expected to always be zero.
- */
-
-/*
- * "zero" shift - bit shift used to zero out upper bytes.  Input is
- * the count of LSB bytes to preserve.
- */
-#define zshift(x) (8 * (8 - (x)))
-
-/*
- * "merge" shift - bit shift used to merge with carry bytes.  Input is
- * the LSB byte count to move beyond.
- */
-#define mshift(x) (8 * (x))
-
-/*
- * Read nbytes bytes from "from" and return them in the LSB bytes
- * of pbuf->carry.  Other bytes are zeroed.  Any previous value
- * pbuf->carry is lost.
- *
- * NOTES:
- * o do not read from from if nbytes is zero
- * o from may _not_ be u64 aligned
- * o nbytes must not span a QW boundary
- */
-static inline void read_low_bytes(struct pio_buf *pbuf, const void *from,
-                                 unsigned int nbytes)
-{
-       unsigned long off;
-
-       if (nbytes == 0) {
-               pbuf->carry.val64 = 0;
-       } else {
-               /* align our pointer */
-               off = (unsigned long)from & 0x7;
-               from = (void *)((unsigned long)from & ~0x7l);
-               pbuf->carry.val64 = ((*(u64 *)from)
-                               << zshift(nbytes + off))/* zero upper bytes */
-                               >> zshift(nbytes);      /* place at bottom */
-       }
-       pbuf->carry_bytes = nbytes;
-}
-
-/*
- * Read nbytes bytes from "from" and put them at the next significant bytes
- * of pbuf->carry.  Unused bytes are zeroed.  It is expected that the extra
- * read does not overfill carry.
- *
- * NOTES:
- * o from may _not_ be u64 aligned
- * o nbytes may span a QW boundary
- */
-static inline void read_extra_bytes(struct pio_buf *pbuf,
-                                   const void *from, unsigned int nbytes)
-{
-       unsigned long off = (unsigned long)from & 0x7;
-       unsigned int room, xbytes;
-
-       /* align our pointer */
-       from = (void *)((unsigned long)from & ~0x7l);
-
-       /* check count first - don't read anything if count is zero */
-       while (nbytes) {
-               /* find the number of bytes in this u64 */
-               room = 8 - off; /* this u64 has room for this many bytes */
-               xbytes = min(room, nbytes);
-
-               /*
-                * shift down to zero lower bytes, shift up to zero upper
-                * bytes, shift back down to move into place
-                */
-               pbuf->carry.val64 |= (((*(u64 *)from)
-                                       >> mshift(off))
-                                       << zshift(xbytes))
-                                       >> zshift(xbytes + pbuf->carry_bytes);
-               off = 0;
-               pbuf->carry_bytes += xbytes;
-               nbytes -= xbytes;
-               from += sizeof(u64);
-       }
-}
-
-/*
- * Zero extra bytes from the end of pbuf->carry.
- *
- * NOTES:
- * o zbytes <= old_bytes
- */
-static inline void zero_extra_bytes(struct pio_buf *pbuf, unsigned int zbytes)
-{
-       unsigned int remaining;
-
-       if (zbytes == 0)        /* nothing to do */
-               return;
-
-       remaining = pbuf->carry_bytes - zbytes; /* remaining bytes */
-
-       /* NOTE: zshift only guaranteed to work if remaining != 0 */
-       if (remaining)
-               pbuf->carry.val64 = (pbuf->carry.val64 << zshift(remaining))
-                                       >> zshift(remaining);
-       else
-               pbuf->carry.val64 = 0;
-       pbuf->carry_bytes = remaining;
-}
-
-/*
- * Write a quad word using parts of pbuf->carry and the next 8 bytes of src.
- * Put the unused part of the next 8 bytes of src into the LSB bytes of
- * pbuf->carry with the upper bytes zeroed..
- *
- * NOTES:
- * o result must keep unused bytes zeroed
- * o src must be u64 aligned
- */
-static inline void merge_write8(
-       struct pio_buf *pbuf,
-       void __iomem *dest,
-       const void *src)
-{
-       u64 new, temp;
-
-       new = *(u64 *)src;
-       temp = pbuf->carry.val64 | (new << mshift(pbuf->carry_bytes));
-       writeq(temp, dest);
-       pbuf->carry.val64 = new >> zshift(pbuf->carry_bytes);
-}
-
-/*
- * Write a quad word using all bytes of carry.
- */
-static inline void carry8_write8(union mix carry, void __iomem *dest)
-{
-       writeq(carry.val64, dest);
-}
-
-/*
- * Write a quad word using all the valid bytes of carry.  If carry
- * has zero valid bytes, nothing is written.
- * Returns 0 on nothing written, non-zero on quad word written.
- */
-static inline int carry_write8(struct pio_buf *pbuf, void __iomem *dest)
-{
-       if (pbuf->carry_bytes) {
-               /* unused bytes are always kept zeroed, so just write */
-               writeq(pbuf->carry.val64, dest);
-               return 1;
-       }
-
-       return 0;
-}
-
-#else /* USE_SHIFTS */
-/*
- * Handle carry bytes using byte copies.
- *
- * NOTE: the value the unused portion of carry is left uninitialized.
- */
-
-/*
- * Jump copy - no-loop copy for < 8 bytes.
- */
-static inline void jcopy(u8 *dest, const u8 *src, u32 n)
-{
-       switch (n) {
-       case 7:
-               *dest++ = *src++;
-       case 6:
-               *dest++ = *src++;
-       case 5:
-               *dest++ = *src++;
-       case 4:
-               *dest++ = *src++;
-       case 3:
-               *dest++ = *src++;
-       case 2:
-               *dest++ = *src++;
-       case 1:
-               *dest++ = *src++;
-       }
-}
-
-/*
- * Read nbytes from "from" and and place them in the low bytes
- * of pbuf->carry.  Other bytes are left as-is.  Any previous
- * value in pbuf->carry is lost.
- *
- * NOTES:
- * o do not read from from if nbytes is zero
- * o from may _not_ be u64 aligned.
- */
-static inline void read_low_bytes(struct pio_buf *pbuf, const void *from,
-                                 unsigned int nbytes)
-{
-       jcopy(&pbuf->carry.val8[0], from, nbytes);
-       pbuf->carry_bytes = nbytes;
-}
-
-/*
- * Read nbytes bytes from "from" and put them at the end of pbuf->carry.
- * It is expected that the extra read does not overfill carry.
- *
- * NOTES:
- * o from may _not_ be u64 aligned
- * o nbytes may span a QW boundary
- */
-static inline void read_extra_bytes(struct pio_buf *pbuf,
-                                   const void *from, unsigned int nbytes)
-{
-       jcopy(&pbuf->carry.val8[pbuf->carry_bytes], from, nbytes);
-       pbuf->carry_bytes += nbytes;
-}
-
-/*
- * Zero extra bytes from the end of pbuf->carry.
- *
- * We do not care about the value of unused bytes in carry, so just
- * reduce the byte count.
- *
- * NOTES:
- * o zbytes <= old_bytes
- */
-static inline void zero_extra_bytes(struct pio_buf *pbuf, unsigned int zbytes)
-{
-       pbuf->carry_bytes -= zbytes;
-}
-
-/*
- * Write a quad word using parts of pbuf->carry and the next 8 bytes of src.
- * Put the unused part of the next 8 bytes of src into the low bytes of
- * pbuf->carry.
- */
-static inline void merge_write8(
-       struct pio_buf *pbuf,
-       void *dest,
-       const void *src)
-{
-       u32 remainder = 8 - pbuf->carry_bytes;
-
-       jcopy(&pbuf->carry.val8[pbuf->carry_bytes], src, remainder);
-       writeq(pbuf->carry.val64, dest);
-       jcopy(&pbuf->carry.val8[0], src + remainder, pbuf->carry_bytes);
-}
-
-/*
- * Write a quad word using all bytes of carry.
- */
-static inline void carry8_write8(union mix carry, void *dest)
-{
-       writeq(carry.val64, dest);
-}
-
-/*
- * Write a quad word using all the valid bytes of carry.  If carry
- * has zero valid bytes, nothing is written.
- * Returns 0 on nothing written, non-zero on quad word written.
- */
-static inline int carry_write8(struct pio_buf *pbuf, void *dest)
-{
-       if (pbuf->carry_bytes) {
-               u64 zero = 0;
-
-               jcopy(&pbuf->carry.val8[pbuf->carry_bytes], (u8 *)&zero,
-                     8 - pbuf->carry_bytes);
-               writeq(pbuf->carry.val64, dest);
-               return 1;
-       }
-
-       return 0;
-}
-#endif /* USE_SHIFTS */
-
-/*
- * Segmented PIO Copy - start
- *
- * Start a PIO copy.
- *
- * @pbuf: destination buffer
- * @pbc: the PBC for the PIO buffer
- * @from: data source, QWORD aligned
- * @nbytes: bytes to copy
- */
-void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc,
-                       const void *from, size_t nbytes)
-{
-       void __iomem *dest = pbuf->start + SOP_DISTANCE;
-       void __iomem *send = dest + PIO_BLOCK_SIZE;
-       void __iomem *dend;                     /* 8-byte data end */
-
-       writeq(pbc, dest);
-       dest += sizeof(u64);
-
-       /* calculate where the QWORD data ends - in SOP=1 space */
-       dend = dest + ((nbytes >> 3) * sizeof(u64));
-
-       if (dend < send) {
-               /*
-                * all QWORD data is within the SOP block, does *not*
-                * reach the end of the SOP block
-                */
-
-               while (dest < dend) {
-                       writeq(*(u64 *)from, dest);
-                       from += sizeof(u64);
-                       dest += sizeof(u64);
-               }
-               /*
-                * No boundary checks are needed here:
-                * 0. We're not on the SOP block boundary
-                * 1. The possible DWORD dangle will still be within
-                *    the SOP block
-                * 2. We cannot wrap except on a block boundary.
-                */
-       } else {
-               /* QWORD data extends _to_ or beyond the SOP block */
-
-               /* write 8-byte SOP chunk data */
-               while (dest < send) {
-                       writeq(*(u64 *)from, dest);
-                       from += sizeof(u64);
-                       dest += sizeof(u64);
-               }
-               /* drop out of the SOP range */
-               dest -= SOP_DISTANCE;
-               dend -= SOP_DISTANCE;
-
-               /*
-                * If the wrap comes before or matches the data end,
-                * copy until until the wrap, then wrap.
-                *
-                * If the data ends at the end of the SOP above and
-                * the buffer wraps, then pbuf->end == dend == dest
-                * and nothing will get written, but we will wrap in
-                * case there is a dangling DWORD.
-                */
-               if (pbuf->end <= dend) {
-                       while (dest < pbuf->end) {
-                               writeq(*(u64 *)from, dest);
-                               from += sizeof(u64);
-                               dest += sizeof(u64);
-                       }
-
-                       dest -= pbuf->size;
-                       dend -= pbuf->size;
-               }
-
-               /* write 8-byte non-SOP, non-wrap chunk data */
-               while (dest < dend) {
-                       writeq(*(u64 *)from, dest);
-                       from += sizeof(u64);
-                       dest += sizeof(u64);
-               }
-       }
-       /* at this point we have wrapped if we are going to wrap */
-
-       /* ...but it doesn't matter as we're done writing */
-
-       /* save dangling bytes, if any */
-       read_low_bytes(pbuf, from, nbytes & 0x7);
-
-       pbuf->qw_written = 1 /*PBC*/ + (nbytes >> 3);
-}
-
-/*
- * Mid copy helper, "mixed case" - source is 64-bit aligned but carry
- * bytes are non-zero.
- *
- * Whole u64s must be written to the chip, so bytes must be manually merged.
- *
- * @pbuf: destination buffer
- * @from: data source, is QWORD aligned.
- * @nbytes: bytes to copy
- *
- * Must handle nbytes < 8.
- */
-static void mid_copy_mix(struct pio_buf *pbuf, const void *from, size_t nbytes)
-{
-       void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
-       void __iomem *dend;                     /* 8-byte data end */
-       unsigned long qw_to_write = (pbuf->carry_bytes + nbytes) >> 3;
-       unsigned long bytes_left = (pbuf->carry_bytes + nbytes) & 0x7;
-
-       /* calculate 8-byte data end */
-       dend = dest + (qw_to_write * sizeof(u64));
-
-       if (pbuf->qw_written < PIO_BLOCK_QWS) {
-               /*
-                * Still within SOP block.  We don't need to check for
-                * wrap because we are still in the first block and
-                * can only wrap on block boundaries.
-                */
-               void __iomem *send;             /* SOP end */
-               void __iomem *xend;
-
-               /*
-                * calculate the end of data or end of block, whichever
-                * comes first
-                */
-               send = pbuf->start + PIO_BLOCK_SIZE;
-               xend = min(send, dend);
-
-               /* shift up to SOP=1 space */
-               dest += SOP_DISTANCE;
-               xend += SOP_DISTANCE;
-
-               /* write 8-byte chunk data */
-               while (dest < xend) {
-                       merge_write8(pbuf, dest, from);
-                       from += sizeof(u64);
-                       dest += sizeof(u64);
-               }
-
-               /* shift down to SOP=0 space */
-               dest -= SOP_DISTANCE;
-       }
-       /*
-        * At this point dest could be (either, both, or neither):
-        * - at dend
-        * - at the wrap
-        */
-
-       /*
-        * If the wrap comes before or matches the data end,
-        * copy until until the wrap, then wrap.
-        *
-        * If dest is at the wrap, we will fall into the if,
-        * not do the loop, when wrap.
-        *
-        * If the data ends at the end of the SOP above and
-        * the buffer wraps, then pbuf->end == dend == dest
-        * and nothing will get written.
-        */
-       if (pbuf->end <= dend) {
-               while (dest < pbuf->end) {
-                       merge_write8(pbuf, dest, from);
-                       from += sizeof(u64);
-                       dest += sizeof(u64);
-               }
-
-               dest -= pbuf->size;
-               dend -= pbuf->size;
-       }
-
-       /* write 8-byte non-SOP, non-wrap chunk data */
-       while (dest < dend) {
-               merge_write8(pbuf, dest, from);
-               from += sizeof(u64);
-               dest += sizeof(u64);
-       }
-
-       /* adjust carry */
-       if (pbuf->carry_bytes < bytes_left) {
-               /* need to read more */
-               read_extra_bytes(pbuf, from, bytes_left - pbuf->carry_bytes);
-       } else {
-               /* remove invalid bytes */
-               zero_extra_bytes(pbuf, pbuf->carry_bytes - bytes_left);
-       }
-
-       pbuf->qw_written += qw_to_write;
-}
-
-/*
- * Mid copy helper, "straight case" - source pointer is 64-bit aligned
- * with no carry bytes.
- *
- * @pbuf: destination buffer
- * @from: data source, is QWORD aligned
- * @nbytes: bytes to copy
- *
- * Must handle nbytes < 8.
- */
-static void mid_copy_straight(struct pio_buf *pbuf,
-                             const void *from, size_t nbytes)
-{
-       void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
-       void __iomem *dend;                     /* 8-byte data end */
-
-       /* calculate 8-byte data end */
-       dend = dest + ((nbytes >> 3) * sizeof(u64));
-
-       if (pbuf->qw_written < PIO_BLOCK_QWS) {
-               /*
-                * Still within SOP block.  We don't need to check for
-                * wrap because we are still in the first block and
-                * can only wrap on block boundaries.
-                */
-               void __iomem *send;             /* SOP end */
-               void __iomem *xend;
-
-               /*
-                * calculate the end of data or end of block, whichever
-                * comes first
-                */
-               send = pbuf->start + PIO_BLOCK_SIZE;
-               xend = min(send, dend);
-
-               /* shift up to SOP=1 space */
-               dest += SOP_DISTANCE;
-               xend += SOP_DISTANCE;
-
-               /* write 8-byte chunk data */
-               while (dest < xend) {
-                       writeq(*(u64 *)from, dest);
-                       from += sizeof(u64);
-                       dest += sizeof(u64);
-               }
-
-               /* shift down to SOP=0 space */
-               dest -= SOP_DISTANCE;
-       }
-       /*
-        * At this point dest could be (either, both, or neither):
-        * - at dend
-        * - at the wrap
-        */
-
-       /*
-        * If the wrap comes before or matches the data end,
-        * copy until until the wrap, then wrap.
-        *
-        * If dest is at the wrap, we will fall into the if,
-        * not do the loop, when wrap.
-        *
-        * If the data ends at the end of the SOP above and
-        * the buffer wraps, then pbuf->end == dend == dest
-        * and nothing will get written.
-        */
-       if (pbuf->end <= dend) {
-               while (dest < pbuf->end) {
-                       writeq(*(u64 *)from, dest);
-                       from += sizeof(u64);
-                       dest += sizeof(u64);
-               }
-
-               dest -= pbuf->size;
-               dend -= pbuf->size;
-       }
-
-       /* write 8-byte non-SOP, non-wrap chunk data */
-       while (dest < dend) {
-               writeq(*(u64 *)from, dest);
-               from += sizeof(u64);
-               dest += sizeof(u64);
-       }
-
-       /* we know carry_bytes was zero on entry to this routine */
-       read_low_bytes(pbuf, from, nbytes & 0x7);
-
-       pbuf->qw_written += nbytes >> 3;
-}
-
-/*
- * Segmented PIO Copy - middle
- *
- * Must handle any aligned tail and any aligned source with any byte count.
- *
- * @pbuf: a number of blocks allocated within a PIO send context
- * @from: data source
- * @nbytes: number of bytes to copy
- */
-void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes)
-{
-       unsigned long from_align = (unsigned long)from & 0x7;
-
-       if (pbuf->carry_bytes + nbytes < 8) {
-               /* not enough bytes to fill a QW */
-               read_extra_bytes(pbuf, from, nbytes);
-               return;
-       }
-
-       if (from_align) {
-               /* misaligned source pointer - align it */
-               unsigned long to_align;
-
-               /* bytes to read to align "from" */
-               to_align = 8 - from_align;
-
-               /*
-                * In the advance-to-alignment logic below, we do not need
-                * to check if we are using more than nbytes.  This is because
-                * if we are here, we already know that carry+nbytes will
-                * fill at least one QW.
-                */
-               if (pbuf->carry_bytes + to_align < 8) {
-                       /* not enough align bytes to fill a QW */
-                       read_extra_bytes(pbuf, from, to_align);
-                       from += to_align;
-                       nbytes -= to_align;
-               } else {
-                       /* bytes to fill carry */
-                       unsigned long to_fill = 8 - pbuf->carry_bytes;
-                       /* bytes left over to be read */
-                       unsigned long extra = to_align - to_fill;
-                       void __iomem *dest;
-
-                       /* fill carry... */
-                       read_extra_bytes(pbuf, from, to_fill);
-                       from += to_fill;
-                       nbytes -= to_fill;
-
-                       /* ...now write carry */
-                       dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
-
-                       /*
-                        * The two checks immediately below cannot both be
-                        * true, hence the else.  If we have wrapped, we
-                        * cannot still be within the first block.
-                        * Conversely, if we are still in the first block, we
-                        * cannot have wrapped.  We do the wrap check first
-                        * as that is more likely.
-                        */
-                       /* adjust if we've wrapped */
-                       if (dest >= pbuf->end)
-                               dest -= pbuf->size;
-                       /* jump to SOP range if within the first block */
-                       else if (pbuf->qw_written < PIO_BLOCK_QWS)
-                               dest += SOP_DISTANCE;
-
-                       carry8_write8(pbuf->carry, dest);
-                       pbuf->qw_written++;
-
-                       /* read any extra bytes to do final alignment */
-                       /* this will overwrite anything in pbuf->carry */
-                       read_low_bytes(pbuf, from, extra);
-                       from += extra;
-                       nbytes -= extra;
-               }
-
-               /* at this point, from is QW aligned */
-       }
-
-       if (pbuf->carry_bytes)
-               mid_copy_mix(pbuf, from, nbytes);
-       else
-               mid_copy_straight(pbuf, from, nbytes);
-}
-
-/*
- * Segmented PIO Copy - end
- *
- * Write any remainder (in pbuf->carry) and finish writing the whole block.
- *
- * @pbuf: a number of blocks allocated within a PIO send context
- */
-void seg_pio_copy_end(struct pio_buf *pbuf)
-{
-       void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
-
-       /*
-        * The two checks immediately below cannot both be true, hence the
-        * else.  If we have wrapped, we cannot still be within the first
-        * block.  Conversely, if we are still in the first block, we
-        * cannot have wrapped.  We do the wrap check first as that is
-        * more likely.
-        */
-       /* adjust if we have wrapped */
-       if (dest >= pbuf->end)
-               dest -= pbuf->size;
-       /* jump to the SOP range if within the first block */
-       else if (pbuf->qw_written < PIO_BLOCK_QWS)
-               dest += SOP_DISTANCE;
-
-       /* write final bytes, if any */
-       if (carry_write8(pbuf, dest)) {
-               dest += sizeof(u64);
-               /*
-                * NOTE: We do not need to recalculate whether dest needs
-                * SOP_DISTANCE or not.
-                *
-                * If we are in the first block and the dangle write
-                * keeps us in the same block, dest will need
-                * to retain SOP_DISTANCE in the loop below.
-                *
-                * If we are in the first block and the dangle write pushes
-                * us to the next block, then loop below will not run
-                * and dest is not used.  Hence we do not need to update
-                * it.
-                *
-                * If we are past the first block, then SOP_DISTANCE
-                * was never added, so there is nothing to do.
-                */
-       }
-
-       /* fill in rest of block */
-       while (((unsigned long)dest & PIO_BLOCK_MASK) != 0) {
-               writeq(0, dest);
-               dest += sizeof(u64);
-       }
-
-       /* finished with this buffer */
-       this_cpu_dec(*pbuf->sc->buffers_allocated);
-       preempt_enable();
-}
diff --git a/drivers/staging/rdma/hfi1/platform.c b/drivers/staging/rdma/hfi1/platform.c
deleted file mode 100644 (file)
index 8fe8a20..0000000
+++ /dev/null
@@ -1,902 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "hfi.h"
-#include "efivar.h"
-
-void get_platform_config(struct hfi1_devdata *dd)
-{
-       int ret = 0;
-       unsigned long size = 0;
-       u8 *temp_platform_config = NULL;
-
-       ret = read_hfi1_efi_var(dd, "configuration", &size,
-                               (void **)&temp_platform_config);
-       if (ret) {
-               dd_dev_info(dd,
-                           "%s: Failed to get platform config from UEFI, falling back to request firmware\n",
-                           __func__);
-               /* fall back to request firmware */
-               platform_config_load = 1;
-               goto bail;
-       }
-
-       dd->platform_config.data = temp_platform_config;
-       dd->platform_config.size = size;
-
-bail:
-       /* exit */;
-}
-
-void free_platform_config(struct hfi1_devdata *dd)
-{
-       if (!platform_config_load) {
-               /*
-                * was loaded from EFI, release memory
-                * allocated by read_efi_var
-                */
-               kfree(dd->platform_config.data);
-       }
-       /*
-        * else do nothing, dispose_firmware will release
-        * struct firmware platform_config on driver exit
-        */
-}
-
-int set_qsfp_tx(struct hfi1_pportdata *ppd, int on)
-{
-       u8 tx_ctrl_byte = on ? 0x0 : 0xF;
-       int ret = 0;
-
-       ret = qsfp_write(ppd, ppd->dd->hfi1_id, QSFP_TX_CTRL_BYTE_OFFS,
-                        &tx_ctrl_byte, 1);
-       /* we expected 1, so consider 0 an error */
-       if (ret == 0)
-               ret = -EIO;
-       else if (ret == 1)
-               ret = 0;
-       return ret;
-}
-
-static int qual_power(struct hfi1_pportdata *ppd)
-{
-       u32 cable_power_class = 0, power_class_max = 0;
-       u8 *cache = ppd->qsfp_info.cache;
-       int ret = 0;
-
-       ret = get_platform_config_field(
-               ppd->dd, PLATFORM_CONFIG_SYSTEM_TABLE, 0,
-               SYSTEM_TABLE_QSFP_POWER_CLASS_MAX, &power_class_max, 4);
-       if (ret)
-               return ret;
-
-       cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
-
-       if (cable_power_class > power_class_max)
-               ppd->offline_disabled_reason =
-                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY);
-
-       if (ppd->offline_disabled_reason ==
-                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY)) {
-               dd_dev_info(
-                       ppd->dd,
-                       "%s: Port disabled due to system power restrictions\n",
-                       __func__);
-               ret = -EPERM;
-       }
-       return ret;
-}
-
-static int qual_bitrate(struct hfi1_pportdata *ppd)
-{
-       u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled;
-       u8 *cache = ppd->qsfp_info.cache;
-
-       if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G) &&
-           cache[QSFP_NOM_BIT_RATE_250_OFFS] < 0x64)
-               ppd->offline_disabled_reason =
-                          HFI1_ODR_MASK(OPA_LINKDOWN_REASON_LINKSPEED_POLICY);
-
-       if ((lss & OPA_LINK_SPEED_12_5G) && (lse & OPA_LINK_SPEED_12_5G) &&
-           cache[QSFP_NOM_BIT_RATE_100_OFFS] < 0x7D)
-               ppd->offline_disabled_reason =
-                          HFI1_ODR_MASK(OPA_LINKDOWN_REASON_LINKSPEED_POLICY);
-
-       if (ppd->offline_disabled_reason ==
-                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_LINKSPEED_POLICY)) {
-               dd_dev_info(
-                       ppd->dd,
-                       "%s: Cable failed bitrate check, disabling port\n",
-                       __func__);
-               return -EPERM;
-       }
-       return 0;
-}
-
-static int set_qsfp_high_power(struct hfi1_pportdata *ppd)
-{
-       u8 cable_power_class = 0, power_ctrl_byte = 0;
-       u8 *cache = ppd->qsfp_info.cache;
-       int ret;
-
-       cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
-
-       if (cable_power_class > QSFP_POWER_CLASS_1) {
-               power_ctrl_byte = cache[QSFP_PWR_CTRL_BYTE_OFFS];
-
-               power_ctrl_byte |= 1;
-               power_ctrl_byte &= ~(0x2);
-
-               ret = qsfp_write(ppd, ppd->dd->hfi1_id,
-                                QSFP_PWR_CTRL_BYTE_OFFS,
-                                &power_ctrl_byte, 1);
-               if (ret != 1)
-                       return -EIO;
-
-               if (cable_power_class > QSFP_POWER_CLASS_4) {
-                       power_ctrl_byte |= (1 << 2);
-                       ret = qsfp_write(ppd, ppd->dd->hfi1_id,
-                                        QSFP_PWR_CTRL_BYTE_OFFS,
-                                        &power_ctrl_byte, 1);
-                       if (ret != 1)
-                               return -EIO;
-               }
-
-               /* SFF 8679 rev 1.7 LPMode Deassert time */
-               msleep(300);
-       }
-       return 0;
-}
-
-static void apply_rx_cdr(struct hfi1_pportdata *ppd,
-                        u32 rx_preset_index,
-                        u8 *cdr_ctrl_byte)
-{
-       u32 rx_preset;
-       u8 *cache = ppd->qsfp_info.cache;
-       int cable_power_class;
-
-       if (!((cache[QSFP_MOD_PWR_OFFS] & 0x4) &&
-             (cache[QSFP_CDR_INFO_OFFS] & 0x40)))
-               return;
-
-       /* RX CDR present, bypass supported */
-       cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
-
-       if (cable_power_class <= QSFP_POWER_CLASS_3) {
-               /* Power class <= 3, ignore config & turn RX CDR on */
-               *cdr_ctrl_byte |= 0xF;
-               return;
-       }
-
-       get_platform_config_field(
-               ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE,
-               rx_preset_index, RX_PRESET_TABLE_QSFP_RX_CDR_APPLY,
-               &rx_preset, 4);
-
-       if (!rx_preset) {
-               dd_dev_info(
-                       ppd->dd,
-                       "%s: RX_CDR_APPLY is set to disabled\n",
-                       __func__);
-               return;
-       }
-       get_platform_config_field(
-               ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE,
-               rx_preset_index, RX_PRESET_TABLE_QSFP_RX_CDR,
-               &rx_preset, 4);
-
-       /* Expand cdr setting to all 4 lanes */
-       rx_preset = (rx_preset | (rx_preset << 1) |
-                       (rx_preset << 2) | (rx_preset << 3));
-
-       if (rx_preset) {
-               *cdr_ctrl_byte |= rx_preset;
-       } else {
-               *cdr_ctrl_byte &= rx_preset;
-               /* Preserve current TX CDR status */
-               *cdr_ctrl_byte |= (cache[QSFP_CDR_CTRL_BYTE_OFFS] & 0xF0);
-       }
-}
-
-static void apply_tx_cdr(struct hfi1_pportdata *ppd,
-                        u32 tx_preset_index,
-                        u8 *cdr_ctrl_byte)
-{
-       u32 tx_preset;
-       u8 *cache = ppd->qsfp_info.cache;
-       int cable_power_class;
-
-       if (!((cache[QSFP_MOD_PWR_OFFS] & 0x8) &&
-             (cache[QSFP_CDR_INFO_OFFS] & 0x80)))
-               return;
-
-       /* TX CDR present, bypass supported */
-       cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
-
-       if (cable_power_class <= QSFP_POWER_CLASS_3) {
-               /* Power class <= 3, ignore config & turn TX CDR on */
-               *cdr_ctrl_byte |= 0xF0;
-               return;
-       }
-
-       get_platform_config_field(
-               ppd->dd,
-               PLATFORM_CONFIG_TX_PRESET_TABLE, tx_preset_index,
-               TX_PRESET_TABLE_QSFP_TX_CDR_APPLY, &tx_preset, 4);
-
-       if (!tx_preset) {
-               dd_dev_info(
-                       ppd->dd,
-                       "%s: TX_CDR_APPLY is set to disabled\n",
-                       __func__);
-               return;
-       }
-       get_platform_config_field(
-               ppd->dd,
-               PLATFORM_CONFIG_TX_PRESET_TABLE,
-               tx_preset_index,
-               TX_PRESET_TABLE_QSFP_TX_CDR, &tx_preset, 4);
-
-       /* Expand cdr setting to all 4 lanes */
-       tx_preset = (tx_preset | (tx_preset << 1) |
-                       (tx_preset << 2) | (tx_preset << 3));
-
-       if (tx_preset)
-               *cdr_ctrl_byte |= (tx_preset << 4);
-       else
-               /* Preserve current/determined RX CDR status */
-               *cdr_ctrl_byte &= ((tx_preset << 4) | 0xF);
-}
-
-static void apply_cdr_settings(
-               struct hfi1_pportdata *ppd, u32 rx_preset_index,
-               u32 tx_preset_index)
-{
-       u8 *cache = ppd->qsfp_info.cache;
-       u8 cdr_ctrl_byte = cache[QSFP_CDR_CTRL_BYTE_OFFS];
-
-       apply_rx_cdr(ppd, rx_preset_index, &cdr_ctrl_byte);
-
-       apply_tx_cdr(ppd, tx_preset_index, &cdr_ctrl_byte);
-
-       qsfp_write(ppd, ppd->dd->hfi1_id, QSFP_CDR_CTRL_BYTE_OFFS,
-                  &cdr_ctrl_byte, 1);
-}
-
-static void apply_tx_eq_auto(struct hfi1_pportdata *ppd)
-{
-       u8 *cache = ppd->qsfp_info.cache;
-       u8 tx_eq;
-
-       if (!(cache[QSFP_EQ_INFO_OFFS] & 0x8))
-               return;
-       /* Disable adaptive TX EQ if present */
-       tx_eq = cache[(128 * 3) + 241];
-       tx_eq &= 0xF0;
-       qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 241, &tx_eq, 1);
-}
-
-static void apply_tx_eq_prog(struct hfi1_pportdata *ppd, u32 tx_preset_index)
-{
-       u8 *cache = ppd->qsfp_info.cache;
-       u32 tx_preset;
-       u8 tx_eq;
-
-       if (!(cache[QSFP_EQ_INFO_OFFS] & 0x4))
-               return;
-
-       get_platform_config_field(
-               ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE,
-               tx_preset_index, TX_PRESET_TABLE_QSFP_TX_EQ_APPLY,
-               &tx_preset, 4);
-       if (!tx_preset) {
-               dd_dev_info(
-                       ppd->dd,
-                       "%s: TX_EQ_APPLY is set to disabled\n",
-                       __func__);
-               return;
-       }
-       get_platform_config_field(
-                       ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE,
-                       tx_preset_index, TX_PRESET_TABLE_QSFP_TX_EQ,
-                       &tx_preset, 4);
-
-       if (((cache[(128 * 3) + 224] & 0xF0) >> 4) < tx_preset) {
-               dd_dev_info(
-                       ppd->dd,
-                       "%s: TX EQ %x unsupported\n",
-                       __func__, tx_preset);
-
-               dd_dev_info(
-                       ppd->dd,
-                       "%s: Applying EQ %x\n",
-                       __func__, cache[608] & 0xF0);
-
-               tx_preset = (cache[608] & 0xF0) >> 4;
-       }
-
-       tx_eq = tx_preset | (tx_preset << 4);
-       qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 234, &tx_eq, 1);
-       qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 235, &tx_eq, 1);
-}
-
-static void apply_rx_eq_emp(struct hfi1_pportdata *ppd, u32 rx_preset_index)
-{
-       u32 rx_preset;
-       u8 rx_eq, *cache = ppd->qsfp_info.cache;
-
-       if (!(cache[QSFP_EQ_INFO_OFFS] & 0x2))
-               return;
-       get_platform_config_field(
-                       ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE,
-                       rx_preset_index, RX_PRESET_TABLE_QSFP_RX_EMP_APPLY,
-                       &rx_preset, 4);
-
-       if (!rx_preset) {
-               dd_dev_info(
-                       ppd->dd,
-                       "%s: RX_EMP_APPLY is set to disabled\n",
-                       __func__);
-               return;
-       }
-       get_platform_config_field(
-               ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE,
-               rx_preset_index, RX_PRESET_TABLE_QSFP_RX_EMP,
-               &rx_preset, 4);
-
-       if ((cache[(128 * 3) + 224] & 0xF) < rx_preset) {
-               dd_dev_info(
-                       ppd->dd,
-                       "%s: Requested RX EMP %x\n",
-                       __func__, rx_preset);
-
-               dd_dev_info(
-                       ppd->dd,
-                       "%s: Applying supported EMP %x\n",
-                       __func__, cache[608] & 0xF);
-
-               rx_preset = cache[608] & 0xF;
-       }
-
-       rx_eq = rx_preset | (rx_preset << 4);
-
-       qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 236, &rx_eq, 1);
-       qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 237, &rx_eq, 1);
-}
-
-static void apply_eq_settings(struct hfi1_pportdata *ppd,
-                             u32 rx_preset_index, u32 tx_preset_index)
-{
-       u8 *cache = ppd->qsfp_info.cache;
-
-       /* no point going on w/o a page 3 */
-       if (cache[2] & 4) {
-               dd_dev_info(ppd->dd,
-                           "%s: Upper page 03 not present\n",
-                           __func__);
-               return;
-       }
-
-       apply_tx_eq_auto(ppd);
-
-       apply_tx_eq_prog(ppd, tx_preset_index);
-
-       apply_rx_eq_emp(ppd, rx_preset_index);
-}
-
-static void apply_rx_amplitude_settings(
-               struct hfi1_pportdata *ppd, u32 rx_preset_index,
-               u32 tx_preset_index)
-{
-       u32 rx_preset;
-       u8 rx_amp = 0, i = 0, preferred = 0, *cache = ppd->qsfp_info.cache;
-
-       /* no point going on w/o a page 3 */
-       if (cache[2] & 4) {
-               dd_dev_info(ppd->dd,
-                           "%s: Upper page 03 not present\n",
-                           __func__);
-               return;
-       }
-       if (!(cache[QSFP_EQ_INFO_OFFS] & 0x1)) {
-               dd_dev_info(ppd->dd,
-                           "%s: RX_AMP_APPLY is set to disabled\n",
-                           __func__);
-               return;
-       }
-
-       get_platform_config_field(ppd->dd,
-                                 PLATFORM_CONFIG_RX_PRESET_TABLE,
-                                 rx_preset_index,
-                                 RX_PRESET_TABLE_QSFP_RX_AMP_APPLY,
-                                 &rx_preset, 4);
-
-       if (!rx_preset) {
-               dd_dev_info(ppd->dd,
-                           "%s: RX_AMP_APPLY is set to disabled\n",
-                           __func__);
-               return;
-       }
-       get_platform_config_field(ppd->dd,
-                                 PLATFORM_CONFIG_RX_PRESET_TABLE,
-                                 rx_preset_index,
-                                 RX_PRESET_TABLE_QSFP_RX_AMP,
-                                 &rx_preset, 4);
-
-       dd_dev_info(ppd->dd,
-                   "%s: Requested RX AMP %x\n",
-                   __func__,
-                   rx_preset);
-
-       for (i = 0; i < 4; i++) {
-               if (cache[(128 * 3) + 225] & (1 << i)) {
-                       preferred = i;
-                       if (preferred == rx_preset)
-                               break;
-               }
-       }
-
-       /*
-        * Verify that preferred RX amplitude is not just a
-        * fall through of the default
-        */
-       if (!preferred && !(cache[(128 * 3) + 225] & 0x1)) {
-               dd_dev_info(ppd->dd, "No supported RX AMP, not applying\n");
-               return;
-       }
-
-       dd_dev_info(ppd->dd,
-                   "%s: Applying RX AMP %x\n", __func__, preferred);
-
-       rx_amp = preferred | (preferred << 4);
-       qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 238, &rx_amp, 1);
-       qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 239, &rx_amp, 1);
-}
-
-#define OPA_INVALID_INDEX 0xFFF
-
-static void apply_tx_lanes(struct hfi1_pportdata *ppd, u8 field_id,
-                          u32 config_data, const char *message)
-{
-       u8 i;
-       int ret = HCMD_SUCCESS;
-
-       for (i = 0; i < 4; i++) {
-               ret = load_8051_config(ppd->dd, field_id, i, config_data);
-               if (ret != HCMD_SUCCESS) {
-                       dd_dev_err(
-                               ppd->dd,
-                               "%s: %s for lane %u failed\n",
-                               message, __func__, i);
-               }
-       }
-}
-
-static void apply_tunings(
-               struct hfi1_pportdata *ppd, u32 tx_preset_index,
-               u8 tuning_method, u32 total_atten, u8 limiting_active)
-{
-       int ret = 0;
-       u32 config_data = 0, tx_preset = 0;
-       u8 precur = 0, attn = 0, postcur = 0, external_device_config = 0;
-       u8 *cache = ppd->qsfp_info.cache;
-
-       /* Enable external device config if channel is limiting active */
-       read_8051_config(ppd->dd, LINK_OPTIMIZATION_SETTINGS,
-                        GENERAL_CONFIG, &config_data);
-       config_data |= limiting_active;
-       ret = load_8051_config(ppd->dd, LINK_OPTIMIZATION_SETTINGS,
-                              GENERAL_CONFIG, config_data);
-       if (ret != HCMD_SUCCESS)
-               dd_dev_err(
-                       ppd->dd,
-                       "%s: Failed to set enable external device config\n",
-                       __func__);
-
-       config_data = 0; /* re-init  */
-       /* Pass tuning method to 8051 */
-       read_8051_config(ppd->dd, LINK_TUNING_PARAMETERS, GENERAL_CONFIG,
-                        &config_data);
-       config_data |= tuning_method;
-       ret = load_8051_config(ppd->dd, LINK_TUNING_PARAMETERS, GENERAL_CONFIG,
-                              config_data);
-       if (ret != HCMD_SUCCESS)
-               dd_dev_err(ppd->dd, "%s: Failed to set tuning method\n",
-                          __func__);
-
-       /* Set same channel loss for both TX and RX */
-       config_data = 0 | (total_atten << 16) | (total_atten << 24);
-       apply_tx_lanes(ppd, CHANNEL_LOSS_SETTINGS, config_data,
-                      "Setting channel loss");
-
-       /* Inform 8051 of cable capabilities */
-       if (ppd->qsfp_info.cache_valid) {
-               external_device_config =
-                       ((cache[QSFP_MOD_PWR_OFFS] & 0x4) << 3) |
-                       ((cache[QSFP_MOD_PWR_OFFS] & 0x8) << 2) |
-                       ((cache[QSFP_EQ_INFO_OFFS] & 0x2) << 1) |
-                       (cache[QSFP_EQ_INFO_OFFS] & 0x4);
-               ret = read_8051_config(ppd->dd, DC_HOST_COMM_SETTINGS,
-                                      GENERAL_CONFIG, &config_data);
-               /* Clear, then set the external device config field */
-               config_data &= ~(0xFF << 24);
-               config_data |= (external_device_config << 24);
-               ret = load_8051_config(ppd->dd, DC_HOST_COMM_SETTINGS,
-                                      GENERAL_CONFIG, config_data);
-               if (ret != HCMD_SUCCESS)
-                       dd_dev_info(ppd->dd,
-                                   "%s: Failed set ext device config params\n",
-                                   __func__);
-       }
-
-       if (tx_preset_index == OPA_INVALID_INDEX) {
-               if (ppd->port_type == PORT_TYPE_QSFP && limiting_active)
-                       dd_dev_info(ppd->dd, "%s: Invalid Tx preset index\n",
-                                   __func__);
-               return;
-       }
-
-       /* Following for limiting active channels only */
-       get_platform_config_field(
-               ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE, tx_preset_index,
-               TX_PRESET_TABLE_PRECUR, &tx_preset, 4);
-       precur = tx_preset;
-
-       get_platform_config_field(
-               ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE,
-               tx_preset_index, TX_PRESET_TABLE_ATTN, &tx_preset, 4);
-       attn = tx_preset;
-
-       get_platform_config_field(
-               ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE,
-               tx_preset_index, TX_PRESET_TABLE_POSTCUR, &tx_preset, 4);
-       postcur = tx_preset;
-
-       config_data = precur | (attn << 8) | (postcur << 16);
-
-       apply_tx_lanes(ppd, TX_EQ_SETTINGS, config_data,
-                      "Applying TX settings");
-}
-
-/* Must be holding the QSFP i2c resource */
-static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
-                           u32 *ptr_rx_preset, u32 *ptr_total_atten)
-{
-       int ret;
-       u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled;
-       u8 *cache = ppd->qsfp_info.cache;
-
-       ppd->qsfp_info.limiting_active = 1;
-
-       ret = set_qsfp_tx(ppd, 0);
-       if (ret)
-               return ret;
-
-       ret = qual_power(ppd);
-       if (ret)
-               return ret;
-
-       ret = qual_bitrate(ppd);
-       if (ret)
-               return ret;
-
-       if (ppd->qsfp_info.reset_needed) {
-               reset_qsfp(ppd);
-               ppd->qsfp_info.reset_needed = 0;
-               refresh_qsfp_cache(ppd, &ppd->qsfp_info);
-       } else {
-               ppd->qsfp_info.reset_needed = 1;
-       }
-
-       ret = set_qsfp_high_power(ppd);
-       if (ret)
-               return ret;
-
-       if (cache[QSFP_EQ_INFO_OFFS] & 0x4) {
-               ret = get_platform_config_field(
-                       ppd->dd,
-                       PLATFORM_CONFIG_PORT_TABLE, 0,
-                       PORT_TABLE_TX_PRESET_IDX_ACTIVE_EQ,
-                       ptr_tx_preset, 4);
-               if (ret) {
-                       *ptr_tx_preset = OPA_INVALID_INDEX;
-                       return ret;
-               }
-       } else {
-               ret = get_platform_config_field(
-                       ppd->dd,
-                       PLATFORM_CONFIG_PORT_TABLE, 0,
-                       PORT_TABLE_TX_PRESET_IDX_ACTIVE_NO_EQ,
-                       ptr_tx_preset, 4);
-               if (ret) {
-                       *ptr_tx_preset = OPA_INVALID_INDEX;
-                       return ret;
-               }
-       }
-
-       ret = get_platform_config_field(
-               ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
-               PORT_TABLE_RX_PRESET_IDX, ptr_rx_preset, 4);
-       if (ret) {
-               *ptr_rx_preset = OPA_INVALID_INDEX;
-               return ret;
-       }
-
-       if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G))
-               get_platform_config_field(
-                       ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
-                       PORT_TABLE_LOCAL_ATTEN_25G, ptr_total_atten, 4);
-       else if ((lss & OPA_LINK_SPEED_12_5G) && (lse & OPA_LINK_SPEED_12_5G))
-               get_platform_config_field(
-                       ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
-                       PORT_TABLE_LOCAL_ATTEN_12G, ptr_total_atten, 4);
-
-       apply_cdr_settings(ppd, *ptr_rx_preset, *ptr_tx_preset);
-
-       apply_eq_settings(ppd, *ptr_rx_preset, *ptr_tx_preset);
-
-       apply_rx_amplitude_settings(ppd, *ptr_rx_preset, *ptr_tx_preset);
-
-       ret = set_qsfp_tx(ppd, 1);
-
-       return ret;
-}
-
-static int tune_qsfp(struct hfi1_pportdata *ppd,
-                    u32 *ptr_tx_preset, u32 *ptr_rx_preset,
-                    u8 *ptr_tuning_method, u32 *ptr_total_atten)
-{
-       u32 cable_atten = 0, remote_atten = 0, platform_atten = 0;
-       u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled;
-       int ret = 0;
-       u8 *cache = ppd->qsfp_info.cache;
-
-       switch ((cache[QSFP_MOD_TECH_OFFS] & 0xF0) >> 4) {
-       case 0xA ... 0xB:
-               ret = get_platform_config_field(
-                       ppd->dd,
-                       PLATFORM_CONFIG_PORT_TABLE, 0,
-                       PORT_TABLE_LOCAL_ATTEN_25G,
-                       &platform_atten, 4);
-               if (ret)
-                       return ret;
-
-               if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G))
-                       cable_atten = cache[QSFP_CU_ATTEN_12G_OFFS];
-               else if ((lss & OPA_LINK_SPEED_12_5G) &&
-                        (lse & OPA_LINK_SPEED_12_5G))
-                       cable_atten = cache[QSFP_CU_ATTEN_7G_OFFS];
-
-               /* Fallback to configured attenuation if cable memory is bad */
-               if (cable_atten == 0 || cable_atten > 36) {
-                       ret = get_platform_config_field(
-                               ppd->dd,
-                               PLATFORM_CONFIG_SYSTEM_TABLE, 0,
-                               SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_25G,
-                               &cable_atten, 4);
-                       if (ret)
-                               return ret;
-               }
-
-               ret = get_platform_config_field(
-                       ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
-                       PORT_TABLE_REMOTE_ATTEN_25G, &remote_atten, 4);
-               if (ret)
-                       return ret;
-
-               *ptr_total_atten = platform_atten + cable_atten + remote_atten;
-
-               *ptr_tuning_method = OPA_PASSIVE_TUNING;
-               break;
-       case 0x0 ... 0x9: /* fallthrough */
-       case 0xC: /* fallthrough */
-       case 0xE:
-               ret = tune_active_qsfp(ppd, ptr_tx_preset, ptr_rx_preset,
-                                      ptr_total_atten);
-               if (ret)
-                       return ret;
-
-               *ptr_tuning_method = OPA_ACTIVE_TUNING;
-               break;
-       case 0xD: /* fallthrough */
-       case 0xF:
-       default:
-               dd_dev_info(ppd->dd, "%s: Unknown/unsupported cable\n",
-                           __func__);
-               break;
-       }
-       return ret;
-}
-
-/*
- * This function communicates its success or failure via ppd->driver_link_ready
- * Thus, it depends on its association with start_link(...) which checks
- * driver_link_ready before proceeding with the link negotiation and
- * initialization process.
- */
-void tune_serdes(struct hfi1_pportdata *ppd)
-{
-       int ret = 0;
-       u32 total_atten = 0;
-       u32 remote_atten = 0, platform_atten = 0;
-       u32 rx_preset_index, tx_preset_index;
-       u8 tuning_method = 0, limiting_active = 0;
-       struct hfi1_devdata *dd = ppd->dd;
-
-       rx_preset_index = OPA_INVALID_INDEX;
-       tx_preset_index = OPA_INVALID_INDEX;
-
-       /* the link defaults to enabled */
-       ppd->link_enabled = 1;
-       /* the driver link ready state defaults to not ready */
-       ppd->driver_link_ready = 0;
-       ppd->offline_disabled_reason = HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE);
-
-       /* Skip the tuning for testing (loopback != none) and simulations */
-       if (loopback != LOOPBACK_NONE ||
-           ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
-               ppd->driver_link_ready = 1;
-               return;
-       }
-
-       ret = get_platform_config_field(ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
-                                       PORT_TABLE_PORT_TYPE, &ppd->port_type,
-                                       4);
-       if (ret)
-               ppd->port_type = PORT_TYPE_UNKNOWN;
-
-       switch (ppd->port_type) {
-       case PORT_TYPE_DISCONNECTED:
-               ppd->offline_disabled_reason =
-                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_DISCONNECTED);
-               dd_dev_info(dd, "%s: Port disconnected, disabling port\n",
-                           __func__);
-               goto bail;
-       case PORT_TYPE_FIXED:
-               /* platform_atten, remote_atten pre-zeroed to catch error */
-               get_platform_config_field(
-                       ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
-                       PORT_TABLE_LOCAL_ATTEN_25G, &platform_atten, 4);
-
-               get_platform_config_field(
-                       ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
-                       PORT_TABLE_REMOTE_ATTEN_25G, &remote_atten, 4);
-
-               total_atten = platform_atten + remote_atten;
-
-               tuning_method = OPA_PASSIVE_TUNING;
-               break;
-       case PORT_TYPE_VARIABLE:
-               if (qsfp_mod_present(ppd)) {
-                       /*
-                        * platform_atten, remote_atten pre-zeroed to
-                        * catch error
-                        */
-                       get_platform_config_field(
-                               ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
-                               PORT_TABLE_LOCAL_ATTEN_25G,
-                               &platform_atten, 4);
-
-                       get_platform_config_field(
-                               ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
-                               PORT_TABLE_REMOTE_ATTEN_25G,
-                               &remote_atten, 4);
-
-                       total_atten = platform_atten + remote_atten;
-
-                       tuning_method = OPA_PASSIVE_TUNING;
-               } else {
-                       ppd->offline_disabled_reason =
-                            HFI1_ODR_MASK(OPA_LINKDOWN_REASON_CHASSIS_CONFIG);
-                       goto bail;
-               }
-               break;
-       case PORT_TYPE_QSFP:
-               if (qsfp_mod_present(ppd)) {
-                       ret = acquire_chip_resource(ppd->dd,
-                                                   qsfp_resource(ppd->dd),
-                                                   QSFP_WAIT);
-                       if (ret) {
-                               dd_dev_err(ppd->dd, "%s: hfi%d: cannot lock i2c chain\n",
-                                          __func__, (int)ppd->dd->hfi1_id);
-                               goto bail;
-                       }
-                       refresh_qsfp_cache(ppd, &ppd->qsfp_info);
-
-                       if (ppd->qsfp_info.cache_valid) {
-                               ret = tune_qsfp(ppd,
-                                               &tx_preset_index,
-                                               &rx_preset_index,
-                                               &tuning_method,
-                                               &total_atten);
-
-                               /*
-                                * We may have modified the QSFP memory, so
-                                * update the cache to reflect the changes
-                                */
-                               refresh_qsfp_cache(ppd, &ppd->qsfp_info);
-                               limiting_active =
-                                               ppd->qsfp_info.limiting_active;
-                       } else {
-                               dd_dev_err(dd,
-                                          "%s: Reading QSFP memory failed\n",
-                                          __func__);
-                               ret = -EINVAL; /* a fail indication */
-                       }
-                       release_chip_resource(ppd->dd, qsfp_resource(ppd->dd));
-                       if (ret)
-                               goto bail;
-               } else {
-                       ppd->offline_disabled_reason =
-                          HFI1_ODR_MASK(
-                               OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED);
-                       goto bail;
-               }
-               break;
-       default:
-               dd_dev_info(ppd->dd, "%s: Unknown port type\n", __func__);
-               ppd->port_type = PORT_TYPE_UNKNOWN;
-               tuning_method = OPA_UNKNOWN_TUNING;
-               total_atten = 0;
-               limiting_active = 0;
-               tx_preset_index = OPA_INVALID_INDEX;
-               break;
-       }
-
-       if (ppd->offline_disabled_reason ==
-                       HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE))
-               apply_tunings(ppd, tx_preset_index, tuning_method,
-                             total_atten, limiting_active);
-
-       if (!ret)
-               ppd->driver_link_ready = 1;
-
-       return;
-bail:
-       ppd->driver_link_ready = 0;
-}
diff --git a/drivers/staging/rdma/hfi1/platform.h b/drivers/staging/rdma/hfi1/platform.h
deleted file mode 100644 (file)
index 19620cf..0000000
+++ /dev/null
@@ -1,304 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#ifndef __PLATFORM_H
-#define __PLATFORM_H
-
-#define METADATA_TABLE_FIELD_START_SHIFT               0
-#define METADATA_TABLE_FIELD_START_LEN_BITS            15
-#define METADATA_TABLE_FIELD_LEN_SHIFT                 16
-#define METADATA_TABLE_FIELD_LEN_LEN_BITS              16
-
-/* Header structure */
-#define PLATFORM_CONFIG_HEADER_RECORD_IDX_SHIFT                        0
-#define PLATFORM_CONFIG_HEADER_RECORD_IDX_LEN_BITS             6
-#define PLATFORM_CONFIG_HEADER_TABLE_LENGTH_SHIFT              16
-#define PLATFORM_CONFIG_HEADER_TABLE_LENGTH_LEN_BITS           12
-#define PLATFORM_CONFIG_HEADER_TABLE_TYPE_SHIFT                        28
-#define PLATFORM_CONFIG_HEADER_TABLE_TYPE_LEN_BITS             4
-
-enum platform_config_table_type_encoding {
-       PLATFORM_CONFIG_TABLE_RESERVED,
-       PLATFORM_CONFIG_SYSTEM_TABLE,
-       PLATFORM_CONFIG_PORT_TABLE,
-       PLATFORM_CONFIG_RX_PRESET_TABLE,
-       PLATFORM_CONFIG_TX_PRESET_TABLE,
-       PLATFORM_CONFIG_QSFP_ATTEN_TABLE,
-       PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE,
-       PLATFORM_CONFIG_TABLE_MAX
-};
-
-enum platform_config_system_table_fields {
-       SYSTEM_TABLE_RESERVED,
-       SYSTEM_TABLE_NODE_STRING,
-       SYSTEM_TABLE_SYSTEM_IMAGE_GUID,
-       SYSTEM_TABLE_NODE_GUID,
-       SYSTEM_TABLE_REVISION,
-       SYSTEM_TABLE_VENDOR_OUI,
-       SYSTEM_TABLE_META_VERSION,
-       SYSTEM_TABLE_DEVICE_ID,
-       SYSTEM_TABLE_PARTITION_ENFORCEMENT_CAP,
-       SYSTEM_TABLE_QSFP_POWER_CLASS_MAX,
-       SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_12G,
-       SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_25G,
-       SYSTEM_TABLE_VARIABLE_TABLE_ENTRIES_PER_PORT,
-       SYSTEM_TABLE_MAX
-};
-
-enum platform_config_port_table_fields {
-       PORT_TABLE_RESERVED,
-       PORT_TABLE_PORT_TYPE,
-       PORT_TABLE_LOCAL_ATTEN_12G,
-       PORT_TABLE_LOCAL_ATTEN_25G,
-       PORT_TABLE_LINK_SPEED_SUPPORTED,
-       PORT_TABLE_LINK_WIDTH_SUPPORTED,
-       PORT_TABLE_AUTO_LANE_SHEDDING_ENABLED,
-       PORT_TABLE_EXTERNAL_LOOPBACK_ALLOWED,
-       PORT_TABLE_VL_CAP,
-       PORT_TABLE_MTU_CAP,
-       PORT_TABLE_TX_LANE_ENABLE_MASK,
-       PORT_TABLE_LOCAL_MAX_TIMEOUT,
-       PORT_TABLE_REMOTE_ATTEN_12G,
-       PORT_TABLE_REMOTE_ATTEN_25G,
-       PORT_TABLE_TX_PRESET_IDX_ACTIVE_NO_EQ,
-       PORT_TABLE_TX_PRESET_IDX_ACTIVE_EQ,
-       PORT_TABLE_RX_PRESET_IDX,
-       PORT_TABLE_CABLE_REACH_CLASS,
-       PORT_TABLE_MAX
-};
-
-enum platform_config_rx_preset_table_fields {
-       RX_PRESET_TABLE_RESERVED,
-       RX_PRESET_TABLE_QSFP_RX_CDR_APPLY,
-       RX_PRESET_TABLE_QSFP_RX_EMP_APPLY,
-       RX_PRESET_TABLE_QSFP_RX_AMP_APPLY,
-       RX_PRESET_TABLE_QSFP_RX_CDR,
-       RX_PRESET_TABLE_QSFP_RX_EMP,
-       RX_PRESET_TABLE_QSFP_RX_AMP,
-       RX_PRESET_TABLE_MAX
-};
-
-enum platform_config_tx_preset_table_fields {
-       TX_PRESET_TABLE_RESERVED,
-       TX_PRESET_TABLE_PRECUR,
-       TX_PRESET_TABLE_ATTN,
-       TX_PRESET_TABLE_POSTCUR,
-       TX_PRESET_TABLE_QSFP_TX_CDR_APPLY,
-       TX_PRESET_TABLE_QSFP_TX_EQ_APPLY,
-       TX_PRESET_TABLE_QSFP_TX_CDR,
-       TX_PRESET_TABLE_QSFP_TX_EQ,
-       TX_PRESET_TABLE_MAX
-};
-
-enum platform_config_qsfp_attn_table_fields {
-       QSFP_ATTEN_TABLE_RESERVED,
-       QSFP_ATTEN_TABLE_TX_PRESET_IDX,
-       QSFP_ATTEN_TABLE_RX_PRESET_IDX,
-       QSFP_ATTEN_TABLE_MAX
-};
-
-enum platform_config_variable_settings_table_fields {
-       VARIABLE_SETTINGS_TABLE_RESERVED,
-       VARIABLE_SETTINGS_TABLE_TX_PRESET_IDX,
-       VARIABLE_SETTINGS_TABLE_RX_PRESET_IDX,
-       VARIABLE_SETTINGS_TABLE_MAX
-};
-
-struct platform_config {
-       size_t size;
-       const u8 *data;
-};
-
-struct platform_config_data {
-       u32 *table;
-       u32 *table_metadata;
-       u32 num_table;
-};
-
-/*
- * This struct acts as a quick reference into the platform_data binary image
- * and is populated by parse_platform_config(...) depending on the specific
- * META_VERSION
- */
-struct platform_config_cache {
-       u8  cache_valid;
-       struct platform_config_data config_tables[PLATFORM_CONFIG_TABLE_MAX];
-};
-
-static const u32 platform_config_table_limits[PLATFORM_CONFIG_TABLE_MAX] = {
-       0,
-       SYSTEM_TABLE_MAX,
-       PORT_TABLE_MAX,
-       RX_PRESET_TABLE_MAX,
-       TX_PRESET_TABLE_MAX,
-       QSFP_ATTEN_TABLE_MAX,
-       VARIABLE_SETTINGS_TABLE_MAX
-};
-
-/* This section defines default values and encodings for the
- * fields defined for each table above
- */
-
-/*
- * =====================================================
- *  System table encodings
- * =====================================================
- */
-#define PLATFORM_CONFIG_MAGIC_NUM              0x3d4f5041
-#define PLATFORM_CONFIG_MAGIC_NUMBER_LEN       4
-
-/*
- * These power classes are the same as defined in SFF 8636 spec rev 2.4
- * describing byte 129 in table 6-16, except enumerated in a different order
- */
-enum platform_config_qsfp_power_class_encoding {
-       QSFP_POWER_CLASS_1 = 1,
-       QSFP_POWER_CLASS_2,
-       QSFP_POWER_CLASS_3,
-       QSFP_POWER_CLASS_4,
-       QSFP_POWER_CLASS_5,
-       QSFP_POWER_CLASS_6,
-       QSFP_POWER_CLASS_7
-};
-
-/*
- * ====================================================
- *  Port table encodings
- * ====================================================
- */
-enum platform_config_port_type_encoding {
-       PORT_TYPE_UNKNOWN,
-       PORT_TYPE_DISCONNECTED,
-       PORT_TYPE_FIXED,
-       PORT_TYPE_VARIABLE,
-       PORT_TYPE_QSFP,
-       PORT_TYPE_MAX
-};
-
-enum platform_config_link_speed_supported_encoding {
-       LINK_SPEED_SUPP_12G = 1,
-       LINK_SPEED_SUPP_25G,
-       LINK_SPEED_SUPP_12G_25G,
-       LINK_SPEED_SUPP_MAX
-};
-
-/*
- * This is a subset (not strict) of the link downgrades
- * supported. The link downgrades supported are expected
- * to be supplied to the driver by another entity such as
- * the fabric manager
- */
-enum platform_config_link_width_supported_encoding {
-       LINK_WIDTH_SUPP_1X = 1,
-       LINK_WIDTH_SUPP_2X,
-       LINK_WIDTH_SUPP_2X_1X,
-       LINK_WIDTH_SUPP_3X,
-       LINK_WIDTH_SUPP_3X_1X,
-       LINK_WIDTH_SUPP_3X_2X,
-       LINK_WIDTH_SUPP_3X_2X_1X,
-       LINK_WIDTH_SUPP_4X,
-       LINK_WIDTH_SUPP_4X_1X,
-       LINK_WIDTH_SUPP_4X_2X,
-       LINK_WIDTH_SUPP_4X_2X_1X,
-       LINK_WIDTH_SUPP_4X_3X,
-       LINK_WIDTH_SUPP_4X_3X_1X,
-       LINK_WIDTH_SUPP_4X_3X_2X,
-       LINK_WIDTH_SUPP_4X_3X_2X_1X,
-       LINK_WIDTH_SUPP_MAX
-};
-
-enum platform_config_virtual_lane_capability_encoding {
-       VL_CAP_VL0 = 1,
-       VL_CAP_VL0_1,
-       VL_CAP_VL0_2,
-       VL_CAP_VL0_3,
-       VL_CAP_VL0_4,
-       VL_CAP_VL0_5,
-       VL_CAP_VL0_6,
-       VL_CAP_VL0_7,
-       VL_CAP_VL0_8,
-       VL_CAP_VL0_9,
-       VL_CAP_VL0_10,
-       VL_CAP_VL0_11,
-       VL_CAP_VL0_12,
-       VL_CAP_VL0_13,
-       VL_CAP_VL0_14,
-       VL_CAP_MAX
-};
-
-/* Max MTU */
-enum platform_config_mtu_capability_encoding {
-       MTU_CAP_256   = 1,
-       MTU_CAP_512   = 2,
-       MTU_CAP_1024  = 3,
-       MTU_CAP_2048  = 4,
-       MTU_CAP_4096  = 5,
-       MTU_CAP_8192  = 6,
-       MTU_CAP_10240 = 7
-};
-
-enum platform_config_local_max_timeout_encoding {
-       LOCAL_MAX_TIMEOUT_10_MS = 1,
-       LOCAL_MAX_TIMEOUT_100_MS,
-       LOCAL_MAX_TIMEOUT_1_S,
-       LOCAL_MAX_TIMEOUT_10_S,
-       LOCAL_MAX_TIMEOUT_100_S,
-       LOCAL_MAX_TIMEOUT_1000_S
-};
-
-enum link_tuning_encoding {
-       OPA_PASSIVE_TUNING,
-       OPA_ACTIVE_TUNING,
-       OPA_UNKNOWN_TUNING
-};
-
-/* platform.c */
-void get_platform_config(struct hfi1_devdata *dd);
-void free_platform_config(struct hfi1_devdata *dd);
-int set_qsfp_tx(struct hfi1_pportdata *ppd, int on);
-void tune_serdes(struct hfi1_pportdata *ppd);
-
-#endif                 /*__PLATFORM_H*/
diff --git a/drivers/staging/rdma/hfi1/qp.c b/drivers/staging/rdma/hfi1/qp.c
deleted file mode 100644 (file)
index 91eb423..0000000
+++ /dev/null
@@ -1,977 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/err.h>
-#include <linux/vmalloc.h>
-#include <linux/hash.h>
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/seq_file.h>
-#include <rdma/rdma_vt.h>
-#include <rdma/rdmavt_qp.h>
-
-#include "hfi.h"
-#include "qp.h"
-#include "trace.h"
-#include "verbs_txreq.h"
-
-unsigned int hfi1_qp_table_size = 256;
-module_param_named(qp_table_size, hfi1_qp_table_size, uint, S_IRUGO);
-MODULE_PARM_DESC(qp_table_size, "QP table size");
-
-static void flush_tx_list(struct rvt_qp *qp);
-static int iowait_sleep(
-       struct sdma_engine *sde,
-       struct iowait *wait,
-       struct sdma_txreq *stx,
-       unsigned seq);
-static void iowait_wakeup(struct iowait *wait, int reason);
-static void iowait_sdma_drained(struct iowait *wait);
-static void qp_pio_drain(struct rvt_qp *qp);
-
-static inline unsigned mk_qpn(struct rvt_qpn_table *qpt,
-                             struct rvt_qpn_map *map, unsigned off)
-{
-       return (map - qpt->map) * RVT_BITS_PER_PAGE + off;
-}
-
-/*
- * Convert the AETH credit code into the number of credits.
- */
-static const u16 credit_table[31] = {
-       0,                      /* 0 */
-       1,                      /* 1 */
-       2,                      /* 2 */
-       3,                      /* 3 */
-       4,                      /* 4 */
-       6,                      /* 5 */
-       8,                      /* 6 */
-       12,                     /* 7 */
-       16,                     /* 8 */
-       24,                     /* 9 */
-       32,                     /* A */
-       48,                     /* B */
-       64,                     /* C */
-       96,                     /* D */
-       128,                    /* E */
-       192,                    /* F */
-       256,                    /* 10 */
-       384,                    /* 11 */
-       512,                    /* 12 */
-       768,                    /* 13 */
-       1024,                   /* 14 */
-       1536,                   /* 15 */
-       2048,                   /* 16 */
-       3072,                   /* 17 */
-       4096,                   /* 18 */
-       6144,                   /* 19 */
-       8192,                   /* 1A */
-       12288,                  /* 1B */
-       16384,                  /* 1C */
-       24576,                  /* 1D */
-       32768                   /* 1E */
-};
-
-static void flush_tx_list(struct rvt_qp *qp)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       while (!list_empty(&priv->s_iowait.tx_head)) {
-               struct sdma_txreq *tx;
-
-               tx = list_first_entry(
-                       &priv->s_iowait.tx_head,
-                       struct sdma_txreq,
-                       list);
-               list_del_init(&tx->list);
-               hfi1_put_txreq(
-                       container_of(tx, struct verbs_txreq, txreq));
-       }
-}
-
-static void flush_iowait(struct rvt_qp *qp)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-       struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
-       unsigned long flags;
-
-       write_seqlock_irqsave(&dev->iowait_lock, flags);
-       if (!list_empty(&priv->s_iowait.list)) {
-               list_del_init(&priv->s_iowait.list);
-               if (atomic_dec_and_test(&qp->refcount))
-                       wake_up(&qp->wait);
-       }
-       write_sequnlock_irqrestore(&dev->iowait_lock, flags);
-}
-
-static inline int opa_mtu_enum_to_int(int mtu)
-{
-       switch (mtu) {
-       case OPA_MTU_8192:  return 8192;
-       case OPA_MTU_10240: return 10240;
-       default:            return -1;
-       }
-}
-
-/**
- * This function is what we would push to the core layer if we wanted to be a
- * "first class citizen".  Instead we hide this here and rely on Verbs ULPs
- * to blindly pass the MTU enum value from the PathRecord to us.
- *
- * The actual flag used to determine "8k MTU" will change and is currently
- * unknown.
- */
-static inline int verbs_mtu_enum_to_int(struct ib_device *dev, enum ib_mtu mtu)
-{
-       int val;
-
-       /* Constraining 10KB packets to 8KB packets */
-       if (mtu == (enum ib_mtu)OPA_MTU_10240)
-               mtu = OPA_MTU_8192;
-       val = opa_mtu_enum_to_int((int)mtu);
-       if (val > 0)
-               return val;
-       return ib_mtu_enum_to_int(mtu);
-}
-
-int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
-                        int attr_mask, struct ib_udata *udata)
-{
-       struct ib_qp *ibqp = &qp->ibqp;
-       struct hfi1_ibdev *dev = to_idev(ibqp->device);
-       struct hfi1_devdata *dd = dd_from_dev(dev);
-       u8 sc;
-
-       if (attr_mask & IB_QP_AV) {
-               sc = ah_to_sc(ibqp->device, &attr->ah_attr);
-               if (sc == 0xf)
-                       return -EINVAL;
-
-               if (!qp_to_sdma_engine(qp, sc) &&
-                   dd->flags & HFI1_HAS_SEND_DMA)
-                       return -EINVAL;
-
-               if (!qp_to_send_context(qp, sc))
-                       return -EINVAL;
-       }
-
-       if (attr_mask & IB_QP_ALT_PATH) {
-               sc = ah_to_sc(ibqp->device, &attr->alt_ah_attr);
-               if (sc == 0xf)
-                       return -EINVAL;
-
-               if (!qp_to_sdma_engine(qp, sc) &&
-                   dd->flags & HFI1_HAS_SEND_DMA)
-                       return -EINVAL;
-
-               if (!qp_to_send_context(qp, sc))
-                       return -EINVAL;
-       }
-
-       return 0;
-}
-
-void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
-                   int attr_mask, struct ib_udata *udata)
-{
-       struct ib_qp *ibqp = &qp->ibqp;
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       if (attr_mask & IB_QP_AV) {
-               priv->s_sc = ah_to_sc(ibqp->device, &qp->remote_ah_attr);
-               priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc);
-               priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc);
-       }
-
-       if (attr_mask & IB_QP_PATH_MIG_STATE &&
-           attr->path_mig_state == IB_MIG_MIGRATED &&
-           qp->s_mig_state == IB_MIG_ARMED) {
-               qp->s_flags |= RVT_S_AHG_CLEAR;
-               priv->s_sc = ah_to_sc(ibqp->device, &qp->remote_ah_attr);
-               priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc);
-               priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc);
-       }
-}
-
-/**
- * hfi1_check_send_wqe - validate wqe
- * @qp - The qp
- * @wqe - The built wqe
- *
- * validate wqe.  This is called
- * prior to inserting the wqe into
- * the ring but after the wqe has been
- * setup.
- *
- * Returns 0 on success, -EINVAL on failure
- *
- */
-int hfi1_check_send_wqe(struct rvt_qp *qp,
-                       struct rvt_swqe *wqe)
-{
-       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
-       struct rvt_ah *ah;
-
-       switch (qp->ibqp.qp_type) {
-       case IB_QPT_RC:
-       case IB_QPT_UC:
-               if (wqe->length > 0x80000000U)
-                       return -EINVAL;
-               break;
-       case IB_QPT_SMI:
-               ah = ibah_to_rvtah(wqe->ud_wr.ah);
-               if (wqe->length > (1 << ah->log_pmtu))
-                       return -EINVAL;
-               break;
-       case IB_QPT_GSI:
-       case IB_QPT_UD:
-               ah = ibah_to_rvtah(wqe->ud_wr.ah);
-               if (wqe->length > (1 << ah->log_pmtu))
-                       return -EINVAL;
-               if (ibp->sl_to_sc[ah->attr.sl] == 0xf)
-                       return -EINVAL;
-       default:
-               break;
-       }
-       return wqe->length <= piothreshold;
-}
-
-/**
- * hfi1_compute_aeth - compute the AETH (syndrome + MSN)
- * @qp: the queue pair to compute the AETH for
- *
- * Returns the AETH.
- */
-__be32 hfi1_compute_aeth(struct rvt_qp *qp)
-{
-       u32 aeth = qp->r_msn & HFI1_MSN_MASK;
-
-       if (qp->ibqp.srq) {
-               /*
-                * Shared receive queues don't generate credits.
-                * Set the credit field to the invalid value.
-                */
-               aeth |= HFI1_AETH_CREDIT_INVAL << HFI1_AETH_CREDIT_SHIFT;
-       } else {
-               u32 min, max, x;
-               u32 credits;
-               struct rvt_rwq *wq = qp->r_rq.wq;
-               u32 head;
-               u32 tail;
-
-               /* sanity check pointers before trusting them */
-               head = wq->head;
-               if (head >= qp->r_rq.size)
-                       head = 0;
-               tail = wq->tail;
-               if (tail >= qp->r_rq.size)
-                       tail = 0;
-               /*
-                * Compute the number of credits available (RWQEs).
-                * There is a small chance that the pair of reads are
-                * not atomic, which is OK, since the fuzziness is
-                * resolved as further ACKs go out.
-                */
-               credits = head - tail;
-               if ((int)credits < 0)
-                       credits += qp->r_rq.size;
-               /*
-                * Binary search the credit table to find the code to
-                * use.
-                */
-               min = 0;
-               max = 31;
-               for (;;) {
-                       x = (min + max) / 2;
-                       if (credit_table[x] == credits)
-                               break;
-                       if (credit_table[x] > credits) {
-                               max = x;
-                       } else {
-                               if (min == x)
-                                       break;
-                               min = x;
-                       }
-               }
-               aeth |= x << HFI1_AETH_CREDIT_SHIFT;
-       }
-       return cpu_to_be32(aeth);
-}
-
-/**
- * _hfi1_schedule_send - schedule progress
- * @qp: the QP
- *
- * This schedules qp progress w/o regard to the s_flags.
- *
- * It is only used in the post send, which doesn't hold
- * the s_lock.
- */
-void _hfi1_schedule_send(struct rvt_qp *qp)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-       struct hfi1_ibport *ibp =
-               to_iport(qp->ibqp.device, qp->port_num);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
-
-       iowait_schedule(&priv->s_iowait, ppd->hfi1_wq,
-                       priv->s_sde ?
-                       priv->s_sde->cpu :
-                       cpumask_first(cpumask_of_node(dd->node)));
-}
-
-static void qp_pio_drain(struct rvt_qp *qp)
-{
-       struct hfi1_ibdev *dev;
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       if (!priv->s_sendcontext)
-               return;
-       dev = to_idev(qp->ibqp.device);
-       while (iowait_pio_pending(&priv->s_iowait)) {
-               write_seqlock_irq(&dev->iowait_lock);
-               hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 1);
-               write_sequnlock_irq(&dev->iowait_lock);
-               iowait_pio_drain(&priv->s_iowait);
-               write_seqlock_irq(&dev->iowait_lock);
-               hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 0);
-               write_sequnlock_irq(&dev->iowait_lock);
-       }
-}
-
-/**
- * hfi1_schedule_send - schedule progress
- * @qp: the QP
- *
- * This schedules qp progress and caller should hold
- * the s_lock.
- */
-void hfi1_schedule_send(struct rvt_qp *qp)
-{
-       if (hfi1_send_ok(qp))
-               _hfi1_schedule_send(qp);
-}
-
-/**
- * hfi1_get_credit - flush the send work queue of a QP
- * @qp: the qp who's send work queue to flush
- * @aeth: the Acknowledge Extended Transport Header
- *
- * The QP s_lock should be held.
- */
-void hfi1_get_credit(struct rvt_qp *qp, u32 aeth)
-{
-       u32 credit = (aeth >> HFI1_AETH_CREDIT_SHIFT) & HFI1_AETH_CREDIT_MASK;
-
-       /*
-        * If the credit is invalid, we can send
-        * as many packets as we like.  Otherwise, we have to
-        * honor the credit field.
-        */
-       if (credit == HFI1_AETH_CREDIT_INVAL) {
-               if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) {
-                       qp->s_flags |= RVT_S_UNLIMITED_CREDIT;
-                       if (qp->s_flags & RVT_S_WAIT_SSN_CREDIT) {
-                               qp->s_flags &= ~RVT_S_WAIT_SSN_CREDIT;
-                               hfi1_schedule_send(qp);
-                       }
-               }
-       } else if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) {
-               /* Compute new LSN (i.e., MSN + credit) */
-               credit = (aeth + credit_table[credit]) & HFI1_MSN_MASK;
-               if (cmp_msn(credit, qp->s_lsn) > 0) {
-                       qp->s_lsn = credit;
-                       if (qp->s_flags & RVT_S_WAIT_SSN_CREDIT) {
-                               qp->s_flags &= ~RVT_S_WAIT_SSN_CREDIT;
-                               hfi1_schedule_send(qp);
-                       }
-               }
-       }
-}
-
-void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-       if (qp->s_flags & flag) {
-               qp->s_flags &= ~flag;
-               trace_hfi1_qpwakeup(qp, flag);
-               hfi1_schedule_send(qp);
-       }
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-       /* Notify hfi1_destroy_qp() if it is waiting. */
-       if (atomic_dec_and_test(&qp->refcount))
-               wake_up(&qp->wait);
-}
-
-static int iowait_sleep(
-       struct sdma_engine *sde,
-       struct iowait *wait,
-       struct sdma_txreq *stx,
-       unsigned seq)
-{
-       struct verbs_txreq *tx = container_of(stx, struct verbs_txreq, txreq);
-       struct rvt_qp *qp;
-       struct hfi1_qp_priv *priv;
-       unsigned long flags;
-       int ret = 0;
-       struct hfi1_ibdev *dev;
-
-       qp = tx->qp;
-       priv = qp->priv;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-       if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
-               /*
-                * If we couldn't queue the DMA request, save the info
-                * and try again later rather than destroying the
-                * buffer and undoing the side effects of the copy.
-                */
-               /* Make a common routine? */
-               dev = &sde->dd->verbs_dev;
-               list_add_tail(&stx->list, &wait->tx_head);
-               write_seqlock(&dev->iowait_lock);
-               if (sdma_progress(sde, seq, stx))
-                       goto eagain;
-               if (list_empty(&priv->s_iowait.list)) {
-                       struct hfi1_ibport *ibp =
-                               to_iport(qp->ibqp.device, qp->port_num);
-
-                       ibp->rvp.n_dmawait++;
-                       qp->s_flags |= RVT_S_WAIT_DMA_DESC;
-                       list_add_tail(&priv->s_iowait.list, &sde->dmawait);
-                       trace_hfi1_qpsleep(qp, RVT_S_WAIT_DMA_DESC);
-                       atomic_inc(&qp->refcount);
-               }
-               write_sequnlock(&dev->iowait_lock);
-               qp->s_flags &= ~RVT_S_BUSY;
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-               ret = -EBUSY;
-       } else {
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-               hfi1_put_txreq(tx);
-       }
-       return ret;
-eagain:
-       write_sequnlock(&dev->iowait_lock);
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-       list_del_init(&stx->list);
-       return -EAGAIN;
-}
-
-static void iowait_wakeup(struct iowait *wait, int reason)
-{
-       struct rvt_qp *qp = iowait_to_qp(wait);
-
-       WARN_ON(reason != SDMA_AVAIL_REASON);
-       hfi1_qp_wakeup(qp, RVT_S_WAIT_DMA_DESC);
-}
-
-static void iowait_sdma_drained(struct iowait *wait)
-{
-       struct rvt_qp *qp = iowait_to_qp(wait);
-
-       /*
-        * This happens when the send engine notes
-        * a QP in the error state and cannot
-        * do the flush work until that QP's
-        * sdma work has finished.
-        */
-       spin_lock(&qp->s_lock);
-       if (qp->s_flags & RVT_S_WAIT_DMA) {
-               qp->s_flags &= ~RVT_S_WAIT_DMA;
-               hfi1_schedule_send(qp);
-       }
-       spin_unlock(&qp->s_lock);
-}
-
-/**
- *
- * qp_to_sdma_engine - map a qp to a send engine
- * @qp: the QP
- * @sc5: the 5 bit sc
- *
- * Return:
- * A send engine for the qp or NULL for SMI type qp.
- */
-struct sdma_engine *qp_to_sdma_engine(struct rvt_qp *qp, u8 sc5)
-{
-       struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
-       struct sdma_engine *sde;
-
-       if (!(dd->flags & HFI1_HAS_SEND_DMA))
-               return NULL;
-       switch (qp->ibqp.qp_type) {
-       case IB_QPT_SMI:
-               return NULL;
-       default:
-               break;
-       }
-       sde = sdma_select_engine_sc(dd, qp->ibqp.qp_num >> dd->qos_shift, sc5);
-       return sde;
-}
-
-/*
- * qp_to_send_context - map a qp to a send context
- * @qp: the QP
- * @sc5: the 5 bit sc
- *
- * Return:
- * A send context for the qp
- */
-struct send_context *qp_to_send_context(struct rvt_qp *qp, u8 sc5)
-{
-       struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
-
-       switch (qp->ibqp.qp_type) {
-       case IB_QPT_SMI:
-               /* SMA packets to VL15 */
-               return dd->vld[15].sc;
-       default:
-               break;
-       }
-
-       return pio_select_send_context_sc(dd, qp->ibqp.qp_num >> dd->qos_shift,
-                                         sc5);
-}
-
-struct qp_iter {
-       struct hfi1_ibdev *dev;
-       struct rvt_qp *qp;
-       int specials;
-       int n;
-};
-
-struct qp_iter *qp_iter_init(struct hfi1_ibdev *dev)
-{
-       struct qp_iter *iter;
-
-       iter = kzalloc(sizeof(*iter), GFP_KERNEL);
-       if (!iter)
-               return NULL;
-
-       iter->dev = dev;
-       iter->specials = dev->rdi.ibdev.phys_port_cnt * 2;
-       if (qp_iter_next(iter)) {
-               kfree(iter);
-               return NULL;
-       }
-
-       return iter;
-}
-
-int qp_iter_next(struct qp_iter *iter)
-{
-       struct hfi1_ibdev *dev = iter->dev;
-       int n = iter->n;
-       int ret = 1;
-       struct rvt_qp *pqp = iter->qp;
-       struct rvt_qp *qp;
-
-       /*
-        * The approach is to consider the special qps
-        * as an additional table entries before the
-        * real hash table.  Since the qp code sets
-        * the qp->next hash link to NULL, this works just fine.
-        *
-        * iter->specials is 2 * # ports
-        *
-        * n = 0..iter->specials is the special qp indices
-        *
-        * n = iter->specials..dev->rdi.qp_dev->qp_table_size+iter->specials are
-        * the potential hash bucket entries
-        *
-        */
-       for (; n <  dev->rdi.qp_dev->qp_table_size + iter->specials; n++) {
-               if (pqp) {
-                       qp = rcu_dereference(pqp->next);
-               } else {
-                       if (n < iter->specials) {
-                               struct hfi1_pportdata *ppd;
-                               struct hfi1_ibport *ibp;
-                               int pidx;
-
-                               pidx = n % dev->rdi.ibdev.phys_port_cnt;
-                               ppd = &dd_from_dev(dev)->pport[pidx];
-                               ibp = &ppd->ibport_data;
-
-                               if (!(n & 1))
-                                       qp = rcu_dereference(ibp->rvp.qp[0]);
-                               else
-                                       qp = rcu_dereference(ibp->rvp.qp[1]);
-                       } else {
-                               qp = rcu_dereference(
-                                       dev->rdi.qp_dev->qp_table[
-                                               (n - iter->specials)]);
-                       }
-               }
-               pqp = qp;
-               if (qp) {
-                       iter->qp = qp;
-                       iter->n = n;
-                       return 0;
-               }
-       }
-       return ret;
-}
-
-static const char * const qp_type_str[] = {
-       "SMI", "GSI", "RC", "UC", "UD",
-};
-
-static int qp_idle(struct rvt_qp *qp)
-{
-       return
-               qp->s_last == qp->s_acked &&
-               qp->s_acked == qp->s_cur &&
-               qp->s_cur == qp->s_tail &&
-               qp->s_tail == qp->s_head;
-}
-
-void qp_iter_print(struct seq_file *s, struct qp_iter *iter)
-{
-       struct rvt_swqe *wqe;
-       struct rvt_qp *qp = iter->qp;
-       struct hfi1_qp_priv *priv = qp->priv;
-       struct sdma_engine *sde;
-       struct send_context *send_context;
-
-       sde = qp_to_sdma_engine(qp, priv->s_sc);
-       wqe = rvt_get_swqe_ptr(qp, qp->s_last);
-       send_context = qp_to_send_context(qp, priv->s_sc);
-       seq_printf(s,
-                  "N %d %s QP %x R %u %s %u %u %u f=%x %u %u %u %u %u %u PSN %x %x %x %x %x (%u %u %u %u %u %u %u) RQP %x LID %x SL %u MTU %u %u %u %u SDE %p,%u SC %p,%u SCQ %u %u PID %d\n",
-                  iter->n,
-                  qp_idle(qp) ? "I" : "B",
-                  qp->ibqp.qp_num,
-                  atomic_read(&qp->refcount),
-                  qp_type_str[qp->ibqp.qp_type],
-                  qp->state,
-                  wqe ? wqe->wr.opcode : 0,
-                  qp->s_hdrwords,
-                  qp->s_flags,
-                  iowait_sdma_pending(&priv->s_iowait),
-                  iowait_pio_pending(&priv->s_iowait),
-                  !list_empty(&priv->s_iowait.list),
-                  qp->timeout,
-                  wqe ? wqe->ssn : 0,
-                  qp->s_lsn,
-                  qp->s_last_psn,
-                  qp->s_psn, qp->s_next_psn,
-                  qp->s_sending_psn, qp->s_sending_hpsn,
-                  qp->s_last, qp->s_acked, qp->s_cur,
-                  qp->s_tail, qp->s_head, qp->s_size,
-                  qp->s_avail,
-                  qp->remote_qpn,
-                  qp->remote_ah_attr.dlid,
-                  qp->remote_ah_attr.sl,
-                  qp->pmtu,
-                  qp->s_retry,
-                  qp->s_retry_cnt,
-                  qp->s_rnr_retry_cnt,
-                  sde,
-                  sde ? sde->this_idx : 0,
-                  send_context,
-                  send_context ? send_context->sw_index : 0,
-                  ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->head,
-                  ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->tail,
-                  qp->pid);
-}
-
-void qp_comm_est(struct rvt_qp *qp)
-{
-       qp->r_flags |= RVT_R_COMM_EST;
-       if (qp->ibqp.event_handler) {
-               struct ib_event ev;
-
-               ev.device = qp->ibqp.device;
-               ev.element.qp = &qp->ibqp;
-               ev.event = IB_EVENT_COMM_EST;
-               qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
-       }
-}
-
-void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp,
-                   gfp_t gfp)
-{
-       struct hfi1_qp_priv *priv;
-
-       priv = kzalloc_node(sizeof(*priv), gfp, rdi->dparms.node);
-       if (!priv)
-               return ERR_PTR(-ENOMEM);
-
-       priv->owner = qp;
-
-       priv->s_hdr = kzalloc_node(sizeof(*priv->s_hdr), gfp, rdi->dparms.node);
-       if (!priv->s_hdr) {
-               kfree(priv);
-               return ERR_PTR(-ENOMEM);
-       }
-       setup_timer(&priv->s_rnr_timer, hfi1_rc_rnr_retry, (unsigned long)qp);
-       qp->s_timer.function = hfi1_rc_timeout;
-       return priv;
-}
-
-void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       kfree(priv->s_hdr);
-       kfree(priv);
-}
-
-unsigned free_all_qps(struct rvt_dev_info *rdi)
-{
-       struct hfi1_ibdev *verbs_dev = container_of(rdi,
-                                                   struct hfi1_ibdev,
-                                                   rdi);
-       struct hfi1_devdata *dd = container_of(verbs_dev,
-                                              struct hfi1_devdata,
-                                              verbs_dev);
-       int n;
-       unsigned qp_inuse = 0;
-
-       for (n = 0; n < dd->num_pports; n++) {
-               struct hfi1_ibport *ibp = &dd->pport[n].ibport_data;
-
-               rcu_read_lock();
-               if (rcu_dereference(ibp->rvp.qp[0]))
-                       qp_inuse++;
-               if (rcu_dereference(ibp->rvp.qp[1]))
-                       qp_inuse++;
-               rcu_read_unlock();
-       }
-
-       return qp_inuse;
-}
-
-void flush_qp_waiters(struct rvt_qp *qp)
-{
-       flush_iowait(qp);
-       hfi1_stop_rc_timers(qp);
-}
-
-void stop_send_queue(struct rvt_qp *qp)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       cancel_work_sync(&priv->s_iowait.iowork);
-       hfi1_del_timers_sync(qp);
-}
-
-void quiesce_qp(struct rvt_qp *qp)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       iowait_sdma_drain(&priv->s_iowait);
-       qp_pio_drain(qp);
-       flush_tx_list(qp);
-}
-
-void notify_qp_reset(struct rvt_qp *qp)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       iowait_init(
-               &priv->s_iowait,
-               1,
-               _hfi1_do_send,
-               iowait_sleep,
-               iowait_wakeup,
-               iowait_sdma_drained);
-       priv->r_adefered = 0;
-       clear_ahg(qp);
-}
-
-/*
- * Switch to alternate path.
- * The QP s_lock should be held and interrupts disabled.
- */
-void hfi1_migrate_qp(struct rvt_qp *qp)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-       struct ib_event ev;
-
-       qp->s_mig_state = IB_MIG_MIGRATED;
-       qp->remote_ah_attr = qp->alt_ah_attr;
-       qp->port_num = qp->alt_ah_attr.port_num;
-       qp->s_pkey_index = qp->s_alt_pkey_index;
-       qp->s_flags |= RVT_S_AHG_CLEAR;
-       priv->s_sc = ah_to_sc(qp->ibqp.device, &qp->remote_ah_attr);
-       priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc);
-
-       ev.device = qp->ibqp.device;
-       ev.element.qp = &qp->ibqp;
-       ev.event = IB_EVENT_PATH_MIG;
-       qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
-}
-
-int mtu_to_path_mtu(u32 mtu)
-{
-       return mtu_to_enum(mtu, OPA_MTU_8192);
-}
-
-u32 mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu)
-{
-       u32 mtu;
-       struct hfi1_ibdev *verbs_dev = container_of(rdi,
-                                                   struct hfi1_ibdev,
-                                                   rdi);
-       struct hfi1_devdata *dd = container_of(verbs_dev,
-                                              struct hfi1_devdata,
-                                              verbs_dev);
-       struct hfi1_ibport *ibp;
-       u8 sc, vl;
-
-       ibp = &dd->pport[qp->port_num - 1].ibport_data;
-       sc = ibp->sl_to_sc[qp->remote_ah_attr.sl];
-       vl = sc_to_vlt(dd, sc);
-
-       mtu = verbs_mtu_enum_to_int(qp->ibqp.device, pmtu);
-       if (vl < PER_VL_SEND_CONTEXTS)
-               mtu = min_t(u32, mtu, dd->vld[vl].mtu);
-       return mtu;
-}
-
-int get_pmtu_from_attr(struct rvt_dev_info *rdi, struct rvt_qp *qp,
-                      struct ib_qp_attr *attr)
-{
-       int mtu, pidx = qp->port_num - 1;
-       struct hfi1_ibdev *verbs_dev = container_of(rdi,
-                                                   struct hfi1_ibdev,
-                                                   rdi);
-       struct hfi1_devdata *dd = container_of(verbs_dev,
-                                              struct hfi1_devdata,
-                                              verbs_dev);
-       mtu = verbs_mtu_enum_to_int(qp->ibqp.device, attr->path_mtu);
-       if (mtu == -1)
-               return -1; /* values less than 0 are error */
-
-       if (mtu > dd->pport[pidx].ibmtu)
-               return mtu_to_enum(dd->pport[pidx].ibmtu, IB_MTU_2048);
-       else
-               return attr->path_mtu;
-}
-
-void notify_error_qp(struct rvt_qp *qp)
-{
-       struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       write_seqlock(&dev->iowait_lock);
-       if (!list_empty(&priv->s_iowait.list) && !(qp->s_flags & RVT_S_BUSY)) {
-               qp->s_flags &= ~RVT_S_ANY_WAIT_IO;
-               list_del_init(&priv->s_iowait.list);
-               if (atomic_dec_and_test(&qp->refcount))
-                       wake_up(&qp->wait);
-       }
-       write_sequnlock(&dev->iowait_lock);
-
-       if (!(qp->s_flags & RVT_S_BUSY)) {
-               qp->s_hdrwords = 0;
-               if (qp->s_rdma_mr) {
-                       rvt_put_mr(qp->s_rdma_mr);
-                       qp->s_rdma_mr = NULL;
-               }
-               flush_tx_list(qp);
-       }
-}
-
-/**
- * hfi1_error_port_qps - put a port's RC/UC qps into error state
- * @ibp: the ibport.
- * @sl: the service level.
- *
- * This function places all RC/UC qps with a given service level into error
- * state. It is generally called to force upper lay apps to abandon stale qps
- * after an sl->sc mapping change.
- */
-void hfi1_error_port_qps(struct hfi1_ibport *ibp, u8 sl)
-{
-       struct rvt_qp *qp = NULL;
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       struct hfi1_ibdev *dev = &ppd->dd->verbs_dev;
-       int n;
-       int lastwqe;
-       struct ib_event ev;
-
-       rcu_read_lock();
-
-       /* Deal only with RC/UC qps that use the given SL. */
-       for (n = 0; n < dev->rdi.qp_dev->qp_table_size; n++) {
-               for (qp = rcu_dereference(dev->rdi.qp_dev->qp_table[n]); qp;
-                       qp = rcu_dereference(qp->next)) {
-                       if (qp->port_num == ppd->port &&
-                           (qp->ibqp.qp_type == IB_QPT_UC ||
-                            qp->ibqp.qp_type == IB_QPT_RC) &&
-                           qp->remote_ah_attr.sl == sl &&
-                           (ib_rvt_state_ops[qp->state] &
-                            RVT_POST_SEND_OK)) {
-                               spin_lock_irq(&qp->r_lock);
-                               spin_lock(&qp->s_hlock);
-                               spin_lock(&qp->s_lock);
-                               lastwqe = rvt_error_qp(qp,
-                                                      IB_WC_WR_FLUSH_ERR);
-                               spin_unlock(&qp->s_lock);
-                               spin_unlock(&qp->s_hlock);
-                               spin_unlock_irq(&qp->r_lock);
-                               if (lastwqe) {
-                                       ev.device = qp->ibqp.device;
-                                       ev.element.qp = &qp->ibqp;
-                                       ev.event =
-                                               IB_EVENT_QP_LAST_WQE_REACHED;
-                                       qp->ibqp.event_handler(&ev,
-                                               qp->ibqp.qp_context);
-                               }
-                       }
-               }
-       }
-
-       rcu_read_unlock();
-}
diff --git a/drivers/staging/rdma/hfi1/qp.h b/drivers/staging/rdma/hfi1/qp.h
deleted file mode 100644 (file)
index e7bc8d6..0000000
+++ /dev/null
@@ -1,160 +0,0 @@
-#ifndef _QP_H
-#define _QP_H
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/hash.h>
-#include <rdma/rdmavt_qp.h>
-#include "verbs.h"
-#include "sdma.h"
-
-extern unsigned int hfi1_qp_table_size;
-
-/*
- * free_ahg - clear ahg from QP
- */
-static inline void clear_ahg(struct rvt_qp *qp)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       priv->s_hdr->ahgcount = 0;
-       qp->s_flags &= ~(RVT_S_AHG_VALID | RVT_S_AHG_CLEAR);
-       if (priv->s_sde && qp->s_ahgidx >= 0)
-               sdma_ahg_free(priv->s_sde, qp->s_ahgidx);
-       qp->s_ahgidx = -1;
-}
-
-/**
- * hfi1_compute_aeth - compute the AETH (syndrome + MSN)
- * @qp: the queue pair to compute the AETH for
- *
- * Returns the AETH.
- */
-__be32 hfi1_compute_aeth(struct rvt_qp *qp);
-
-/**
- * hfi1_create_qp - create a queue pair for a device
- * @ibpd: the protection domain who's device we create the queue pair for
- * @init_attr: the attributes of the queue pair
- * @udata: user data for libibverbs.so
- *
- * Returns the queue pair on success, otherwise returns an errno.
- *
- * Called by the ib_create_qp() core verbs function.
- */
-struct ib_qp *hfi1_create_qp(struct ib_pd *ibpd,
-                            struct ib_qp_init_attr *init_attr,
-                            struct ib_udata *udata);
-/**
- * hfi1_get_credit - flush the send work queue of a QP
- * @qp: the qp who's send work queue to flush
- * @aeth: the Acknowledge Extended Transport Header
- *
- * The QP s_lock should be held.
- */
-void hfi1_get_credit(struct rvt_qp *qp, u32 aeth);
-
-/**
- * hfi1_qp_wakeup - wake up on the indicated event
- * @qp: the QP
- * @flag: flag the qp on which the qp is stalled
- */
-void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag);
-
-struct sdma_engine *qp_to_sdma_engine(struct rvt_qp *qp, u8 sc5);
-struct send_context *qp_to_send_context(struct rvt_qp *qp, u8 sc5);
-
-struct qp_iter;
-
-/**
- * qp_iter_init - initialize the iterator for the qp hash list
- * @dev: the hfi1_ibdev
- */
-struct qp_iter *qp_iter_init(struct hfi1_ibdev *dev);
-
-/**
- * qp_iter_next - Find the next qp in the hash list
- * @iter: the iterator for the qp hash list
- */
-int qp_iter_next(struct qp_iter *iter);
-
-/**
- * qp_iter_print - print the qp information to seq_file
- * @s: the seq_file to emit the qp information on
- * @iter: the iterator for the qp hash list
- */
-void qp_iter_print(struct seq_file *s, struct qp_iter *iter);
-
-/**
- * qp_comm_est - handle trap with QP established
- * @qp: the QP
- */
-void qp_comm_est(struct rvt_qp *qp);
-
-void _hfi1_schedule_send(struct rvt_qp *qp);
-void hfi1_schedule_send(struct rvt_qp *qp);
-
-void hfi1_migrate_qp(struct rvt_qp *qp);
-
-/*
- * Functions provided by hfi1 driver for rdmavt to use
- */
-void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp,
-                   gfp_t gfp);
-void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp);
-unsigned free_all_qps(struct rvt_dev_info *rdi);
-void notify_qp_reset(struct rvt_qp *qp);
-int get_pmtu_from_attr(struct rvt_dev_info *rdi, struct rvt_qp *qp,
-                      struct ib_qp_attr *attr);
-void flush_qp_waiters(struct rvt_qp *qp);
-void notify_error_qp(struct rvt_qp *qp);
-void stop_send_queue(struct rvt_qp *qp);
-void quiesce_qp(struct rvt_qp *qp);
-u32 mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu);
-int mtu_to_path_mtu(u32 mtu);
-void hfi1_error_port_qps(struct hfi1_ibport *ibp, u8 sl);
-#endif /* _QP_H */
diff --git a/drivers/staging/rdma/hfi1/qsfp.c b/drivers/staging/rdma/hfi1/qsfp.c
deleted file mode 100644 (file)
index 2441669..0000000
+++ /dev/null
@@ -1,632 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/delay.h>
-#include <linux/pci.h>
-#include <linux/vmalloc.h>
-
-#include "hfi.h"
-#include "twsi.h"
-
-/*
- * QSFP support for hfi driver, using "Two Wire Serial Interface" driver
- * in twsi.c
- */
-#define I2C_MAX_RETRY 4
-
-/*
- * Raw i2c write.  No set-up or lock checking.
- */
-static int __i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
-                      int offset, void *bp, int len)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       int ret, cnt;
-       u8 *buff = bp;
-
-       cnt = 0;
-       while (cnt < len) {
-               int wlen = len - cnt;
-
-               ret = hfi1_twsi_blk_wr(dd, target, i2c_addr, offset,
-                                      buff + cnt, wlen);
-               if (ret) {
-                       /* hfi1_twsi_blk_wr() 1 for error, else 0 */
-                       return -EIO;
-               }
-               offset += wlen;
-               cnt += wlen;
-       }
-
-       /* Must wait min 20us between qsfp i2c transactions */
-       udelay(20);
-
-       return cnt;
-}
-
-/*
- * Caller must hold the i2c chain resource.
- */
-int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
-             void *bp, int len)
-{
-       int ret;
-
-       if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
-               return -EACCES;
-
-       /* make sure the TWSI bus is in a sane state */
-       ret = hfi1_twsi_reset(ppd->dd, target);
-       if (ret) {
-               hfi1_dev_porterr(ppd->dd, ppd->port,
-                                "I2C chain %d write interface reset failed\n",
-                                target);
-               return ret;
-       }
-
-       return __i2c_write(ppd, target, i2c_addr, offset, bp, len);
-}
-
-/*
- * Raw i2c read.  No set-up or lock checking.
- */
-static int __i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
-                     int offset, void *bp, int len)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       int ret, cnt, pass = 0;
-       int orig_offset = offset;
-
-       cnt = 0;
-       while (cnt < len) {
-               int rlen = len - cnt;
-
-               ret = hfi1_twsi_blk_rd(dd, target, i2c_addr, offset,
-                                      bp + cnt, rlen);
-               /* Some QSFP's fail first try. Retry as experiment */
-               if (ret && cnt == 0 && ++pass < I2C_MAX_RETRY)
-                       continue;
-               if (ret) {
-                       /* hfi1_twsi_blk_rd() 1 for error, else 0 */
-                       ret = -EIO;
-                       goto exit;
-               }
-               offset += rlen;
-               cnt += rlen;
-       }
-
-       ret = cnt;
-
-exit:
-       if (ret < 0) {
-               hfi1_dev_porterr(dd, ppd->port,
-                                "I2C chain %d read failed, addr 0x%x, offset 0x%x, len %d\n",
-                                target, i2c_addr, orig_offset, len);
-       }
-
-       /* Must wait min 20us between qsfp i2c transactions */
-       udelay(20);
-
-       return ret;
-}
-
-/*
- * Caller must hold the i2c chain resource.
- */
-int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
-            void *bp, int len)
-{
-       int ret;
-
-       if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
-               return -EACCES;
-
-       /* make sure the TWSI bus is in a sane state */
-       ret = hfi1_twsi_reset(ppd->dd, target);
-       if (ret) {
-               hfi1_dev_porterr(ppd->dd, ppd->port,
-                                "I2C chain %d read interface reset failed\n",
-                                target);
-               return ret;
-       }
-
-       return __i2c_read(ppd, target, i2c_addr, offset, bp, len);
-}
-
-/*
- * Write page n, offset m of QSFP memory as defined by SFF 8636
- * by writing @addr = ((256 * n) + m)
- *
- * Caller must hold the i2c chain resource.
- */
-int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
-              int len)
-{
-       int count = 0;
-       int offset;
-       int nwrite;
-       int ret;
-       u8 page;
-
-       if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
-               return -EACCES;
-
-       /* make sure the TWSI bus is in a sane state */
-       ret = hfi1_twsi_reset(ppd->dd, target);
-       if (ret) {
-               hfi1_dev_porterr(ppd->dd, ppd->port,
-                                "QSFP chain %d write interface reset failed\n",
-                                target);
-               return ret;
-       }
-
-       while (count < len) {
-               /*
-                * Set the qsfp page based on a zero-based address
-                * and a page size of QSFP_PAGESIZE bytes.
-                */
-               page = (u8)(addr / QSFP_PAGESIZE);
-
-               ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE,
-                                 QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1);
-               if (ret != 1) {
-                       hfi1_dev_porterr(ppd->dd, ppd->port,
-                                        "QSFP chain %d can't write QSFP_PAGE_SELECT_BYTE: %d\n",
-                                        target, ret);
-                       ret = -EIO;
-                       break;
-               }
-
-               offset = addr % QSFP_PAGESIZE;
-               nwrite = len - count;
-               /* truncate write to boundary if crossing boundary */
-               if (((addr % QSFP_RW_BOUNDARY) + nwrite) > QSFP_RW_BOUNDARY)
-                       nwrite = QSFP_RW_BOUNDARY - (addr % QSFP_RW_BOUNDARY);
-
-               ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE,
-                                 offset, bp + count, nwrite);
-               if (ret <= 0)   /* stop on error or nothing written */
-                       break;
-
-               count += ret;
-               addr += ret;
-       }
-
-       if (ret < 0)
-               return ret;
-       return count;
-}
-
-/*
- * Perform a stand-alone single QSFP write.  Acquire the resource, do the
- * read, then release the resource.
- */
-int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
-                  int len)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       u32 resource = qsfp_resource(dd);
-       int ret;
-
-       ret = acquire_chip_resource(dd, resource, QSFP_WAIT);
-       if (ret)
-               return ret;
-       ret = qsfp_write(ppd, target, addr, bp, len);
-       release_chip_resource(dd, resource);
-
-       return ret;
-}
-
-/*
- * Access page n, offset m of QSFP memory as defined by SFF 8636
- * by reading @addr = ((256 * n) + m)
- *
- * Caller must hold the i2c chain resource.
- */
-int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
-             int len)
-{
-       int count = 0;
-       int offset;
-       int nread;
-       int ret;
-       u8 page;
-
-       if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
-               return -EACCES;
-
-       /* make sure the TWSI bus is in a sane state */
-       ret = hfi1_twsi_reset(ppd->dd, target);
-       if (ret) {
-               hfi1_dev_porterr(ppd->dd, ppd->port,
-                                "QSFP chain %d read interface reset failed\n",
-                                target);
-               return ret;
-       }
-
-       while (count < len) {
-               /*
-                * Set the qsfp page based on a zero-based address
-                * and a page size of QSFP_PAGESIZE bytes.
-                */
-               page = (u8)(addr / QSFP_PAGESIZE);
-               ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE,
-                                 QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1);
-               if (ret != 1) {
-                       hfi1_dev_porterr(ppd->dd, ppd->port,
-                                        "QSFP chain %d can't write QSFP_PAGE_SELECT_BYTE: %d\n",
-                                        target, ret);
-                       ret = -EIO;
-                       break;
-               }
-
-               offset = addr % QSFP_PAGESIZE;
-               nread = len - count;
-               /* truncate read to boundary if crossing boundary */
-               if (((addr % QSFP_RW_BOUNDARY) + nread) > QSFP_RW_BOUNDARY)
-                       nread = QSFP_RW_BOUNDARY - (addr % QSFP_RW_BOUNDARY);
-
-               /* QSFPs require a 5-10msec delay after write operations */
-               mdelay(5);
-               ret = __i2c_read(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE,
-                                offset, bp + count, nread);
-               if (ret <= 0)   /* stop on error or nothing read */
-                       break;
-
-               count += ret;
-               addr += ret;
-       }
-
-       if (ret < 0)
-               return ret;
-       return count;
-}
-
-/*
- * Perform a stand-alone single QSFP read.  Acquire the resource, do the
- * read, then release the resource.
- */
-int one_qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
-                 int len)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       u32 resource = qsfp_resource(dd);
-       int ret;
-
-       ret = acquire_chip_resource(dd, resource, QSFP_WAIT);
-       if (ret)
-               return ret;
-       ret = qsfp_read(ppd, target, addr, bp, len);
-       release_chip_resource(dd, resource);
-
-       return ret;
-}
-
-/*
- * This function caches the QSFP memory range in 128 byte chunks.
- * As an example, the next byte after address 255 is byte 128 from
- * upper page 01H (if existing) rather than byte 0 from lower page 00H.
- * Access page n, offset m of QSFP memory as defined by SFF 8636
- * in the cache by reading byte ((128 * n) + m)
- * The calls to qsfp_{read,write} in this function correctly handle the
- * address map difference between this mapping and the mapping implemented
- * by those functions
- *
- * The caller must be holding the QSFP i2c chain resource.
- */
-int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp)
-{
-       u32 target = ppd->dd->hfi1_id;
-       int ret;
-       unsigned long flags;
-       u8 *cache = &cp->cache[0];
-
-       /* ensure sane contents on invalid reads, for cable swaps */
-       memset(cache, 0, (QSFP_MAX_NUM_PAGES * 128));
-       spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
-       ppd->qsfp_info.cache_valid = 0;
-       spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags);
-
-       if (!qsfp_mod_present(ppd)) {
-               ret = -ENODEV;
-               goto bail;
-       }
-
-       ret = qsfp_read(ppd, target, 0, cache, QSFP_PAGESIZE);
-       if (ret != QSFP_PAGESIZE) {
-               dd_dev_info(ppd->dd,
-                           "%s: Page 0 read failed, expected %d, got %d\n",
-                           __func__, QSFP_PAGESIZE, ret);
-               goto bail;
-       }
-
-       /* Is paging enabled? */
-       if (!(cache[2] & 4)) {
-               /* Paging enabled, page 03 required */
-               if ((cache[195] & 0xC0) == 0xC0) {
-                       /* all */
-                       ret = qsfp_read(ppd, target, 384, cache + 256, 128);
-                       if (ret <= 0 || ret != 128) {
-                               dd_dev_info(ppd->dd, "%s failed\n", __func__);
-                               goto bail;
-                       }
-                       ret = qsfp_read(ppd, target, 640, cache + 384, 128);
-                       if (ret <= 0 || ret != 128) {
-                               dd_dev_info(ppd->dd, "%s failed\n", __func__);
-                               goto bail;
-                       }
-                       ret = qsfp_read(ppd, target, 896, cache + 512, 128);
-                       if (ret <= 0 || ret != 128) {
-                               dd_dev_info(ppd->dd, "%s failed\n", __func__);
-                               goto bail;
-                       }
-               } else if ((cache[195] & 0x80) == 0x80) {
-                       /* only page 2 and 3 */
-                       ret = qsfp_read(ppd, target, 640, cache + 384, 128);
-                       if (ret <= 0 || ret != 128) {
-                               dd_dev_info(ppd->dd, "%s failed\n", __func__);
-                               goto bail;
-                       }
-                       ret = qsfp_read(ppd, target, 896, cache + 512, 128);
-                       if (ret <= 0 || ret != 128) {
-                               dd_dev_info(ppd->dd, "%s failed\n", __func__);
-                               goto bail;
-                       }
-               } else if ((cache[195] & 0x40) == 0x40) {
-                       /* only page 1 and 3 */
-                       ret = qsfp_read(ppd, target, 384, cache + 256, 128);
-                       if (ret <= 0 || ret != 128) {
-                               dd_dev_info(ppd->dd, "%s failed\n", __func__);
-                               goto bail;
-                       }
-                       ret = qsfp_read(ppd, target, 896, cache + 512, 128);
-                       if (ret <= 0 || ret != 128) {
-                               dd_dev_info(ppd->dd, "%s failed\n", __func__);
-                               goto bail;
-                       }
-               } else {
-                       /* only page 3 */
-                       ret = qsfp_read(ppd, target, 896, cache + 512, 128);
-                       if (ret <= 0 || ret != 128) {
-                               dd_dev_info(ppd->dd, "%s failed\n", __func__);
-                               goto bail;
-                       }
-               }
-       }
-
-       spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
-       ppd->qsfp_info.cache_valid = 1;
-       ppd->qsfp_info.cache_refresh_required = 0;
-       spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags);
-
-       return 0;
-
-bail:
-       memset(cache, 0, (QSFP_MAX_NUM_PAGES * 128));
-       return ret;
-}
-
-/*
- * Device technology names.  qsfp_dump() selects an entry with the upper
- * nibble (bits 7:4) of the byte at QSFP_MOD_TECH_OFFS.
- */
-const char * const hfi1_qsfp_devtech[16] = {
-       [0x0] = "850nm VCSEL",
-       [0x1] = "1310nm VCSEL",
-       [0x2] = "1550nm VCSEL",
-       [0x3] = "1310nm FP",
-       [0x4] = "1310nm DFB",
-       [0x5] = "1550nm DFB",
-       [0x6] = "1310nm EML",
-       [0x7] = "1550nm EML",
-       [0x8] = "Cu Misc",
-       [0x9] = "1490nm DFB",
-       [0xa] = "Cu NoEq",
-       [0xb] = "Cu Eq",
-       [0xc] = "Undef",
-       [0xd] = "Cu Active BothEq",
-       [0xe] = "Cu FarEq",
-       [0xf] = "Cu NearEq",
-};
-
-/* Number of cache bytes printed per row of the hex dump in qsfp_dump() */
-#define QSFP_DUMP_CHUNK 16 /* Holds longest string */
-/* Number of cache bytes, counted from offset 0, that qsfp_dump() hex dumps */
-#define QSFP_DEFAULT_HDR_CNT 224
-
-/* Bits [7:6] of the power class byte: low power class encoding */
-#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3)
-/* Bits [1:0] of the power class byte: high power class encoding */
-#define QSFP_HIGH_PWR(pbyte) ((pbyte) & 3)
-/* For use with QSFP_HIGH_PWR macro */
-#define QSFP_HIGH_PWR_UNUSED   0 /* Bits [1:0] = 00 implies low power module */
-
-/*
- * Decode the power class from the power class byte [Page 00 Byte 129]
- * in SFF 8636.
- *
- * A non-zero value in bits [1:0] selects one of the high power classes;
- * otherwise bits [7:6] select one of the low power classes.
- *
- * Returns power class as integer (1 through 7, per SFF 8636 rev 2.4)
- */
-int get_qsfp_power_class(u8 power_byte)
-{
-       int high = QSFP_HIGH_PWR(power_byte);
-
-       /*
-        * Encoding 00 of the high power field means "unused", so the
-        * remaining encodings 1..3 follow directly after the four low
-        * power classes: add 4 to land on classes 5..7.
-        */
-       if (high != QSFP_HIGH_PWR_UNUSED)
-               return high + 4;
-
-       /* low power classes count from 1, their bit encodings from 0 */
-       return QSFP_PWR(power_byte) + 1;
-}
-
-/*
- * Report whether a QSFP module is plugged into the port.
- *
- * Reads the QSFP input pins CSR that belongs to this HFI (selected by
- * dd->hfi1_id) and tests the MODPRST_N pin, which is asserted low.
- *
- * Returns 1 when a module is present, 0 otherwise.
- */
-int qsfp_mod_present(struct hfi1_pportdata *ppd)
-{
-       struct hfi1_devdata *dd = ppd->dd;
-       u64 pins;
-
-       if (dd->hfi1_id)
-               pins = read_csr(dd, ASIC_QSFP2_IN);
-       else
-               pins = read_csr(dd, ASIC_QSFP1_IN);
-
-       /* pin low (bit clear) means the module is there */
-       return (pins & QSFP_HFI0_MODPRST_N) ? 0 : 1;
-}
-
-/*
- * This function maps QSFP memory addresses in 128 byte chunks in the following
- * fashion per the CableInfo SMA query definition in the IBA 1.3 spec/OPA Gen 1
- * spec
- * For addr 000-127, lower page 00h
- * For addr 128-255, upper page 00h
- * For addr 256-383, upper page 01h
- * For addr 384-511, upper page 02h
- * For addr 512-639, upper page 03h
- *
- * For addresses beyond this range, it returns the invalid range of data buffer
- * set to 0.
- * For upper pages that are optional, if they are not valid, returns the
- * particular range of bytes in the data buffer set to 0.
- *
- * On failure (invalid port number, no module present, cache not valid, or
- * addr completely out of range) all len bytes of data are zeroed and a
- * negative errno is returned.  A request that starts in range but runs past
- * the end of the cache copies what is available, zeroes the remainder and
- * returns 0.
- */
-int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr, u32 len,
-                  u8 *data)
-{
-       struct hfi1_pportdata *ppd;
-       /*
-        * Number of bytes at data to clear in set_zeroes.  Until some bytes
-        * have been copied the whole buffer is invalid, so every error path
-        * hands back zeroes instead of leaving the buffer untouched.
-        */
-       u32 excess_len = len;
-       u32 avail;
-       int ret = 0;
-
-       if (port_num > dd->num_pports || port_num < 1) {
-               dd_dev_info(dd, "%s: Invalid port number %d\n",
-                           __func__, port_num);
-               ret = -EINVAL;
-               goto set_zeroes;
-       }
-
-       ppd = dd->pport + (port_num - 1);
-       if (!qsfp_mod_present(ppd)) {
-               ret = -ENODEV;
-               goto set_zeroes;
-       }
-
-       if (!ppd->qsfp_info.cache_valid) {
-               ret = -EINVAL;
-               goto set_zeroes;
-       }
-
-       if (addr >= (QSFP_MAX_NUM_PAGES * 128)) {
-               ret = -ERANGE;
-               goto set_zeroes;
-       }
-
-       /*
-        * addr is known to be inside the cache here, so the subtraction
-        * cannot underflow.  Comparing len against the room that is left
-        * avoids the u32 wrap-around that "addr + len" is exposed to.
-        */
-       avail = (QSFP_MAX_NUM_PAGES * 128) - addr;
-       if (len > avail) {
-               excess_len = len - avail;
-               memcpy(data, &ppd->qsfp_info.cache[addr], avail);
-               data += avail;
-               goto set_zeroes;
-       }
-
-       memcpy(data, &ppd->qsfp_info.cache[addr], len);
-       return 0;
-
-set_zeroes:
-       memset(data, 0, excess_len);
-       return ret;
-}
-
-/*
- * Power strings, indexed by the value get_qsfp_power_class() returns.
- * That function returns 1 through 7, so entry 0 is a placeholder.
- * qsfp_dump() prints only the first three characters of an entry and
- * supplies the trailing 'W' itself.
- */
-static const char * const pwr_codes[8] = {"N/AW",
-                                 "1.5W",
-                                 "2.0W",
-                                 "2.5W",
-                                 "3.5W",
-                                 "4.0W",
-                                 "4.5W",
-                                 "5.0W"
-                                };
-
-/*
- * Write a text description of the cached QSFP module data into buf.
- *
- * Emits power, technology (with cable length for copper), vendor, OUI,
- * part number, revision, attenuation (copper only), serial number, date
- * and lot, followed by a hex dump of the first QSFP_DEFAULT_HDR_CNT bytes
- * of the cache, QSFP_DUMP_CHUNK bytes per line.  Nothing is written while
- * the cache is not valid.
- *
- * Returns the accumulated scnprintf() count, i.e. the number of characters
- * placed in buf.
- */
-int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len)
-{
-       u8 *cache = &ppd->qsfp_info.cache[0];
-       u8 *atten = &cache[QSFP_ATTEN_OFFS];
-       u8 *vendor_oui = &cache[QSFP_VOUI_OFFS];
-       u8 row[QSFP_DUMP_CHUNK];
-       char lenstr[6] = " ";
-       u8 power_byte;
-       int used = 0;
-       int base;
-       int col;
-
-       if (!ppd->qsfp_info.cache_valid)
-               return 0;
-
-       /* the length byte is only printed for copper cables */
-       if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS]))
-               sprintf(lenstr, "%dM ", cache[QSFP_MOD_LEN_OFFS]);
-
-       power_byte = cache[QSFP_MOD_PWR_OFFS];
-       used += scnprintf(buf + used, len - used, "PWR:%.3sW\n",
-                         pwr_codes[get_qsfp_power_class(power_byte)]);
-
-       used += scnprintf(buf + used, len - used, "TECH:%s%s\n",
-                         lenstr,
-                         hfi1_qsfp_devtech[(cache[QSFP_MOD_TECH_OFFS]) >> 4]);
-
-       used += scnprintf(buf + used, len - used, "Vendor:%.*s\n",
-                         QSFP_VEND_LEN, &cache[QSFP_VEND_OFFS]);
-
-       used += scnprintf(buf + used, len - used, "OUI:%06X\n",
-                         QSFP_OUI(vendor_oui));
-
-       used += scnprintf(buf + used, len - used, "Part#:%.*s\n",
-                         QSFP_PN_LEN, &cache[QSFP_PN_OFFS]);
-
-       used += scnprintf(buf + used, len - used, "Rev:%.*s\n",
-                         QSFP_REV_LEN, &cache[QSFP_REV_OFFS]);
-
-       if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS]))
-               used += scnprintf(buf + used, len - used,
-                                 "Atten:%d, %d\n",
-                                 QSFP_ATTEN_SDR(atten),
-                                 QSFP_ATTEN_DDR(atten));
-
-       used += scnprintf(buf + used, len - used, "Serial:%.*s\n",
-                         QSFP_SN_LEN, &cache[QSFP_SN_OFFS]);
-
-       used += scnprintf(buf + used, len - used, "Date:%.*s\n",
-                         QSFP_DATE_LEN, &cache[QSFP_DATE_OFFS]);
-
-       used += scnprintf(buf + used, len - used, "Lot:%.*s\n",
-                         QSFP_LOT_LEN, &cache[QSFP_LOT_OFFS]);
-
-       /* raw hex dump, one row of QSFP_DUMP_CHUNK bytes per line */
-       for (base = 0; base < QSFP_DEFAULT_HDR_CNT; base += QSFP_DUMP_CHUNK) {
-               memcpy(row, &cache[base], QSFP_DUMP_CHUNK);
-               for (col = 0; col < QSFP_DUMP_CHUNK; col++)
-                       used += scnprintf(buf + used, len - used,
-                                         " %02X", row[col]);
-               used += scnprintf(buf + used, len - used, "\n");
-       }
-
-       return used;
-}
diff --git a/drivers/staging/rdma/hfi1/qsfp.h b/drivers/staging/rdma/hfi1/qsfp.h
deleted file mode 100644 (file)
index dadc66c..0000000
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-/* QSFP support common definitions, for hfi driver */
-
-#define QSFP_DEV 0xA0
-#define QSFP_PWR_LAG_MSEC 2000
-#define QSFP_MODPRS_LAG_MSEC 20
-/* 128 byte pages, per SFF 8636 rev 2.4 */
-#define QSFP_MAX_NUM_PAGES     5
-
-/*
- * Below are masks for QSFP pins.  Pins are the same for HFI0 and HFI1.
- * _N means asserted low
- */
-#define QSFP_HFI0_I2CCLK    BIT(0)
-#define QSFP_HFI0_I2CDAT    BIT(1)
-#define QSFP_HFI0_RESET_N   BIT(2)
-#define QSFP_HFI0_INT_N            BIT(3)
-#define QSFP_HFI0_MODPRST_N BIT(4)
-
-/* QSFP is paged at 256 bytes */
-#define QSFP_PAGESIZE 256
-/* Reads/writes cannot cross 128 byte boundaries */
-#define QSFP_RW_BOUNDARY 128
-
-/* number of bytes in i2c offset for QSFP devices */
-#define __QSFP_OFFSET_SIZE 1                           /* num address bytes */
-#define QSFP_OFFSET_SIZE (__QSFP_OFFSET_SIZE << 8)     /* shifted value */
-
-/* Defined fields that Intel requires of qualified cables */
-/* Byte 0 is Identifier, not checked */
-/* Byte 1 is reserved "status MSB" */
-#define QSFP_TX_CTRL_BYTE_OFFS 86
-#define QSFP_PWR_CTRL_BYTE_OFFS 93
-#define QSFP_CDR_CTRL_BYTE_OFFS 98
-
-#define QSFP_PAGE_SELECT_BYTE_OFFS 127
-/* Byte 128 is Identifier: must be 0x0c for QSFP, or 0x0d for QSFP+ */
-#define QSFP_MOD_ID_OFFS 128
-/*
- * Byte 129 is "Extended Identifier".
- * For bits [7:6]: 0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W
- * For bits [1:0]: 0:Unused, 1:4W, 2:4.5W, 3:5W
- */
-#define QSFP_MOD_PWR_OFFS 129
-/* Byte 130 is Connector type. Not Intel req'd */
-/* Bytes 131..138 are Transceiver types, bit maps for various tech, none IB */
-/* Byte 139 is encoding. code 0x01 is 8b10b. Not Intel req'd */
-/* byte 140 is nominal bit-rate, in units of 100Mbits/sec */
-#define QSFP_NOM_BIT_RATE_100_OFFS 140
-/* Byte 141 is Extended Rate Select. Not Intel req'd */
-/* Bytes 142..145 are lengths for various fiber types. Not Intel req'd */
-/* Byte 146 is length for Copper. Units of 1 meter */
-#define QSFP_MOD_LEN_OFFS 146
-/*
- * Byte 147 is Device technology. D0..3 not Intel req'd
- * D4..7 select from 15 choices, translated by table:
- */
-#define QSFP_MOD_TECH_OFFS 147
-extern const char *const hfi1_qsfp_devtech[16];
-/* Active Equalization includes fiber, copper full EQ, and copper near Eq */
-#define QSFP_IS_ACTIVE(tech) ((0xA2FF >> ((tech) >> 4)) & 1)
-/* Active Equalization includes fiber, copper full EQ, and copper far Eq */
-#define QSFP_IS_ACTIVE_FAR(tech) ((0x32FF >> ((tech) >> 4)) & 1)
-/* Attenuation should be valid for copper other than full/near Eq */
-#define QSFP_HAS_ATTEN(tech) ((0x4D00 >> ((tech) >> 4)) & 1)
-/* Length is only valid if technology is "copper" */
-#define QSFP_IS_CU(tech) ((0xED00 >> ((tech) >> 4)) & 1)
-#define QSFP_TECH_1490 9
-
-/*
- * Combine the three OUI bytes into one value, oui[0] being the most
- * significant byte.  The argument is parenthesized so that any pointer
- * expression (e.g. &cache[QSFP_VOUI_OFFS]) can be passed safely.
- */
-#define QSFP_OUI(oui) (((unsigned)(oui)[0] << 16) | \
-                       ((unsigned)(oui)[1] << 8) | (oui)[2])
-#define QSFP_OUI_AMPHENOL 0x415048
-#define QSFP_OUI_FINISAR  0x009065
-#define QSFP_OUI_GORE     0x002177
-
-/* Bytes 148..163 are Vendor Name, Left-justified Blank-filled */
-#define QSFP_VEND_OFFS 148
-#define QSFP_VEND_LEN 16
-/* Byte 164 is IB Extended transceiver codes Bits D0..3 are SDR,DDR,QDR,EDR */
-#define QSFP_IBXCV_OFFS 164
-/* Bytes 165..167 are Vendor OUI number */
-#define QSFP_VOUI_OFFS 165
-#define QSFP_VOUI_LEN 3
-/* Bytes 168..183 are Vendor Part Number, string */
-#define QSFP_PN_OFFS 168
-#define QSFP_PN_LEN 16
-/* Bytes 184,185 are Vendor Rev. Left Justified, Blank-filled */
-#define QSFP_REV_OFFS 184
-#define QSFP_REV_LEN 2
-/*
- * Bytes 186,187 are Wavelength, if Optical. Not Intel req'd
- *  If copper, they are attenuation in dB:
- * Byte 186 is at 2.5Gb/sec (SDR), Byte 187 at 5.0Gb/sec (DDR)
- */
-#define QSFP_ATTEN_OFFS 186
-#define QSFP_ATTEN_LEN 2
-/*
- * Bytes 188,189 are Wavelength tolerance, if optical
- * If copper, they are attenuation in dB:
- * Byte 188 is at 12.5 Gb/s, Byte 189 at 25 Gb/s
- */
-#define QSFP_CU_ATTEN_7G_OFFS 188
-#define QSFP_CU_ATTEN_12G_OFFS 189
-/* Byte 190 is Max Case Temp. Not Intel req'd */
-/* Byte 191 is LSB of sum of bytes 128..190. Not Intel req'd */
-#define QSFP_CC_OFFS 191
-#define QSFP_EQ_INFO_OFFS 193
-#define QSFP_CDR_INFO_OFFS 194
-/* Bytes 196..211 are Serial Number, String */
-#define QSFP_SN_OFFS 196
-#define QSFP_SN_LEN 16
-/* Bytes 212..219 are the date-code field; 212..217 hold YYMMDD (MM==1 for Jan) */
-#define QSFP_DATE_OFFS 212
-#define QSFP_DATE_LEN 6
-/* Bytes 218,219 are optional lot-code, string */
-#define QSFP_LOT_OFFS 218
-#define QSFP_LOT_LEN 2
-/* Bytes 220, 221 indicate monitoring options, Not Intel req'd */
-/* Byte 222 indicates nominal bitrate in units of 250Mbits/sec */
-#define QSFP_NOM_BIT_RATE_250_OFFS 222
-/* Byte 223 is LSB of sum of bytes 192..222 */
-#define QSFP_CC_EXT_OFFS 223
-
-/*
- * Interrupt flag masks
- */
-#define QSFP_DATA_NOT_READY            0x01
-
-#define QSFP_HIGH_TEMP_ALARM           0x80
-#define QSFP_LOW_TEMP_ALARM            0x40
-#define QSFP_HIGH_TEMP_WARNING         0x20
-#define QSFP_LOW_TEMP_WARNING          0x10
-
-#define QSFP_HIGH_VCC_ALARM            0x80
-#define QSFP_LOW_VCC_ALARM             0x40
-#define QSFP_HIGH_VCC_WARNING          0x20
-#define QSFP_LOW_VCC_WARNING           0x10
-
-#define QSFP_HIGH_POWER_ALARM          0x88
-#define QSFP_LOW_POWER_ALARM           0x44
-#define QSFP_HIGH_POWER_WARNING                0x22
-#define QSFP_LOW_POWER_WARNING         0x11
-
-#define QSFP_HIGH_BIAS_ALARM           0x88
-#define QSFP_LOW_BIAS_ALARM            0x44
-#define QSFP_HIGH_BIAS_WARNING         0x22
-#define QSFP_LOW_BIAS_WARNING          0x11
-
-/*
- * Accessors for the two attenuation bytes: element 0 is the SDR value,
- * element 1 the DDR value.  The argument is parenthesized so that any
- * pointer expression can be passed safely.
- */
-#define QSFP_ATTEN_SDR(attenarray) ((attenarray)[0])
-#define QSFP_ATTEN_DDR(attenarray) ((attenarray)[1])
-
-/*
- * struct qsfp_data encapsulates state of QSFP device for one port.
- * it will be part of port-specific data if a board supports QSFP.
- *
- * Since multiple board-types use QSFP, and their pport_data structs
- * differ (in the chip-specific section), we need a pointer to its head.
- *
- * Avoiding premature optimization, we will have one work_struct per port,
- * and let the qsfp_lock arbitrate access to common resources.
- *
- */
-struct qsfp_data {
-       /* Helps to find our way */
-       struct hfi1_pportdata *ppd;
-       /* the per-port work item mentioned above */
-       struct work_struct qsfp_work;
-       /* cached module memory: QSFP_MAX_NUM_PAGES chunks of 128 bytes */
-       u8 cache[QSFP_MAX_NUM_PAGES * 128];
-       /* protect qsfp data */
-       spinlock_t qsfp_lock;
-       /* NOTE(review): meaning of the next three flags is not visible here */
-       u8 check_interrupt_flags;
-       u8 reset_needed;
-       u8 limiting_active;
-       /*
-        * Non-zero once cache[] holds a complete, successful read;
-        * refresh_qsfp_cache() changes it under qsfp_lock.
-        */
-       u8 cache_valid;
-       /* cleared by refresh_qsfp_cache() after a successful refresh */
-       u8 cache_refresh_required;
-};
-
-int refresh_qsfp_cache(struct hfi1_pportdata *ppd,
-                      struct qsfp_data *cp);
-int get_qsfp_power_class(u8 power_byte);
-int qsfp_mod_present(struct hfi1_pportdata *ppd);
-int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr,
-                  u32 len, u8 *data);
-
-int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
-             int offset, void *bp, int len);
-int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
-            int offset, void *bp, int len);
-int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
-              int len);
-int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
-             int len);
-int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
-                  int len);
-int one_qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
-                 int len);
diff --git a/drivers/staging/rdma/hfi1/rc.c b/drivers/staging/rdma/hfi1/rc.c
deleted file mode 100644 (file)
index 792f15e..0000000
+++ /dev/null
@@ -1,2580 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/io.h>
-#include <rdma/rdma_vt.h>
-#include <rdma/rdmavt_qp.h>
-
-#include "hfi.h"
-#include "qp.h"
-#include "verbs_txreq.h"
-#include "trace.h"
-
-/* cut down ridiculously long IB macro names */
-#define OP(x) IB_OPCODE_RC_##x
-
-/**
- * hfi1_add_retry_timer - add/start a retry timer
- * @qp - the QP
- *
- * Flag the QP as having a retry timer (RVT_S_TIMER) and arm s_timer to
- * fire after the QP timeout plus the device's busy_jiffies.
- */
-static inline void hfi1_add_retry_timer(struct rvt_qp *qp)
-{
-       struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
-       struct timer_list *retry = &qp->s_timer;
-
-       qp->s_flags |= RVT_S_TIMER;
-       /* 4.096 usec. * (1 << qp->timeout) */
-       retry->expires = jiffies + qp->timeout_jiffies + rdi->busy_jiffies;
-       add_timer(retry);
-}
-
-/**
- * hfi1_add_rnr_timer - add/start an rnr timer
- * @qp - the QP
- * @to - timeout in usecs
- *
- * Flag the QP as waiting for RNR (RVT_S_WAIT_RNR) and arm the private
- * s_rnr_timer to fire @to microseconds from now.
- */
-void hfi1_add_rnr_timer(struct rvt_qp *qp, u32 to)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       qp->s_flags |= RVT_S_WAIT_RNR;
-       /*
-        * The expiry must be set on the timer that is actually armed.
-        * Writing qp->s_timer.expires here would leave s_rnr_timer with a
-        * stale expiry and would clobber the expiry of the retry timer.
-        */
-       priv->s_rnr_timer.expires = jiffies + usecs_to_jiffies(to);
-       add_timer(&priv->s_rnr_timer);
-}
-
-/**
- * hfi1_mod_retry_timer - mod a retry timer
- * @qp - the QP
- *
- * Flag the QP as having a retry timer and (re)arm s_timer, which may
- * already be running, with a fresh expiry.
- */
-static inline void hfi1_mod_retry_timer(struct rvt_qp *qp)
-{
-       struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
-       struct timer_list *retry = &qp->s_timer;
-
-       qp->s_flags |= RVT_S_TIMER;
-       /* 4.096 usec. * (1 << qp->timeout) */
-       mod_timer(retry, jiffies + qp->timeout_jiffies + rdi->busy_jiffies);
-}
-
-/**
- * hfi1_stop_retry_timer - stop a retry timer
- * @qp - the QP
- *
- * Clear RVT_S_TIMER and delete s_timer if the flag was set.
- *
- * Returns the del_timer() result when the flag was set, 0 otherwise.
- */
-static inline int hfi1_stop_retry_timer(struct rvt_qp *qp)
-{
-       /* QP is not flagged as being on the retry timer */
-       if (!(qp->s_flags & RVT_S_TIMER))
-               return 0;
-
-       /* Remove QP from retry */
-       qp->s_flags &= ~RVT_S_TIMER;
-       return del_timer(&qp->s_timer);
-}
-
-/**
- * hfi1_stop_rc_timers - stop all timers
- * @qp - the QP
- *
- * If either the retry or the rnr timer is flagged on the QP, clear both
- * flags and delete both timers.
- */
-void hfi1_stop_rc_timers(struct rvt_qp *qp)
-{
-       struct hfi1_qp_priv *priv;
-
-       /* neither timer is flagged, nothing to remove */
-       if (!(qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR)))
-               return;
-
-       /* Remove QP from all timers */
-       priv = qp->priv;
-       qp->s_flags &= ~(RVT_S_TIMER | RVT_S_WAIT_RNR);
-       del_timer(&qp->s_timer);
-       del_timer(&priv->s_rnr_timer);
-}
-
-/**
- * hfi1_stop_rnr_timer - stop an rnr timer
- * @qp - the QP
- *
- * Clear RVT_S_WAIT_RNR and delete the private s_rnr_timer if the flag
- * was set.
- *
- * Returns the del_timer() result when the flag was set, 0 otherwise.
- */
-static inline int hfi1_stop_rnr_timer(struct rvt_qp *qp)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       /* QP is not flagged as waiting on the rnr timer */
-       if (!(qp->s_flags & RVT_S_WAIT_RNR))
-               return 0;
-
-       /* Remove QP from rnr timer */
-       qp->s_flags &= ~RVT_S_WAIT_RNR;
-       return del_timer(&priv->s_rnr_timer);
-}
-
-/**
- * hfi1_del_timers_sync - wait for any timeout routines to exit
- * @qp - the QP
- *
- * Synchronously delete the retry timer, then the rnr timer.
- */
-void hfi1_del_timers_sync(struct rvt_qp *qp)
-{
-       struct hfi1_qp_priv *qpriv = qp->priv;
-       struct timer_list *retry = &qp->s_timer;
-       struct timer_list *rnr = &qpriv->s_rnr_timer;
-
-       del_timer_sync(retry);
-       del_timer_sync(rnr);
-}
-
-/*
- * only opcode mask for adaptive pio
- *
- * One bit per listed RC opcode, positioned by the low 5 bits of the
- * opcode value.  The "& 0x1f" is applied to the expanded OP() value,
- * outside the macro argument, so the result does not rely on how OP()
- * token-pastes a multi-token argument.
- */
-const u32 rc_only_opcode =
-       BIT(OP(SEND_ONLY) & 0x1f) |
-       BIT(OP(SEND_ONLY_WITH_IMMEDIATE) & 0x1f) |
-       BIT(OP(RDMA_WRITE_ONLY) & 0x1f) |
-       BIT(OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE) & 0x1f) |
-       BIT(OP(RDMA_READ_REQUEST) & 0x1f) |
-       BIT(OP(ACKNOWLEDGE) & 0x1f) |
-       BIT(OP(ATOMIC_ACKNOWLEDGE) & 0x1f) |
-       BIT(OP(COMPARE_SWAP) & 0x1f) |
-       BIT(OP(FETCH_ADD) & 0x1f);
-
-/*
- * restart_sge - reposition an SGE state part way into a work request
- * @ss: the SGE state to load from @wqe
- * @wqe: the send work queue entry
- * @psn: the PSN to restart from
- * @pmtu: bytes carried per packet
- *
- * Loads @ss from @wqe and skips the bytes covered by the packets between
- * wqe->psn and @psn.
- *
- * Returns the number of bytes of the request left after the skip.
- */
-static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
-                      u32 psn, u32 pmtu)
-{
-       u32 skipped = delta_psn(psn, wqe->psn) * pmtu;
-
-       ss->total_len = wqe->length;
-       ss->num_sge = wqe->wr.num_sge;
-       ss->sg_list = wqe->sg_list + 1;
-       ss->sge = wqe->sg_list[0];
-       hfi1_skip_sge(ss, skipped, 0);
-
-       return wqe->length - skipped;
-}
-
-/**
- * make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
- * @dev: the device for this QP (not referenced in the body below)
- * @qp: a pointer to the QP
- * @ohdr: a pointer to the IB header being constructed
- * @ps: the xmit packet state
- *
- * The packet built depends on qp->s_ack_state, which is advanced here so
- * that the next call continues where this one stopped.
- *
- * Return 1 if constructed; otherwise, return 0.
- * When 0 is returned, s_ack_state is reset to ACKNOWLEDGE and the
- * RVT_S_RESP_PENDING, RVT_S_ACK_PENDING and RVT_S_AHG_VALID flags are
- * cleared.
- * Note that we are in the responder's side of the QP context.
- * Note the QP s_lock must be held.
- */
-static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
-                      struct hfi1_other_headers *ohdr,
-                      struct hfi1_pkt_state *ps)
-{
-       struct rvt_ack_entry *e;
-       u32 hwords;
-       u32 len;
-       u32 bth0;
-       u32 bth2;
-       int middle = 0;
-       u32 pmtu = qp->pmtu;
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       /* Don't send an ACK if we aren't supposed to. */
-       if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
-               goto bail;
-
-       /* header size in 32-bit words LRH+BTH = (8+12)/4. */
-       hwords = 5;
-
-       switch (qp->s_ack_state) {
-       case OP(RDMA_READ_RESPONSE_LAST):
-       case OP(RDMA_READ_RESPONSE_ONLY):
-               /* the read response is complete, drop its MR reference */
-               e = &qp->s_ack_queue[qp->s_tail_ack_queue];
-               if (e->rdma_sge.mr) {
-                       rvt_put_mr(e->rdma_sge.mr);
-                       e->rdma_sge.mr = NULL;
-               }
-               /* FALLTHROUGH */
-       case OP(ATOMIC_ACKNOWLEDGE):
-               /*
-                * We can increment the tail pointer now that the last
-                * response has been sent instead of only being
-                * constructed.
-                */
-               if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC)
-                       qp->s_tail_ack_queue = 0;
-               /* FALLTHROUGH */
-       case OP(SEND_ONLY):
-       case OP(ACKNOWLEDGE):
-               /* Check for no next entry in the queue. */
-               if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
-                       if (qp->s_flags & RVT_S_ACK_PENDING)
-                               goto normal;
-                       goto bail;
-               }
-
-               e = &qp->s_ack_queue[qp->s_tail_ack_queue];
-               if (e->opcode == OP(RDMA_READ_REQUEST)) {
-                       /*
-                        * If a RDMA read response is being resent and
-                        * we haven't seen the duplicate request yet,
-                        * then stop sending the remaining responses the
-                        * responder has seen until the requester re-sends it.
-                        */
-                       len = e->rdma_sge.sge_length;
-                       if (len && !e->rdma_sge.mr) {
-                               qp->s_tail_ack_queue = qp->r_head_ack_queue;
-                               goto bail;
-                       }
-                       /* Copy SGE state in case we need to resend */
-                       ps->s_txreq->mr = e->rdma_sge.mr;
-                       if (ps->s_txreq->mr)
-                               rvt_get_mr(ps->s_txreq->mr);
-                       qp->s_ack_rdma_sge.sge = e->rdma_sge;
-                       qp->s_ack_rdma_sge.num_sge = 1;
-                       qp->s_cur_sge = &qp->s_ack_rdma_sge;
-                       /* more than pmtu bytes: this is only the first packet */
-                       if (len > pmtu) {
-                               len = pmtu;
-                               qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
-                       } else {
-                               qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
-                               e->sent = 1;
-                       }
-                       ohdr->u.aeth = hfi1_compute_aeth(qp);
-                       /* one more header word for the AETH just written */
-                       hwords++;
-                       qp->s_ack_rdma_psn = e->psn;
-                       bth2 = mask_psn(qp->s_ack_rdma_psn++);
-               } else {
-                       /* COMPARE_SWAP or FETCH_ADD */
-                       qp->s_cur_sge = NULL;
-                       len = 0;
-                       qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
-                       ohdr->u.at.aeth = hfi1_compute_aeth(qp);
-                       /* 64-bit atomic result, upper 32 bits first */
-                       ohdr->u.at.atomic_ack_eth[0] =
-                               cpu_to_be32(e->atomic_data >> 32);
-                       ohdr->u.at.atomic_ack_eth[1] =
-                               cpu_to_be32(e->atomic_data);
-                       hwords += sizeof(ohdr->u.at) / sizeof(u32);
-                       bth2 = mask_psn(e->psn);
-                       e->sent = 1;
-               }
-               bth0 = qp->s_ack_state << 24;
-               break;
-
-       case OP(RDMA_READ_RESPONSE_FIRST):
-               qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
-               /* FALLTHROUGH */
-       case OP(RDMA_READ_RESPONSE_MIDDLE):
-               qp->s_cur_sge = &qp->s_ack_rdma_sge;
-               ps->s_txreq->mr = qp->s_ack_rdma_sge.sge.mr;
-               if (ps->s_txreq->mr)
-                       rvt_get_mr(ps->s_txreq->mr);
-               len = qp->s_ack_rdma_sge.sge.sge_length;
-               if (len > pmtu) {
-                       len = pmtu;
-                       middle = HFI1_CAP_IS_KSET(SDMA_AHG);
-               } else {
-                       /* remaining data fits: this is the last packet */
-                       ohdr->u.aeth = hfi1_compute_aeth(qp);
-                       hwords++;
-                       qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
-                       e = &qp->s_ack_queue[qp->s_tail_ack_queue];
-                       e->sent = 1;
-               }
-               bth0 = qp->s_ack_state << 24;
-               bth2 = mask_psn(qp->s_ack_rdma_psn++);
-               break;
-
-       default:
-normal:
-               /*
-                * Send a regular ACK.
-                * Set the s_ack_state so we wait until after sending
-                * the ACK before setting s_ack_state to ACKNOWLEDGE
-                * (see above).
-                */
-               qp->s_ack_state = OP(SEND_ONLY);
-               qp->s_flags &= ~RVT_S_ACK_PENDING;
-               qp->s_cur_sge = NULL;
-               /* a pending NAK state replaces the computed AETH */
-               if (qp->s_nak_state)
-                       ohdr->u.aeth =
-                               cpu_to_be32((qp->r_msn & HFI1_MSN_MASK) |
-                                           (qp->s_nak_state <<
-                                            HFI1_AETH_CREDIT_SHIFT));
-               else
-                       ohdr->u.aeth = hfi1_compute_aeth(qp);
-               hwords++;
-               len = 0;
-               bth0 = OP(ACKNOWLEDGE) << 24;
-               bth2 = mask_psn(qp->s_ack_psn);
-       }
-       /* common tail for every constructed response */
-       qp->s_rdma_ack_cnt++;
-       qp->s_hdrwords = hwords;
-       ps->s_txreq->sde = priv->s_sde;
-       qp->s_cur_size = len;
-       hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle, ps);
-       /* pbc */
-       ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
-       return 1;
-
-bail:
-       /* nothing was constructed */
-       qp->s_ack_state = OP(ACKNOWLEDGE);
-       /*
-        * Ensure s_rdma_ack_cnt changes are committed prior to resetting
-        * RVT_S_RESP_PENDING
-        */
-       smp_wmb();
-       qp->s_flags &= ~(RVT_S_RESP_PENDING
-                               | RVT_S_ACK_PENDING
-                               | RVT_S_AHG_VALID);
-       return 0;
-}
-
-/**
- * hfi1_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
- * @qp: a pointer to the QP
- * @ps: packet build state; ps->s_txreq is obtained here via get_txreq()
- *      and is reset to NULL on every path that does not build a packet
- *
- * Assumes s_lock is held.
- *
- * A pending response (RVT_S_RESP_PENDING) is tried first via make_rc_ack()
- * and, if one is built, it takes the place of a request for this call.
- *
- * Return 1 if constructed; otherwise, return 0.
- * 1 is also returned, with ps->s_txreq released and set to NULL, after a
- * work request has been flushed through hfi1_send_complete() (the
- * done_free_tx path).  The 0 return paths additionally clear RVT_S_BUSY
- * and zero qp->s_hdrwords.
- */
-int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-       struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
-       struct hfi1_other_headers *ohdr;
-       struct rvt_sge_state *ss;
-       struct rvt_swqe *wqe;
-       /* header size in 32-bit words LRH+BTH = (8+12)/4. */
-       u32 hwords = 5;
-       u32 len;
-       u32 bth0 = 0;
-       u32 bth2;
-       u32 pmtu = qp->pmtu;
-       char newreq;
-       int middle = 0;
-       int delta;
-
-       ps->s_txreq = get_txreq(ps->dev, qp);
-       if (IS_ERR(ps->s_txreq))
-               goto bail_no_tx; /* no txreq was obtained, so none to put */
-
-       ohdr = &ps->s_txreq->phdr.hdr.u.oth;
-       if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
-               ohdr = &ps->s_txreq->phdr.hdr.u.l.oth;
-
-       /* Sending responses has higher priority over sending requests. */
-       if ((qp->s_flags & RVT_S_RESP_PENDING) &&
-           make_rc_ack(dev, qp, ohdr, ps))
-               return 1;
-
-       if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
-               if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
-                       goto bail;
-               /* We are in the error state, flush the work request. */
-               smp_read_barrier_depends(); /* see post_one_send() */
-               if (qp->s_last == ACCESS_ONCE(qp->s_head))
-                       goto bail;
-               /* If DMAs are in progress, we can't flush immediately. */
-               if (iowait_sdma_pending(&priv->s_iowait)) {
-                       qp->s_flags |= RVT_S_WAIT_DMA;
-                       goto bail;
-               }
-               clear_ahg(qp);
-               wqe = rvt_get_swqe_ptr(qp, qp->s_last);
-               hfi1_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
-                       IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
-               /* will get called again */
-               goto done_free_tx;
-       }
-
-       if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK))
-               goto bail;
-
-       if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
-               if (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
-                       qp->s_flags |= RVT_S_WAIT_PSN;
-                       goto bail;
-               }
-               qp->s_sending_psn = qp->s_psn;
-               qp->s_sending_hpsn = qp->s_psn - 1;
-       }
-
-       /* Send a request. */
-       wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
-       switch (qp->s_state) {
-       default:
-               if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK))
-                       goto bail;
-               /*
-                * Resend an old request or start a new one.
-                *
-                * We keep track of the current SWQE so that
-                * we don't reset the "furthest progress" state
-                * if we need to back up.
-                */
-               newreq = 0;
-               if (qp->s_cur == qp->s_tail) {
-                       /* Check if send work queue is empty. */
-                       if (qp->s_tail == qp->s_head) {
-                               clear_ahg(qp);
-                               goto bail;
-                       }
-                       /*
-                        * If a fence is requested, wait for previous
-                        * RDMA read and atomic operations to finish.
-                        */
-                       if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
-                           qp->s_num_rd_atomic) {
-                               qp->s_flags |= RVT_S_WAIT_FENCE;
-                               goto bail;
-                       }
-                       newreq = 1;
-                       qp->s_psn = wqe->psn;
-               }
-               /*
-                * Note that we have to be careful not to modify the
-                * original work request since we may need to resend
-                * it.
-                */
-               len = wqe->length;
-               ss = &qp->s_sge;
-               bth2 = mask_psn(qp->s_psn);
-               switch (wqe->wr.opcode) {
-               case IB_WR_SEND:
-               case IB_WR_SEND_WITH_IMM:
-                       /* If no credit, return. */
-                       if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
-                           cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
-                               qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
-                               goto bail;
-                       }
-                       if (len > pmtu) {
-                               qp->s_state = OP(SEND_FIRST);
-                               len = pmtu;
-                               break;
-                       }
-                       if (wqe->wr.opcode == IB_WR_SEND) {
-                               qp->s_state = OP(SEND_ONLY);
-                       } else {
-                               qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
-                               /* Immediate data comes after the BTH */
-                               ohdr->u.imm_data = wqe->wr.ex.imm_data;
-                               hwords += 1;
-                       }
-                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                               bth0 |= IB_BTH_SOLICITED;
-                       bth2 |= IB_BTH_REQ_ACK;
-                       if (++qp->s_cur == qp->s_size)
-                               qp->s_cur = 0;
-                       break;
-
-               case IB_WR_RDMA_WRITE:
-                       if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
-                               qp->s_lsn++;
-                       /* FALLTHROUGH */
-               case IB_WR_RDMA_WRITE_WITH_IMM:
-                       /* If no credit, return. */
-                       if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
-                           cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
-                               qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
-                               goto bail;
-                       }
-                       ohdr->u.rc.reth.vaddr =
-                               cpu_to_be64(wqe->rdma_wr.remote_addr);
-                       ohdr->u.rc.reth.rkey =
-                               cpu_to_be32(wqe->rdma_wr.rkey);
-                       ohdr->u.rc.reth.length = cpu_to_be32(len);
-                       hwords += sizeof(struct ib_reth) / sizeof(u32);
-                       if (len > pmtu) {
-                               qp->s_state = OP(RDMA_WRITE_FIRST);
-                               len = pmtu;
-                               break;
-                       }
-                       if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
-                               qp->s_state = OP(RDMA_WRITE_ONLY);
-                       } else {
-                               qp->s_state =
-                                       OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
-                               /* Immediate data comes after RETH */
-                               ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
-                               hwords += 1;
-                               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                                       bth0 |= IB_BTH_SOLICITED;
-                       }
-                       bth2 |= IB_BTH_REQ_ACK;
-                       if (++qp->s_cur == qp->s_size)
-                               qp->s_cur = 0;
-                       break;
-
-               case IB_WR_RDMA_READ:
-                       /*
-                        * Don't allow more operations to be started
-                        * than the QP limits allow.
-                        */
-                       if (newreq) {
-                               if (qp->s_num_rd_atomic >=
-                                   qp->s_max_rd_atomic) {
-                                       qp->s_flags |= RVT_S_WAIT_RDMAR;
-                                       goto bail;
-                               }
-                               qp->s_num_rd_atomic++;
-                               if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
-                                       qp->s_lsn++;
-                       }
-                       ohdr->u.rc.reth.vaddr =
-                               cpu_to_be64(wqe->rdma_wr.remote_addr);
-                       ohdr->u.rc.reth.rkey =
-                               cpu_to_be32(wqe->rdma_wr.rkey);
-                       ohdr->u.rc.reth.length = cpu_to_be32(len);
-                       qp->s_state = OP(RDMA_READ_REQUEST);
-                       hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
-                       ss = NULL; /* the request itself is built with len = 0 */
-                       len = 0;
-                       bth2 |= IB_BTH_REQ_ACK;
-                       if (++qp->s_cur == qp->s_size)
-                               qp->s_cur = 0;
-                       break;
-
-               case IB_WR_ATOMIC_CMP_AND_SWP:
-               case IB_WR_ATOMIC_FETCH_AND_ADD:
-                       /*
-                        * Don't allow more operations to be started
-                        * than the QP limits allow.
-                        */
-                       if (newreq) {
-                               if (qp->s_num_rd_atomic >=
-                                   qp->s_max_rd_atomic) {
-                                       qp->s_flags |= RVT_S_WAIT_RDMAR;
-                                       goto bail;
-                               }
-                               qp->s_num_rd_atomic++;
-                               if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
-                                       qp->s_lsn++;
-                       }
-                       if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
-                               qp->s_state = OP(COMPARE_SWAP);
-                               ohdr->u.atomic_eth.swap_data = cpu_to_be64(
-                                       wqe->atomic_wr.swap);
-                               ohdr->u.atomic_eth.compare_data = cpu_to_be64(
-                                       wqe->atomic_wr.compare_add);
-                       } else {
-                               qp->s_state = OP(FETCH_ADD);
-                               ohdr->u.atomic_eth.swap_data = cpu_to_be64(
-                                       wqe->atomic_wr.compare_add);
-                               ohdr->u.atomic_eth.compare_data = 0;
-                       }
-                       ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
-                               wqe->atomic_wr.remote_addr >> 32);
-                       ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
-                               wqe->atomic_wr.remote_addr);
-                       ohdr->u.atomic_eth.rkey = cpu_to_be32(
-                               wqe->atomic_wr.rkey);
-                       hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
-                       ss = NULL;
-                       len = 0;
-                       bth2 |= IB_BTH_REQ_ACK;
-                       if (++qp->s_cur == qp->s_size)
-                               qp->s_cur = 0;
-                       break;
-
-               default:
-                       goto bail;
-               }
-               qp->s_sge.sge = wqe->sg_list[0];
-               qp->s_sge.sg_list = wqe->sg_list + 1;
-               qp->s_sge.num_sge = wqe->wr.num_sge;
-               qp->s_sge.total_len = wqe->length;
-               qp->s_len = wqe->length;
-               if (newreq) {
-                       qp->s_tail++;
-                       if (qp->s_tail >= qp->s_size)
-                               qp->s_tail = 0;
-               }
-               if (wqe->wr.opcode == IB_WR_RDMA_READ)
-                       qp->s_psn = wqe->lpsn + 1;
-               else
-                       qp->s_psn++;
-               break;
-
-       case OP(RDMA_READ_RESPONSE_FIRST):
-               /*
-                * qp->s_state is normally set to the opcode of the
-                * last packet constructed for new requests and therefore
-                * is never set to RDMA read response.
-                * RDMA_READ_RESPONSE_FIRST is used by the ACK processing
-                * thread to indicate a SEND needs to be restarted from an
-                * earlier PSN without interfering with the sending thread.
-                * See restart_rc().
-                */
-               qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
-               /* FALLTHROUGH */
-       case OP(SEND_FIRST):
-               qp->s_state = OP(SEND_MIDDLE);
-               /* FALLTHROUGH */
-       case OP(SEND_MIDDLE):
-               bth2 = mask_psn(qp->s_psn++);
-               ss = &qp->s_sge;
-               len = qp->s_len;
-               if (len > pmtu) {
-                       len = pmtu;
-                       middle = HFI1_CAP_IS_KSET(SDMA_AHG);
-                       break;
-               }
-               if (wqe->wr.opcode == IB_WR_SEND) {
-                       qp->s_state = OP(SEND_LAST);
-               } else {
-                       qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
-                       /* Immediate data comes after the BTH */
-                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
-                       hwords += 1;
-               }
-               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                       bth0 |= IB_BTH_SOLICITED;
-               bth2 |= IB_BTH_REQ_ACK;
-               qp->s_cur++;
-               if (qp->s_cur >= qp->s_size)
-                       qp->s_cur = 0;
-               break;
-
-       case OP(RDMA_READ_RESPONSE_LAST):
-               /*
-                * qp->s_state is normally set to the opcode of the
-                * last packet constructed for new requests and therefore
-                * is never set to RDMA read response.
-                * RDMA_READ_RESPONSE_LAST is used by the ACK processing
-                * thread to indicate a RDMA write needs to be restarted from
-                * an earlier PSN without interfering with the sending thread.
-                * See restart_rc().
-                */
-               qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
-               /* FALLTHROUGH */
-       case OP(RDMA_WRITE_FIRST):
-               qp->s_state = OP(RDMA_WRITE_MIDDLE);
-               /* FALLTHROUGH */
-       case OP(RDMA_WRITE_MIDDLE):
-               bth2 = mask_psn(qp->s_psn++);
-               ss = &qp->s_sge;
-               len = qp->s_len;
-               if (len > pmtu) {
-                       len = pmtu;
-                       middle = HFI1_CAP_IS_KSET(SDMA_AHG);
-                       break;
-               }
-               if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
-                       qp->s_state = OP(RDMA_WRITE_LAST);
-               } else {
-                       qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
-                       /* Immediate data comes after the BTH */
-                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
-                       hwords += 1;
-                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                               bth0 |= IB_BTH_SOLICITED;
-               }
-               bth2 |= IB_BTH_REQ_ACK;
-               qp->s_cur++;
-               if (qp->s_cur >= qp->s_size)
-                       qp->s_cur = 0;
-               break;
-
-       case OP(RDMA_READ_RESPONSE_MIDDLE):
-               /*
-                * qp->s_state is normally set to the opcode of the
-                * last packet constructed for new requests and therefore
-                * is never set to RDMA read response.
-                * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing
-                * thread to indicate a RDMA read needs to be restarted from
-                * an earlier PSN without interfering with the sending thread.
-                * See restart_rc().
-                *
-                * len is used below as a byte offset into the request
-                * (PSN distance from wqe->psn times pmtu) to adjust the
-                * RETH address and length; it is zeroed again before the
-                * packet is built.
-                */
-               len = (delta_psn(qp->s_psn, wqe->psn)) * pmtu;
-               ohdr->u.rc.reth.vaddr =
-                       cpu_to_be64(wqe->rdma_wr.remote_addr + len);
-               ohdr->u.rc.reth.rkey =
-                       cpu_to_be32(wqe->rdma_wr.rkey);
-               ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);
-               qp->s_state = OP(RDMA_READ_REQUEST);
-               hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
-               bth2 = mask_psn(qp->s_psn) | IB_BTH_REQ_ACK;
-               qp->s_psn = wqe->lpsn + 1;
-               ss = NULL;
-               len = 0;
-               qp->s_cur++;
-               if (qp->s_cur == qp->s_size)
-                       qp->s_cur = 0;
-               break;
-       }
-       qp->s_sending_hpsn = bth2;
-       delta = delta_psn(bth2, wqe->psn);
-       if (delta && delta % HFI1_PSN_CREDIT == 0) /* periodic ACK request */
-               bth2 |= IB_BTH_REQ_ACK;
-       if (qp->s_flags & RVT_S_SEND_ONE) {
-               qp->s_flags &= ~RVT_S_SEND_ONE;
-               qp->s_flags |= RVT_S_WAIT_ACK;
-               bth2 |= IB_BTH_REQ_ACK;
-       }
-       qp->s_len -= len;
-       qp->s_hdrwords = hwords;
-       ps->s_txreq->sde = priv->s_sde;
-       qp->s_cur_sge = ss;
-       qp->s_cur_size = len;
-       hfi1_make_ruc_header(
-               qp,
-               ohdr,
-               bth0 | (qp->s_state << 24),
-               bth2,
-               middle,
-               ps);
-       /* pbc */
-       ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
-       return 1;
-
-done_free_tx:
-       hfi1_put_txreq(ps->s_txreq);
-       ps->s_txreq = NULL;
-       return 1;
-
-bail:
-       hfi1_put_txreq(ps->s_txreq);
-
-bail_no_tx:
-       ps->s_txreq = NULL;
-       qp->s_flags &= ~RVT_S_BUSY;
-       qp->s_hdrwords = 0;
-       return 0;
-}
-
-/**
- * hfi1_send_rc_ack - Construct an ACK packet and send it
- * @rcd: receive context; its send context (rcd->sc) provides the PIO buffer
- * @qp: a pointer to the QP
- * @is_fecn: non-zero sets the BECN bit in BTH[1] of the ACK, or
- *           RVT_S_ECN on the QP when the ACK is queued instead
- *
- * This is called from hfi1_rc_rcv() and handle_receive_interrupt().
- * Note that RDMA reads and atomics are handled in the
- * send side QP state and tasklet.
- *
- * The ACK is handed to the send tasklet (queue_ack) rather than sent
- * directly when RVT_S_RESP_PENDING is set, when s_rdma_ack_cnt is
- * non-zero, or when no PIO buffer can be allocated.  If the link is not
- * ACTIVE the function returns without sending or queuing anything.
- */
-void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp,
-                     int is_fecn)
-{
-       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       u64 pbc, pbc_flags = 0;
-       u16 lrh0;
-       u16 sc5;
-       u32 bth0;
-       u32 hwords;
-       u32 vl, plen;
-       struct send_context *sc;
-       struct pio_buf *pbuf;
-       struct hfi1_ib_header hdr;
-       struct hfi1_other_headers *ohdr;
-       unsigned long flags;
-
-       /* Don't send ACK or NAK if a RDMA read or atomic is pending. */
-       if (qp->s_flags & RVT_S_RESP_PENDING)
-               goto queue_ack;
-
-       /* Ensure s_rdma_ack_cnt changes are committed */
-       smp_read_barrier_depends();
-       if (qp->s_rdma_ack_cnt)
-               goto queue_ack;
-
-       /* Construct the header */
-       /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4 */
-       hwords = 6;
-       if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
-               hwords += hfi1_make_grh(ibp, &hdr.u.l.grh,
-                                      &qp->remote_ah_attr.grh, hwords, 0);
-               ohdr = &hdr.u.l.oth;
-               lrh0 = HFI1_LRH_GRH;
-       } else {
-               ohdr = &hdr.u.oth;
-               lrh0 = HFI1_LRH_BTH;
-       }
-       /* read pkey_index w/o lock (it's atomic) */
-       bth0 = hfi1_get_pkey(ibp, qp->s_pkey_index) | (OP(ACKNOWLEDGE) << 24);
-       if (qp->s_mig_state == IB_MIG_MIGRATED)
-               bth0 |= IB_BTH_MIG_REQ;
-       if (qp->r_nak_state)
-               ohdr->u.aeth = cpu_to_be32((qp->r_msn & HFI1_MSN_MASK) |
-                                           (qp->r_nak_state <<
-                                            HFI1_AETH_CREDIT_SHIFT));
-       else
-               ohdr->u.aeth = hfi1_compute_aeth(qp);
-       sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
-       /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
-       pbc_flags |= ((!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT);
-       lrh0 |= (sc5 & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4;
-       hdr.lrh[0] = cpu_to_be16(lrh0);
-       hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
-       hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
-       hdr.lrh[3] = cpu_to_be16(ppd->lid | qp->remote_ah_attr.src_path_bits);
-       ohdr->bth[0] = cpu_to_be32(bth0);
-       ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
-       ohdr->bth[1] |= cpu_to_be32((!!is_fecn) << HFI1_BECN_SHIFT);
-       ohdr->bth[2] = cpu_to_be32(mask_psn(qp->r_ack_psn));
-
-       /* Don't try to send ACKs if the link isn't ACTIVE */
-       if (driver_lstate(ppd) != IB_PORT_ACTIVE)
-               return;
-
-       sc = rcd->sc;
-       plen = 2 /* PBC */ + hwords;
-       vl = sc_to_vlt(ppd->dd, sc5);
-       pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
-
-       pbuf = sc_buffer_alloc(sc, plen, NULL, NULL);
-       if (!pbuf) {
-               /*
-                * We have no room to send at the moment.  Pass
-                * responsibility for sending the ACK to the send tasklet
-                * so that when enough buffer space becomes available,
-                * the ACK is sent ahead of other outgoing packets.
-                */
-               goto queue_ack;
-       }
-
-       trace_ack_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &hdr);
-
-       /* write the pbc and data */
-       ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, &hdr, hwords);
-
-       return;
-
-queue_ack:
-       this_cpu_inc(*ibp->rvp.rc_qacks);
-       spin_lock_irqsave(&qp->s_lock, flags);
-       qp->s_flags |= RVT_S_ACK_PENDING | RVT_S_RESP_PENDING;
-       qp->s_nak_state = qp->r_nak_state; /* copy receive-side ACK state */
-       qp->s_ack_psn = qp->r_ack_psn;
-       if (is_fecn)
-               qp->s_flags |= RVT_S_ECN;
-
-       /* Schedule the send tasklet. */
-       hfi1_schedule_send(qp);
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-}
-
-/**
- * reset_psn - reset the QP state to send starting from PSN
- * @qp: the QP
- * @psn: the packet sequence number to restart at
- *
- * This is called from hfi1_rc_rcv() to process an incoming RC ACK
- * for the given QP.
- * Called at interrupt level with the QP s_lock held.
- *
- * s_cur is moved to the last WQE, searching from s_acked and stopping
- * before s_tail, whose first PSN is not after @psn.  When @psn is the
- * first PSN of that WQE (or precedes the WQE at s_acked), s_state is set
- * to SEND_LAST; otherwise one of the RDMA_READ_RESPONSE_* opcodes is
- * stored in s_state as a restart marker, chosen by the WQE opcode.
- * RVT_S_AHG_VALID is always cleared on exit.
- */
-static void reset_psn(struct rvt_qp *qp, u32 psn)
-{
-       u32 n = qp->s_acked;
-       struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n);
-       u32 opcode;
-
-       qp->s_cur = n;
-
-       /*
-        * If we are starting the request from the beginning,
-        * let the normal send code handle initialization.
-        */
-       if (cmp_psn(psn, wqe->psn) <= 0) {
-               qp->s_state = OP(SEND_LAST);
-               goto done;
-       }
-
-       /* Find the work request opcode corresponding to the given PSN. */
-       opcode = wqe->wr.opcode;
-       for (;;) {
-               int diff;
-
-               if (++n == qp->s_size)
-                       n = 0;
-               if (n == qp->s_tail)
-                       break;
-               wqe = rvt_get_swqe_ptr(qp, n);
-               diff = cmp_psn(psn, wqe->psn);
-               if (diff < 0) /* psn belongs to the previous WQE */
-                       break;
-               qp->s_cur = n;
-               /*
-                * If we are starting the request from the beginning,
-                * let the normal send code handle initialization.
-                */
-               if (diff == 0) {
-                       qp->s_state = OP(SEND_LAST);
-                       goto done;
-               }
-               opcode = wqe->wr.opcode;
-       }
-
-       /*
-        * Set the state to restart in the middle of a request.
-        * Don't change the s_sge, s_cur_sge, or s_cur_size.
-        * See hfi1_make_rc_req().
-        */
-       switch (opcode) {
-       case IB_WR_SEND:
-       case IB_WR_SEND_WITH_IMM:
-               qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
-               break;
-
-       case IB_WR_RDMA_WRITE:
-       case IB_WR_RDMA_WRITE_WITH_IMM:
-               qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
-               break;
-
-       case IB_WR_RDMA_READ:
-               qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
-               break;
-
-       default:
-               /*
-                * This case shouldn't happen since it's only
-                * one PSN per req.
-                */
-               qp->s_state = OP(SEND_LAST);
-       }
-done:
-       qp->s_psn = psn;
-       /*
-        * Set RVT_S_WAIT_PSN as rc_complete() may start the timer
-        * asynchronously before the send tasklet can get scheduled.
-        * Doing it in hfi1_make_rc_req() is too late.
-        */
-       if ((cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) &&
-           (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
-               qp->s_flags |= RVT_S_WAIT_PSN;
-       qp->s_flags &= ~RVT_S_AHG_VALID;
-}
-
-/*
- * Back up requester to resend the last un-ACKed request.
- * The QP r_lock and s_lock should be held and interrupts disabled.
- *
- * @qp: the QP to restart
- * @psn: PSN to restart from (passed on to reset_psn())
- * @wait: when non-zero, RVT_S_SEND_ONE is set before the restart
- *
- * Each call uses up one retry (s_retry).  With none left: an armed
- * migration (IB_MIG_ARMED) is performed and s_retry is reloaded from
- * s_retry_cnt; otherwise, if s_last == s_acked, the WQE at s_acked is
- * completed with IB_WC_RETRY_EXC_ERR and rvt_error_qp() is called; in
- * the remaining case the function returns without restarting.
- */
-static void restart_rc(struct rvt_qp *qp, u32 psn, int wait)
-{
-       struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
-       struct hfi1_ibport *ibp;
-
-       if (qp->s_retry == 0) {
-               if (qp->s_mig_state == IB_MIG_ARMED) {
-                       hfi1_migrate_qp(qp);
-                       qp->s_retry = qp->s_retry_cnt;
-               } else if (qp->s_last == qp->s_acked) {
-                       hfi1_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
-                       rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
-                       return;
-               } else { /* need to handle delayed completion */
-                       return;
-               }
-       } else {
-               qp->s_retry--;
-       }
-
-       ibp = to_iport(qp->ibqp.device, qp->port_num);
-       if (wqe->wr.opcode == IB_WR_RDMA_READ)
-               ibp->rvp.n_rc_resends++; /* a read counts as one resend */
-       else
-               ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
-
-       qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR |
-                        RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN |
-                        RVT_S_WAIT_ACK);
-       if (wait)
-               qp->s_flags |= RVT_S_SEND_ONE;
-       reset_psn(qp, psn);
-}
-
-/*
- * This is called from s_timer for missing responses.
- *
- * @arg is the QP pointer cast to unsigned long.  r_lock is taken first
- * (with interrupts disabled), then s_lock.  Nothing is done unless
- * RVT_S_TIMER is still set; otherwise the flag is cleared, the timer is
- * deleted, the requester is restarted from s_last_psn + 1 with wait = 1,
- * and hfi1_schedule_send() is called.
- */
-void hfi1_rc_timeout(unsigned long arg)
-{
-       struct rvt_qp *qp = (struct rvt_qp *)arg;
-       struct hfi1_ibport *ibp;
-       unsigned long flags;
-
-       spin_lock_irqsave(&qp->r_lock, flags);
-       spin_lock(&qp->s_lock);
-       if (qp->s_flags & RVT_S_TIMER) {
-               ibp = to_iport(qp->ibqp.device, qp->port_num);
-               ibp->rvp.n_rc_timeouts++;
-               qp->s_flags &= ~RVT_S_TIMER;
-               del_timer(&qp->s_timer);
-               trace_hfi1_rc_timeout(qp, qp->s_last_psn + 1);
-               restart_rc(qp, qp->s_last_psn + 1, 1);
-               hfi1_schedule_send(qp);
-       }
-       spin_unlock(&qp->s_lock);
-       spin_unlock_irqrestore(&qp->r_lock, flags);
-}
-
-/*
- * This is called from s_timer for RNR timeouts.
- *
- * @arg is the QP pointer cast to unsigned long.  Under s_lock (taken
- * with interrupts disabled) the RNR timer is stopped through
- * hfi1_stop_rnr_timer() and hfi1_schedule_send() is called.
- */
-void hfi1_rc_rnr_retry(unsigned long arg)
-{
-       struct rvt_qp *qp = (struct rvt_qp *)arg;
-       unsigned long flags;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-       hfi1_stop_rnr_timer(qp);
-       hfi1_schedule_send(qp);
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-}
-
-/*
- * Set qp->s_sending_psn to the next PSN after the given one.
- * This would be psn+1 except when RDMA reads are present.
- *
- * The WQE at s_last is always examined; the scan then advances (with
- * wrap at s_size) until s_tail is reached.  The first WQE for which
- * cmp_psn(psn, wqe->lpsn) <= 0 decides the result: lpsn + 1 for an RDMA
- * read, psn + 1 otherwise.  If no WQE matches, s_sending_psn is left
- * unchanged.
- */
-static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
-{
-       struct rvt_swqe *wqe;
-       u32 n = qp->s_last;
-
-       /* Find the work request corresponding to the given PSN. */
-       for (;;) {
-               wqe = rvt_get_swqe_ptr(qp, n);
-               if (cmp_psn(psn, wqe->lpsn) <= 0) {
-                       if (wqe->wr.opcode == IB_WR_RDMA_READ)
-                               qp->s_sending_psn = wqe->lpsn + 1;
-                       else
-                               qp->s_sending_psn = psn + 1;
-                       break;
-               }
-               if (++n == qp->s_size)
-                       n = 0;
-               if (n == qp->s_tail)
-                       break;
-       }
-}
-
-/*
- * This should be called with the QP s_lock held and interrupts disabled.
- *
- * @qp: the QP
- * @hdr: packet header; the opcode and PSN are read from its BTH
- *       (the name suggests a packet whose send just completed - confirm
- *       with the callers)
- *
- * Nothing is done unless the QP state allows RVT_PROCESS_OR_FLUSH_SEND.
- * Packets whose opcode lies in RDMA_READ_RESPONSE_FIRST..ATOMIC_ACKNOWLEDGE
- * only decrement s_rdma_ack_cnt.  For anything else s_sending_psn is
- * updated through reset_sending_psn(), the retry timer may be started,
- * WQEs from s_last up to s_acked that are no longer being sent are
- * completed, and a pending RVT_S_WAIT_PSN is cleared once s_sending_psn
- * has passed s_sending_hpsn.
- */
-void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_ib_header *hdr)
-{
-       struct hfi1_other_headers *ohdr;
-       struct rvt_swqe *wqe;
-       struct ib_wc wc;
-       unsigned i;
-       u32 opcode;
-       u32 psn;
-
-       if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND))
-               return;
-
-       /* Find out where the BTH is */
-       if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH)
-               ohdr = &hdr->u.oth;
-       else
-               ohdr = &hdr->u.l.oth;
-
-       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
-       if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
-           opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
-               WARN_ON(!qp->s_rdma_ack_cnt);
-               qp->s_rdma_ack_cnt--;
-               return;
-       }
-
-       psn = be32_to_cpu(ohdr->bth[2]); /* unmasked: REQ_ACK tested below */
-       reset_sending_psn(qp, psn);
-
-       /*
-        * Start timer after a packet requesting an ACK has been sent and
-        * there are still requests that haven't been acked.
-        */
-       if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
-           !(qp->s_flags &
-               (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
-               (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
-               hfi1_add_retry_timer(qp);
-
-       while (qp->s_last != qp->s_acked) {
-               u32 s_last;
-
-               wqe = rvt_get_swqe_ptr(qp, qp->s_last);
-               if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 &&
-                   cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
-                       break;
-               s_last = qp->s_last;
-               if (++s_last >= qp->s_size)
-                       s_last = 0;
-               qp->s_last = s_last;
-               /* see post_send() */
-               barrier();
-               for (i = 0; i < wqe->wr.num_sge; i++) {
-                       struct rvt_sge *sge = &wqe->sg_list[i];
-
-                       rvt_put_mr(sge->mr);
-               }
-               /* Post a send completion queue entry if requested. */
-               if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
-                   (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
-                       memset(&wc, 0, sizeof(wc));
-                       wc.wr_id = wqe->wr.wr_id;
-                       wc.status = IB_WC_SUCCESS;
-                       wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
-                       wc.byte_len = wqe->length;
-                       wc.qp = &qp->ibqp;
-                       rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, 0);
-               }
-       }
-       /*
-        * If we were waiting for sends to complete before re-sending,
-        * and they are now complete, restart sending.
-        */
-       trace_hfi1_rc_sendcomplete(qp, psn);
-       if (qp->s_flags & RVT_S_WAIT_PSN &&
-           cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
-               qp->s_flags &= ~RVT_S_WAIT_PSN;
-               qp->s_sending_psn = qp->s_psn;
-               qp->s_sending_hpsn = qp->s_psn - 1;
-               hfi1_schedule_send(qp);
-       }
-}
-
-static inline void update_last_psn(struct rvt_qp *qp, u32 psn)
-{
-       qp->s_last_psn = psn; /* the only QP state this helper writes */
-}
-
-/*
- * Generate a SWQE completion.
- * This is similar to hfi1_send_complete but has to check to be sure
- * that the SGEs are not being referenced if the SWQE is being resent.
- *
- * @qp: the QP whose send queue entry is being completed
- * @wqe: the send work queue entry to complete
- * @ibp: the port; used for the rc_delayed_comp counter and the SL to SC
- *       lookup when the completion has to be delayed
- *
- * In every case s_retry is reloaded from s_retry_cnt, s_last_psn is set
- * to wqe->lpsn and s_acked is advanced by one entry (together with s_cur
- * when the two were equal on entry).
- *
- * Returns the SWQE at the new s_acked index.
- */
-static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
-                                        struct rvt_swqe *wqe,
-                                        struct hfi1_ibport *ibp)
-{
-       struct ib_wc wc;
-       unsigned i;
-
-       /*
-        * Don't decrement refcount and don't generate a
-        * completion if the SWQE is being resent until the send
-        * is finished.
-        * The completion is generated here only when wqe->lpsn compares
-        * below s_sending_psn or s_sending_psn compares above
-        * s_sending_hpsn; otherwise it is counted as delayed below.
-        */
-       if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 ||
-           cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
-               u32 s_last;
-
-               for (i = 0; i < wqe->wr.num_sge; i++) {
-                       struct rvt_sge *sge = &wqe->sg_list[i];
-
-                       rvt_put_mr(sge->mr);
-               }
-               s_last = qp->s_last;
-               if (++s_last >= qp->s_size)
-                       s_last = 0;
-               qp->s_last = s_last;
-               /* see post_send() */
-               barrier();
-               /* Post a send completion queue entry if requested. */
-               if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
-                   (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
-                       memset(&wc, 0, sizeof(wc));
-                       wc.wr_id = wqe->wr.wr_id;
-                       wc.status = IB_WC_SUCCESS;
-                       wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
-                       wc.byte_len = wqe->length;
-                       wc.qp = &qp->ibqp;
-                       rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, 0);
-               }
-       } else {
-               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-
-               this_cpu_inc(*ibp->rvp.rc_delayed_comp);
-               /*
-                * If send progress not running attempt to progress
-                * SDMA queue.
-                */
-               if (ppd->dd->flags & HFI1_HAS_SEND_DMA) {
-                       struct sdma_engine *engine;
-                       u8 sc5;
-
-                       /* For now use sc to find engine */
-                       sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
-                       engine = qp_to_sdma_engine(qp, sc5);
-                       sdma_engine_progress_schedule(engine);
-               }
-       }
-
-       qp->s_retry = qp->s_retry_cnt;
-       update_last_psn(qp, wqe->lpsn);
-
-       /*
-        * If we are completing a request which is in the process of
-        * being resent, we can stop re-sending it since we know the
-        * responder has already seen it.
-        */
-       if (qp->s_acked == qp->s_cur) {
-               if (++qp->s_cur >= qp->s_size)
-                       qp->s_cur = 0;
-               qp->s_acked = qp->s_cur;
-               wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
-               if (qp->s_acked != qp->s_tail) {
-                       qp->s_state = OP(SEND_LAST);
-                       qp->s_psn = wqe->psn;
-               }
-       } else {
-               if (++qp->s_acked >= qp->s_size)
-                       qp->s_acked = 0;
-               if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)
-                       qp->s_draining = 0;
-               wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
-       }
-       return wqe;
-}
-
-/**
- * do_rc_ack - process an incoming RC ACK
- * @qp: the QP the ACK came in on
- * @aeth: the AETH of the response; aeth >> 29 selects ACK (0), RNR NAK (1)
- *        or NAK (3), and the HFI1_AETH_CREDIT_SHIFT/MASK field supplies the
- *        RNR timer table index or the NAK code for the latter two
- * @psn: the packet sequence number of the ACK
- * @opcode: the opcode of the response packet carrying the ACK
- * @val: the value written to the first SGE of an atomic request that this
- *       ACK completes
- * @rcd: the receive context whose qp_wait_list the QP is queued on when an
- *       RDMA read or atomic request has to be restarted
- *
- * This is called from rc_rcv_resp() to process an incoming RC ACK
- * for the given QP.
- * May be called at interrupt level, with the QP s_lock held.
- * Returns 1 if OK, 0 if current operation should be aborted (NAK).
- * Specifically: a plain ACK returns 1, an accepted RNR NAK returns 0, and
- * every path through bail_stop returns 0 except the
- * RDMA_READ_RESPONSE_ONLY special case, which returns 1.
- */
-static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
-                    u64 val, struct hfi1_ctxtdata *rcd)
-{
-       struct hfi1_ibport *ibp;
-       enum ib_wc_status status;
-       struct rvt_swqe *wqe;
-       int ret = 0;
-       u32 ack_psn;
-       int diff;
-       unsigned long to;
-
-       /*
-        * Note that NAKs implicitly ACK outstanding SEND and RDMA write
-        * requests and implicitly NAK RDMA read and atomic requests issued
-        * before the NAK'ed request.  The MSN won't include the NAK'ed
-        * request but will include an ACK'ed request(s).
-        */
-       ack_psn = psn;
-       if (aeth >> 29)
-               ack_psn--;
-       wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
-       ibp = to_iport(qp->ibqp.device, qp->port_num);
-
-       /*
-        * The MSN might be for a later WQE than the PSN indicates so
-        * only complete WQEs that the PSN finishes.
-        */
-       while ((diff = delta_psn(ack_psn, wqe->lpsn)) >= 0) {
-               /*
-                * RDMA_READ_RESPONSE_ONLY is a special case since
-                * we want to generate completion events for everything
-                * before the RDMA read, copy the data, then generate
-                * the completion for the read.
-                */
-               if (wqe->wr.opcode == IB_WR_RDMA_READ &&
-                   opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
-                   diff == 0) {
-                       ret = 1;
-                       goto bail_stop;
-               }
-               /*
-                * If this request is a RDMA read or atomic, and the ACK is
-                * for a later operation, this ACK NAKs the RDMA read or
-                * atomic.  In other words, only a RDMA_READ_LAST or ONLY
-                * can ACK a RDMA read and likewise for atomic ops.  Note
-                * that the NAK case can only happen if relaxed ordering is
-                * used and requests are sent after an RDMA read or atomic
-                * is sent but before the response is received.
-                */
-               if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
-                    (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
-                   ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
-                     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
-                    (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
-                       /* Retry this request. */
-                       if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
-                               qp->r_flags |= RVT_R_RDMAR_SEQ;
-                               restart_rc(qp, qp->s_last_psn + 1, 0);
-                               if (list_empty(&qp->rspwait)) {
-                                       qp->r_flags |= RVT_R_RSP_SEND;
-                                       atomic_inc(&qp->refcount);
-                                       list_add_tail(&qp->rspwait,
-                                                     &rcd->qp_wait_list);
-                               }
-                       }
-                       /*
-                        * No need to process the ACK/NAK since we are
-                        * restarting an earlier request.
-                        */
-                       goto bail_stop;
-               }
-               if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
-                   wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
-                       u64 *vaddr = wqe->sg_list[0].vaddr;
-                       *vaddr = val;
-               }
-               if (qp->s_num_rd_atomic &&
-                   (wqe->wr.opcode == IB_WR_RDMA_READ ||
-                    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
-                    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
-                       qp->s_num_rd_atomic--;
-                       /* Restart sending task if fence is complete */
-                       if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
-                           !qp->s_num_rd_atomic) {
-                               qp->s_flags &= ~(RVT_S_WAIT_FENCE |
-                                                RVT_S_WAIT_ACK);
-                               hfi1_schedule_send(qp);
-                       } else if (qp->s_flags & RVT_S_WAIT_RDMAR) {
-                               qp->s_flags &= ~(RVT_S_WAIT_RDMAR |
-                                                RVT_S_WAIT_ACK);
-                               hfi1_schedule_send(qp);
-                       }
-               }
-               wqe = do_rc_completion(qp, wqe, ibp);
-               if (qp->s_acked == qp->s_tail)
-                       break;
-       }
-
-       switch (aeth >> 29) {
-       case 0:         /* ACK */
-               this_cpu_inc(*ibp->rvp.rc_acks);
-               if (qp->s_acked != qp->s_tail) {
-                       /*
-                        * We are expecting more ACKs so
-                        * mod the retry timer.
-                        */
-                       hfi1_mod_retry_timer(qp);
-                       /*
-                        * We can stop re-sending the earlier packets and
-                        * continue with the next packet the receiver wants.
-                        */
-                       if (cmp_psn(qp->s_psn, psn) <= 0)
-                               reset_psn(qp, psn + 1);
-               } else {
-                       /* No more acks - kill all timers */
-                       hfi1_stop_rc_timers(qp);
-                       if (cmp_psn(qp->s_psn, psn) <= 0) {
-                               qp->s_state = OP(SEND_LAST);
-                               qp->s_psn = psn + 1;
-                       }
-               }
-               if (qp->s_flags & RVT_S_WAIT_ACK) {
-                       qp->s_flags &= ~RVT_S_WAIT_ACK;
-                       hfi1_schedule_send(qp);
-               }
-               hfi1_get_credit(qp, aeth);
-               qp->s_rnr_retry = qp->s_rnr_retry_cnt;
-               qp->s_retry = qp->s_retry_cnt;
-               update_last_psn(qp, psn);
-               return 1;
-
-       case 1:         /* RNR NAK */
-               ibp->rvp.n_rnr_naks++;
-               if (qp->s_acked == qp->s_tail)
-                       goto bail_stop;
-               if (qp->s_flags & RVT_S_WAIT_RNR)
-                       goto bail_stop;
-               if (qp->s_rnr_retry == 0) {
-                       status = IB_WC_RNR_RETRY_EXC_ERR;
-                       goto class_b;
-               }
-               if (qp->s_rnr_retry_cnt < 7) /* NOTE(review): 7 presumably means retry forever - confirm */
-                       qp->s_rnr_retry--;
-
-               /* The last valid PSN is the previous PSN. */
-               update_last_psn(qp, psn - 1);
-
-               ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
-
-               reset_psn(qp, psn);
-
-               qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK);
-               hfi1_stop_rc_timers(qp);
-               to =
-                       ib_hfi1_rnr_table[(aeth >> HFI1_AETH_CREDIT_SHIFT) &
-                                          HFI1_AETH_CREDIT_MASK];
-               hfi1_add_rnr_timer(qp, to);
-               return 0;
-
-       case 3:         /* NAK */
-               if (qp->s_acked == qp->s_tail)
-                       goto bail_stop;
-               /* The last valid PSN is the previous PSN. */
-               update_last_psn(qp, psn - 1);
-               switch ((aeth >> HFI1_AETH_CREDIT_SHIFT) &
-                       HFI1_AETH_CREDIT_MASK) {
-               case 0: /* PSN sequence error */
-                       ibp->rvp.n_seq_naks++;
-                       /*
-                        * Back up to the responder's expected PSN.
-                        * Note that we might get a NAK in the middle of an
-                        * RDMA READ response which terminates the RDMA
-                        * READ.
-                        */
-                       restart_rc(qp, psn, 0);
-                       hfi1_schedule_send(qp);
-                       break;
-
-               case 1: /* Invalid Request */
-                       status = IB_WC_REM_INV_REQ_ERR;
-                       ibp->rvp.n_other_naks++;
-                       goto class_b;
-
-               case 2: /* Remote Access Error */
-                       status = IB_WC_REM_ACCESS_ERR;
-                       ibp->rvp.n_other_naks++;
-                       goto class_b;
-
-               case 3: /* Remote Operation Error */
-                       status = IB_WC_REM_OP_ERR;
-                       ibp->rvp.n_other_naks++;
-class_b:               /* also entered from the RNR NAK case when s_rnr_retry is 0 */
-                       if (qp->s_last == qp->s_acked) {
-                               hfi1_send_complete(qp, wqe, status);
-                               rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
-                       }
-                       break;
-
-               default:
-                       /* Ignore other reserved NAK error codes */
-                       goto reserved;
-               }
-               qp->s_retry = qp->s_retry_cnt;
-               qp->s_rnr_retry = qp->s_rnr_retry_cnt;
-               goto bail_stop;
-
-       default:                /* 2: reserved */
-reserved:
-               /* Ignore reserved NAK codes. */
-               goto bail_stop;
-       }
-       /* cannot be reached  */
-bail_stop:
-       hfi1_stop_rc_timers(qp);
-       return ret;
-}
-
-/*
- * Handle an out of sequence RDMA read middle or last response packet.
- *
- * Starting at s_acked, every SWQE whose last PSN compares below @psn is
- * completed through do_rc_completion() until the first RDMA read or
- * atomic SWQE is met; that is, SENDs and RDMA writes ahead of it are
- * treated as ACKed.  The send side is then restarted at s_last_psn + 1
- * and the QP is queued on @rcd's wait list unless it is queued already.
- */
-static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn,
-                        struct hfi1_ctxtdata *rcd)
-{
-       struct rvt_swqe *cur;
-
-       /* Remove QP from retry timer */
-       hfi1_stop_rc_timers(qp);
-
-       for (cur = rvt_get_swqe_ptr(qp, qp->s_acked);
-            cmp_psn(psn, cur->lpsn) > 0;
-            cur = do_rc_completion(qp, cur, ibp)) {
-               /* Reads and atomics are not implicitly ACKed: stop here. */
-               if (cur->wr.opcode == IB_WR_RDMA_READ ||
-                   cur->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
-                   cur->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
-                       break;
-       }
-
-       ibp->rvp.n_rdma_seq++;
-       qp->r_flags |= RVT_R_RDMAR_SEQ;
-       restart_rc(qp, qp->s_last_psn + 1, 0);
-
-       /* Nothing more to do when the QP is already on a wait list. */
-       if (!list_empty(&qp->rspwait))
-               return;
-
-       qp->r_flags |= RVT_R_RSP_SEND;
-       atomic_inc(&qp->refcount);
-       list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
-}
-
-/**
- * rc_rcv_resp - process an incoming RC response packet
- * @ibp: the port this packet came in on
- * @ohdr: the other headers for this packet
- * @data: the packet data
- * @tlen: the packet length
- * @qp: the QP for this packet
- * @opcode: the opcode for this packet
- * @psn: the packet sequence number for this packet
- * @hdrsize: the header length
- * @pmtu: the path MTU
- * @rcd: the receive context, passed on to do_rc_ack() and rdma_seq_err()
- *
- * This is called from hfi1_rc_rcv() to process an incoming RC response
- * packet for the given QP.
- * Called at interrupt level.
- *
- * Takes qp->s_lock with interrupts disabled.  The lock is released on
- * every exit path; for RDMA read first/middle data it is dropped before
- * the payload is copied.
- */
-static void rc_rcv_resp(struct hfi1_ibport *ibp,
-                       struct hfi1_other_headers *ohdr,
-                       void *data, u32 tlen, struct rvt_qp *qp,
-                       u32 opcode, u32 psn, u32 hdrsize, u32 pmtu,
-                       struct hfi1_ctxtdata *rcd)
-{
-       struct rvt_swqe *wqe;
-       enum ib_wc_status status;
-       unsigned long flags;
-       int diff;
-       u32 pad;
-       u32 aeth;
-       u64 val;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-
-       trace_hfi1_rc_ack(qp, psn);
-
-       /* Ignore invalid responses. */
-       smp_read_barrier_depends(); /* see post_one_send */
-       if (cmp_psn(psn, ACCESS_ONCE(qp->s_next_psn)) >= 0)
-               goto ack_done;
-
-       /* Ignore duplicate responses. */
-       diff = cmp_psn(psn, qp->s_last_psn);
-       if (unlikely(diff <= 0)) {
-               /* Update credits for "ghost" ACKs */
-               if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
-                       aeth = be32_to_cpu(ohdr->u.aeth);
-                       if ((aeth >> 29) == 0)
-                               hfi1_get_credit(qp, aeth);
-               }
-               goto ack_done;
-       }
-
-       /*
-        * Skip everything other than the PSN we expect, if we are waiting
-        * for a reply to a restarted RDMA read or atomic op.
-        */
-       if (qp->r_flags & RVT_R_RDMAR_SEQ) {
-               if (cmp_psn(psn, qp->s_last_psn + 1) != 0)
-                       goto ack_done;
-               qp->r_flags &= ~RVT_R_RDMAR_SEQ;
-       }
-
-       if (unlikely(qp->s_acked == qp->s_tail))
-               goto ack_done;
-       wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
-       status = IB_WC_SUCCESS;
-
-       switch (opcode) {
-       case OP(ACKNOWLEDGE):
-       case OP(ATOMIC_ACKNOWLEDGE):
-       case OP(RDMA_READ_RESPONSE_FIRST):
-               aeth = be32_to_cpu(ohdr->u.aeth);
-               if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
-                       __be32 *p = ohdr->u.at.atomic_ack_eth;
-
-                       val = ((u64)be32_to_cpu(p[0]) << 32) |
-                               be32_to_cpu(p[1]);
-               } else {
-                       val = 0;
-               }
-               if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) ||
-                   opcode != OP(RDMA_READ_RESPONSE_FIRST))
-                       goto ack_done;
-               wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
-               if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
-                       goto ack_op_err;
-               /*
-                * If this is a response to a resent RDMA read, we
-                * have to be careful to copy the data to the right
-                * location.
-                */
-               qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
-                                                 wqe, psn, pmtu);
-               goto read_middle;
-
-       case OP(RDMA_READ_RESPONSE_MIDDLE):
-               /* no AETH, no ACK */
-               if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
-                       goto ack_seq_err;
-               if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
-                       goto ack_op_err;
-read_middle:
-               if (unlikely(tlen != (hdrsize + pmtu + 4)))
-                       goto ack_len_err;
-               if (unlikely(pmtu >= qp->s_rdma_read_len))
-                       goto ack_len_err;
-
-               /*
-                * We got a response so update the timeout.
-                * 4.096 usec. * (1 << qp->timeout)
-                */
-               qp->s_flags |= RVT_S_TIMER;
-               mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies);
-               if (qp->s_flags & RVT_S_WAIT_ACK) {
-                       qp->s_flags &= ~RVT_S_WAIT_ACK;
-                       hfi1_schedule_send(qp);
-               }
-
-               if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
-                       qp->s_retry = qp->s_retry_cnt;
-
-               /*
-                * Update the RDMA receive state but do the copy w/o
-                * holding the locks and blocking interrupts.
-                */
-               qp->s_rdma_read_len -= pmtu;
-               update_last_psn(qp, psn);
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-               hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0, 0);
-               goto bail;
-
-       case OP(RDMA_READ_RESPONSE_ONLY):
-               aeth = be32_to_cpu(ohdr->u.aeth);
-               if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
-                       goto ack_done;
-               /* Get the number of bytes the message was padded by. */
-               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-               /*
-                * Check that the data size is >= 0 && <= pmtu.
-                * Remember to account for ICRC (4).
-                */
-               if (unlikely(tlen < (hdrsize + pad + 4)))
-                       goto ack_len_err;
-               /*
-                * If this is a response to a resent RDMA read, we
-                * have to be careful to copy the data to the right
-                * location.
-                */
-               wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
-               qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
-                                                 wqe, psn, pmtu);
-               goto read_last;
-
-       case OP(RDMA_READ_RESPONSE_LAST):
-               /* ACKs READ req. */
-               if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
-                       goto ack_seq_err;
-               if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
-                       goto ack_op_err;
-               /* Get the number of bytes the message was padded by. */
-               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-               /*
-                * Check that the data size is >= 1 && <= pmtu.
-                * Remember to account for ICRC (4).
-                */
-               if (unlikely(tlen <= (hdrsize + pad + 4)))
-                       goto ack_len_err;
-read_last:
-               tlen -= hdrsize + pad + 4; /* tlen is now the payload length */
-               if (unlikely(tlen != qp->s_rdma_read_len))
-                       goto ack_len_err;
-               aeth = be32_to_cpu(ohdr->u.aeth);
-               hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0, 0);
-               WARN_ON(qp->s_rdma_read_sge.num_sge);
-               (void)do_rc_ack(qp, aeth, psn,
-                                OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
-               goto ack_done;
-       }
-
-ack_op_err: /* also reached when opcode matches none of the cases above */
-       status = IB_WC_LOC_QP_OP_ERR;
-       goto ack_err;
-
-ack_seq_err:
-       rdma_seq_err(qp, ibp, psn, rcd);
-       goto ack_done;
-
-ack_len_err:
-       status = IB_WC_LOC_LEN_ERR;
-ack_err:
-       if (qp->s_last == qp->s_acked) {
-               hfi1_send_complete(qp, wqe, status);
-               rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
-       }
-ack_done:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-bail:
-       return;
-}
-
-/*
- * Queue the QP on @rcd's qp_wait_list with RVT_R_RSP_NAK set, taking a
- * reference on the QP.  Does nothing when the QP's rspwait entry is
- * already linked on a list.
- */
-static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd,
-                                 struct rvt_qp *qp)
-{
-       /* Already queued: nothing to do. */
-       if (!list_empty(&qp->rspwait))
-               return;
-
-       qp->r_flags |= RVT_R_RSP_NAK;
-       atomic_inc(&qp->refcount);
-       list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
-}
-
-/*
- * Clear the private r_adefered field and, when the QP's rspwait entry is
- * linked on a list, unlink it, clear RVT_R_RSP_NAK and decrement the QP
- * refcount, waking qp->wait if the count reaches zero.
- */
-static inline void rc_cancel_ack(struct rvt_qp *qp)
-{
-       struct hfi1_qp_priv *qpriv = qp->priv;
-
-       qpriv->r_adefered = 0;
-
-       if (!list_empty(&qp->rspwait)) {
-               list_del_init(&qp->rspwait);
-               qp->r_flags &= ~RVT_R_RSP_NAK;
-               if (atomic_dec_and_test(&qp->refcount))
-                       wake_up(&qp->wait);
-       }
-}
-
-/**
- * rc_rcv_error - process an incoming duplicate or error RC packet
- * @ohdr: the other headers for this packet
- * @data: the packet data
- * @qp: the QP for this packet
- * @opcode: the opcode for this packet
- * @psn: the packet sequence number for this packet
- * @diff: the difference between the PSN and the expected PSN
- * @rcd: the receive context whose wait list the QP is queued on when a
- *       sequence NAK is deferred
- *
- * This is called from hfi1_rc_rcv() to process an unexpected
- * incoming RC packet for the given QP.
- * Called at interrupt level.
- * A positive @diff is handled as a sequence error without taking
- * qp->s_lock; otherwise the packet is handled as a duplicate request
- * with qp->s_lock held while the ack queue is searched and updated.
- * Return 1 if no more processing is needed; otherwise return 0 to
- * schedule a response to be sent.
- */
-static noinline int rc_rcv_error(struct hfi1_other_headers *ohdr, void *data,
-                                struct rvt_qp *qp, u32 opcode, u32 psn,
-                                int diff, struct hfi1_ctxtdata *rcd)
-{
-       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
-       struct rvt_ack_entry *e;
-       unsigned long flags;
-       u8 i, prev;
-       int old_req;
-
-       trace_hfi1_rc_rcv_error(qp, psn);
-       if (diff > 0) {
-               /*
-                * Packet sequence error.
-                * A NAK will ACK earlier sends and RDMA writes.
-                * Don't queue the NAK if we already sent one.
-                */
-               if (!qp->r_nak_state) {
-                       ibp->rvp.n_rc_seqnak++;
-                       qp->r_nak_state = IB_NAK_PSN_ERROR;
-                       /* Use the expected PSN. */
-                       qp->r_ack_psn = qp->r_psn;
-                       /*
-                        * Wait to send the sequence NAK until all packets
-                        * in the receive queue have been processed.
-                        * Otherwise, we end up propagating congestion.
-                        */
-                       rc_defered_ack(rcd, qp);
-               }
-               goto done;
-       }
-
-       /*
-        * Handle a duplicate request.  Don't re-execute SEND, RDMA
-        * write or atomic op.  Don't NAK errors, just silently drop
-        * the duplicate request.  Note that r_sge, r_len, and
-        * r_rcv_len may be in use so don't modify them.
-        *
-        * We are supposed to ACK the earliest duplicate PSN but we
-        * can coalesce an outstanding duplicate ACK.  We have to
-        * send the earliest so that RDMA reads can be restarted at
-        * the requester's expected PSN.
-        *
-        * First, find where this duplicate PSN falls within the
-        * ACKs previously sent.
-        * old_req is true if there is an older response that is scheduled
-        * to be sent before sending this one.
-        */
-       e = NULL;
-       old_req = 1;
-       ibp->rvp.n_rc_dupreq++;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-
-       for (i = qp->r_head_ack_queue; ; i = prev) {
-               if (i == qp->s_tail_ack_queue)
-                       old_req = 0;
-               if (i)
-                       prev = i - 1;
-               else
-                       prev = HFI1_MAX_RDMA_ATOMIC;
-               if (prev == qp->r_head_ack_queue) {
-                       e = NULL;
-                       break;
-               }
-               e = &qp->s_ack_queue[prev];
-               if (!e->opcode) {
-                       e = NULL;
-                       break;
-               }
-               if (cmp_psn(psn, e->psn) >= 0) {
-                       if (prev == qp->s_tail_ack_queue &&
-                           cmp_psn(psn, e->lpsn) <= 0)
-                               old_req = 0;
-                       break;
-               }
-       }
-       switch (opcode) {
-       case OP(RDMA_READ_REQUEST): {
-               struct ib_reth *reth;
-               u32 offset;
-               u32 len;
-
-               /*
-                * If we didn't find the RDMA read request in the ack queue,
-                * we can ignore this request.
-                */
-               if (!e || e->opcode != OP(RDMA_READ_REQUEST))
-                       goto unlock_done;
-               /* RETH comes after BTH */
-               reth = &ohdr->u.rc.reth;
-               /*
-                * Address range must be a subset of the original
-                * request and start on pmtu boundaries.
-                * We reuse the old ack_queue slot since the requester
-                * should not back up and request an earlier PSN for the
-                * same request.
-                */
-               offset = delta_psn(psn, e->psn) * qp->pmtu;
-               len = be32_to_cpu(reth->length);
-               if (unlikely(offset + len != e->rdma_sge.sge_length))
-                       goto unlock_done;
-               if (e->rdma_sge.mr) {
-                       rvt_put_mr(e->rdma_sge.mr);
-                       e->rdma_sge.mr = NULL;
-               }
-               if (len != 0) {
-                       u32 rkey = be32_to_cpu(reth->rkey);
-                       u64 vaddr = be64_to_cpu(reth->vaddr);
-                       int ok;
-
-                       ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
-                                        IB_ACCESS_REMOTE_READ);
-                       if (unlikely(!ok))
-                               goto unlock_done;
-               } else {
-                       e->rdma_sge.vaddr = NULL;
-                       e->rdma_sge.length = 0;
-                       e->rdma_sge.sge_length = 0;
-               }
-               e->psn = psn;
-               if (old_req)
-                       goto unlock_done;
-               qp->s_tail_ack_queue = prev;
-               break;
-       }
-
-       case OP(COMPARE_SWAP):
-       case OP(FETCH_ADD): {
-               /*
-                * If we didn't find the atomic request in the ack queue
-                * or the send tasklet is already backed up to send an
-                * earlier entry, we can ignore this request.
-                */
-               if (!e || e->opcode != (u8)opcode || old_req)
-                       goto unlock_done;
-               qp->s_tail_ack_queue = prev;
-               break;
-       }
-
-       default:
-               /*
-                * Ignore this operation if it doesn't request an ACK
-                * or an earlier RDMA read or atomic is going to be resent.
-                */
-               if (!(psn & IB_BTH_REQ_ACK) || old_req)
-                       goto unlock_done;
-               /*
-                * Resend the most recent ACK if this request is
-                * after all the previous RDMA reads and atomics.
-                */
-               if (i == qp->r_head_ack_queue) {
-                       spin_unlock_irqrestore(&qp->s_lock, flags);
-                       qp->r_nak_state = 0;
-                       qp->r_ack_psn = qp->r_psn - 1;
-                       goto send_ack;
-               }
-
-               /*
-                * Resend the RDMA read or atomic op which
-                * ACKs this duplicate request.
-                */
-               qp->s_tail_ack_queue = i;
-               break;
-       }
-       qp->s_ack_state = OP(ACKNOWLEDGE);
-       qp->s_flags |= RVT_S_RESP_PENDING;
-       qp->r_nak_state = 0;
-       hfi1_schedule_send(qp);
-
-unlock_done:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-done:
-       return 1;
-
-send_ack: /* s_lock was already released by the only jump to this label */
-       return 0;
-}
-
-/*
- * Call rvt_error_qp() on @qp with status @err while holding qp->s_lock.
- * When it returns non-zero, deliver an IB_EVENT_QP_LAST_WQE_REACHED event
- * to the QP's event handler after the lock has been released.
- */
-void hfi1_rc_error(struct rvt_qp *qp, enum ib_wc_status err)
-{
-       struct ib_event ev;
-       unsigned long flags;
-       int lastwqe;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-       lastwqe = rvt_error_qp(qp, err);
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-
-       if (!lastwqe)
-               return;
-
-       ev.device = qp->ibqp.device;
-       ev.element.qp = &qp->ibqp;
-       ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
-       qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
-}
-
-/*
- * Point s_tail_ack_queue at the slot following @n, wrapping to 0 when the
- * next index would exceed HFI1_MAX_RDMA_ATOMIC, and set s_ack_state to
- * OP(ACKNOWLEDGE).
- */
-static inline void update_ack_queue(struct rvt_qp *qp, unsigned n)
-{
-       unsigned slot = n + 1;
-
-       qp->s_tail_ack_queue = slot > HFI1_MAX_RDMA_ATOMIC ? 0 : slot;
-       qp->s_ack_state = OP(ACKNOWLEDGE);
-}
-
-/*
- * Record an event for service level @sl in the port's cc_events log.
- *
- * Under cc_log_lock this sets the SL's bit in threshold_cong_event_map,
- * bumps threshold_event_counter and fills the slot at cc_log_idx, which
- * then advances and wraps at OPA_CONG_LOG_ELEMS.  Service levels at or
- * above OPA_MAX_SLS are ignored.
- */
-static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid,
-                         u32 lqpn, u32 rqpn, u8 svc_type)
-{
-       struct opa_hfi1_cong_log_event_internal *entry;
-       unsigned long flags;
-
-       if (sl >= OPA_MAX_SLS)
-               return;
-
-       spin_lock_irqsave(&ppd->cc_log_lock, flags);
-
-       /* One bit per SL, eight SLs per map element. */
-       ppd->threshold_cong_event_map[sl / 8] |= 1 << (sl % 8);
-       ppd->threshold_event_counter++;
-
-       /* Claim the current slot, then step the index around the ring. */
-       entry = &ppd->cc_events[ppd->cc_log_idx];
-       if (++ppd->cc_log_idx == OPA_CONG_LOG_ELEMS)
-               ppd->cc_log_idx = 0;
-
-       entry->sl = sl;
-       entry->svc_type = svc_type;
-       entry->rlid = rlid;
-       entry->lqpn = lqpn & RVT_QPN_MASK;
-       entry->rqpn = rqpn & RVT_QPN_MASK;
-       /* keep timestamp in units of 1.024 usec */
-       entry->timestamp = ktime_to_ns(ktime_get()) / 1024;
-
-       spin_unlock_irqrestore(&ppd->cc_log_lock, flags);
-}
-
-/*
- * Apply the congestion-control response for service level @sl:
- * 1) increase CCTI (for this SL), clamped to the table's ccti_limit
- * 2) select IPG (i.e., call set_link_ipg())
- * 3) start the SL's hrtimer if it is not already active
- * and log a congestion event when the resulting CCTI has reached a
- * non-zero trigger threshold.
- *
- * Returns without doing anything when @sl is at or above OPA_MAX_SLS or
- * get_cc_state() yields no state.
- */
-void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
-                 u32 rqpn, u8 svc_type)
-{
-       struct cc_state *cc_state;
-       struct cca_timer *timer;
-       unsigned long flags;
-       unsigned long nsec;
-       u16 limit, incr, period, ccti;
-       u8 threshold;
-
-       if (sl >= OPA_MAX_SLS)
-               return;
-
-       cc_state = get_cc_state(ppd);
-       if (!cc_state)
-               return;
-
-       /* Read this SL's settings before taking the timer lock. */
-       limit = cc_state->cct.ccti_limit;
-       incr = cc_state->cong_setting.entries[sl].ccti_increase;
-       period = cc_state->cong_setting.entries[sl].ccti_timer;
-       threshold = cc_state->cong_setting.entries[sl].trigger_threshold;
-
-       spin_lock_irqsave(&ppd->cca_timer_lock, flags);
-
-       timer = &ppd->cca_timer[sl];
-       if (timer->ccti < limit) {
-               /* Step up by the configured increment, but never past limit. */
-               timer->ccti = (timer->ccti + incr <= limit) ?
-                               timer->ccti + incr : limit;
-               set_link_ipg(ppd);
-       }
-
-       ccti = timer->ccti;
-
-       if (!hrtimer_active(&timer->hrtimer)) {
-               /* ccti_timer is in units of 1.024 usec */
-               nsec = 1024 * period;
-               hrtimer_start(&timer->hrtimer, ns_to_ktime(nsec),
-                             HRTIMER_MODE_REL);
-       }
-
-       spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
-
-       if (threshold && ccti >= threshold)
-               log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type);
-}
-
-/**
- * hfi1_rc_rcv - process an incoming RC packet
- * @packet: the received packet; supplies the receive context (rcd), the
- *          headers (hdr, ohdr, hlen), the receive flags (rcv_flags), the
- *          payload (ebuf), the total length (tlen) and the QP (qp)
- *
- * This is called from qp_rcv() to process an incoming RC packet
- * for the given QP.
- * May be called at interrupt level.
- */
-void hfi1_rc_rcv(struct hfi1_packet *packet)
-{
-       struct hfi1_ctxtdata *rcd = packet->rcd;
-       struct hfi1_ib_header *hdr = packet->hdr;
-       u32 rcv_flags = packet->rcv_flags;
-       void *data = packet->ebuf;
-       u32 tlen = packet->tlen;
-       struct rvt_qp *qp = packet->qp;
-       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       struct hfi1_other_headers *ohdr = packet->ohdr;
-       u32 bth0, opcode;
-       u32 hdrsize = packet->hlen;
-       u32 psn;
-       u32 pad;
-       struct ib_wc wc;
-       u32 pmtu = qp->pmtu;
-       int diff;
-       struct ib_reth *reth;
-       unsigned long flags;
-       u32 bth1;
-       int ret, is_fecn = 0;
-       int copy_last = 0;
-
-       bth0 = be32_to_cpu(ohdr->bth[0]);
-       if (hfi1_ruc_check_hdr(ibp, hdr, rcv_flags & HFI1_HAS_GRH, qp, bth0))
-               return;
-
-       bth1 = be32_to_cpu(ohdr->bth[1]);
-       if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) {
-               if (bth1 & HFI1_BECN_SMASK) {
-                       u16 rlid = qp->remote_ah_attr.dlid;
-                       u32 lqpn, rqpn;
-
-                       lqpn = qp->ibqp.qp_num;
-                       rqpn = qp->remote_qpn;
-                       process_becn(
-                               ppd,
-                               qp->remote_ah_attr.sl,
-                               rlid, lqpn, rqpn,
-                               IB_CC_SVCTYPE_RC);
-               }
-               is_fecn = bth1 & HFI1_FECN_SMASK;
-       }
-
-       psn = be32_to_cpu(ohdr->bth[2]);
-       opcode = (bth0 >> 24) & 0xff;
-
-       /*
-        * Process responses (ACKs) before anything else.  Note that the
-        * packet sequence number will be for something in the send work
-        * queue rather than the expected receive packet sequence number.
-        * In other words, this QP is the requester.
-        */
-       if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
-           opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
-               rc_rcv_resp(ibp, ohdr, data, tlen, qp, opcode, psn,
-                           hdrsize, pmtu, rcd);
-               if (is_fecn)
-                       goto send_ack;
-               return;
-       }
-
-       /* Compute 24 bits worth of difference. */
-       diff = delta_psn(psn, qp->r_psn);
-       if (unlikely(diff)) {
-               if (rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd))
-                       return;
-               goto send_ack;
-       }
-
-       /* Check for opcode sequence errors. */
-       switch (qp->r_state) {
-       case OP(SEND_FIRST):
-       case OP(SEND_MIDDLE):
-               if (opcode == OP(SEND_MIDDLE) ||
-                   opcode == OP(SEND_LAST) ||
-                   opcode == OP(SEND_LAST_WITH_IMMEDIATE))
-                       break;
-               goto nack_inv;
-
-       case OP(RDMA_WRITE_FIRST):
-       case OP(RDMA_WRITE_MIDDLE):
-               if (opcode == OP(RDMA_WRITE_MIDDLE) ||
-                   opcode == OP(RDMA_WRITE_LAST) ||
-                   opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
-                       break;
-               goto nack_inv;
-
-       default:
-               if (opcode == OP(SEND_MIDDLE) ||
-                   opcode == OP(SEND_LAST) ||
-                   opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
-                   opcode == OP(RDMA_WRITE_MIDDLE) ||
-                   opcode == OP(RDMA_WRITE_LAST) ||
-                   opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
-                       goto nack_inv;
-               /*
-                * Note that it is up to the requester to not send a new
-                * RDMA read or atomic operation before receiving an ACK
-                * for the previous operation.
-                */
-               break;
-       }
-
-       if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
-               qp_comm_est(qp);
-
-       /* OK, process the packet. */
-       switch (opcode) {
-       case OP(SEND_FIRST):
-               ret = hfi1_rvt_get_rwqe(qp, 0);
-               if (ret < 0)
-                       goto nack_op_err;
-               if (!ret)
-                       goto rnr_nak;
-               qp->r_rcv_len = 0;
-               /* FALLTHROUGH */
-       case OP(SEND_MIDDLE):
-       case OP(RDMA_WRITE_MIDDLE):
-send_middle:
-               /* Check for invalid length PMTU or posted rwqe len. */
-               if (unlikely(tlen != (hdrsize + pmtu + 4)))
-                       goto nack_inv;
-               qp->r_rcv_len += pmtu;
-               if (unlikely(qp->r_rcv_len > qp->r_len))
-                       goto nack_inv;
-               hfi1_copy_sge(&qp->r_sge, data, pmtu, 1, 0);
-               break;
-
-       case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
-               /* consume RWQE */
-               ret = hfi1_rvt_get_rwqe(qp, 1);
-               if (ret < 0)
-                       goto nack_op_err;
-               if (!ret)
-                       goto rnr_nak;
-               goto send_last_imm;
-
-       case OP(SEND_ONLY):
-       case OP(SEND_ONLY_WITH_IMMEDIATE):
-               ret = hfi1_rvt_get_rwqe(qp, 0);
-               if (ret < 0)
-                       goto nack_op_err;
-               if (!ret)
-                       goto rnr_nak;
-               qp->r_rcv_len = 0;
-               if (opcode == OP(SEND_ONLY))
-                       goto no_immediate_data;
-               /* FALLTHROUGH for SEND_ONLY_WITH_IMMEDIATE */
-       case OP(SEND_LAST_WITH_IMMEDIATE):
-send_last_imm:
-               wc.ex.imm_data = ohdr->u.imm_data;
-               wc.wc_flags = IB_WC_WITH_IMM;
-               goto send_last;
-       case OP(RDMA_WRITE_LAST):
-               copy_last = ibpd_to_rvtpd(qp->ibqp.pd)->user;
-               /* fall through */
-       case OP(SEND_LAST):
-no_immediate_data:
-               wc.wc_flags = 0;
-               wc.ex.imm_data = 0;
-send_last:
-               /* Get the number of bytes the message was padded by. */
-               pad = (bth0 >> 20) & 3;
-               /* Check for invalid length. */
-               /* LAST len should be >= 1 */
-               if (unlikely(tlen < (hdrsize + pad + 4)))
-                       goto nack_inv;
-               /* Don't count the CRC. */
-               tlen -= (hdrsize + pad + 4);
-               wc.byte_len = tlen + qp->r_rcv_len;
-               if (unlikely(wc.byte_len > qp->r_len))
-                       goto nack_inv;
-               hfi1_copy_sge(&qp->r_sge, data, tlen, 1, copy_last);
-               rvt_put_ss(&qp->r_sge);
-               qp->r_msn++;
-               if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
-                       break;
-               wc.wr_id = qp->r_wr_id;
-               wc.status = IB_WC_SUCCESS;
-               if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
-                   opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
-                       wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
-               else
-                       wc.opcode = IB_WC_RECV;
-               wc.qp = &qp->ibqp;
-               wc.src_qp = qp->remote_qpn;
-               wc.slid = qp->remote_ah_attr.dlid;
-               /*
-                * It seems that IB mandates the presence of an SL in a
-                * work completion only for the UD transport (see section
-                * 11.4.2 of IBTA Vol. 1).
-                *
-                * However, the way the SL is chosen below is consistent
-                * with the way that IB/qib works and is trying to avoid
-                * introducing incompatibilities.
-                *
-                * See also OPA Vol. 1, section 9.7.6, and table 9-17.
-                */
-               wc.sl = qp->remote_ah_attr.sl;
-               /* zero fields that are N/A */
-               wc.vendor_err = 0;
-               wc.pkey_index = 0;
-               wc.dlid_path_bits = 0;
-               wc.port_num = 0;
-               /* Signal completion event if the solicited bit is set. */
-               rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-                            (bth0 & IB_BTH_SOLICITED) != 0);
-               break;
-
-       case OP(RDMA_WRITE_ONLY):
-               copy_last = 1;
-               /* fall through */
-       case OP(RDMA_WRITE_FIRST):
-       case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
-               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
-                       goto nack_inv;
-               /* consume RWQE */
-               reth = &ohdr->u.rc.reth;
-               qp->r_len = be32_to_cpu(reth->length);
-               qp->r_rcv_len = 0;
-               qp->r_sge.sg_list = NULL;
-               if (qp->r_len != 0) {
-                       u32 rkey = be32_to_cpu(reth->rkey);
-                       u64 vaddr = be64_to_cpu(reth->vaddr);
-                       int ok;
-
-                       /* Check rkey & NAK */
-                       ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr,
-                                        rkey, IB_ACCESS_REMOTE_WRITE);
-                       if (unlikely(!ok))
-                               goto nack_acc;
-                       qp->r_sge.num_sge = 1;
-               } else {
-                       qp->r_sge.num_sge = 0;
-                       qp->r_sge.sge.mr = NULL;
-                       qp->r_sge.sge.vaddr = NULL;
-                       qp->r_sge.sge.length = 0;
-                       qp->r_sge.sge.sge_length = 0;
-               }
-               if (opcode == OP(RDMA_WRITE_FIRST))
-                       goto send_middle;
-               else if (opcode == OP(RDMA_WRITE_ONLY))
-                       goto no_immediate_data;
-               ret = hfi1_rvt_get_rwqe(qp, 1);
-               if (ret < 0)
-                       goto nack_op_err;
-               if (!ret)
-                       goto rnr_nak;
-               wc.ex.imm_data = ohdr->u.rc.imm_data;
-               wc.wc_flags = IB_WC_WITH_IMM;
-               goto send_last;
-
-       case OP(RDMA_READ_REQUEST): {
-               struct rvt_ack_entry *e;
-               u32 len;
-               u8 next;
-
-               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
-                       goto nack_inv;
-               next = qp->r_head_ack_queue + 1;
-               /* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */
-               if (next > HFI1_MAX_RDMA_ATOMIC)
-                       next = 0;
-               spin_lock_irqsave(&qp->s_lock, flags);
-               if (unlikely(next == qp->s_tail_ack_queue)) {
-                       if (!qp->s_ack_queue[next].sent)
-                               goto nack_inv_unlck;
-                       update_ack_queue(qp, next);
-               }
-               e = &qp->s_ack_queue[qp->r_head_ack_queue];
-               if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
-                       rvt_put_mr(e->rdma_sge.mr);
-                       e->rdma_sge.mr = NULL;
-               }
-               reth = &ohdr->u.rc.reth;
-               len = be32_to_cpu(reth->length);
-               if (len) {
-                       u32 rkey = be32_to_cpu(reth->rkey);
-                       u64 vaddr = be64_to_cpu(reth->vaddr);
-                       int ok;
-
-                       /* Check rkey & NAK */
-                       ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr,
-                                        rkey, IB_ACCESS_REMOTE_READ);
-                       if (unlikely(!ok))
-                               goto nack_acc_unlck;
-                       /*
-                        * Update the next expected PSN.  We add 1 later
-                        * below, so only add the remainder here.
-                        */
-                       if (len > pmtu)
-                               qp->r_psn += (len - 1) / pmtu;
-               } else {
-                       e->rdma_sge.mr = NULL;
-                       e->rdma_sge.vaddr = NULL;
-                       e->rdma_sge.length = 0;
-                       e->rdma_sge.sge_length = 0;
-               }
-               e->opcode = opcode;
-               e->sent = 0;
-               e->psn = psn;
-               e->lpsn = qp->r_psn;
-               /*
-                * We need to increment the MSN here instead of when we
-                * finish sending the result since a duplicate request would
-                * increment it more than once.
-                */
-               qp->r_msn++;
-               qp->r_psn++;
-               qp->r_state = opcode;
-               qp->r_nak_state = 0;
-               qp->r_head_ack_queue = next;
-
-               /* Schedule the send tasklet. */
-               qp->s_flags |= RVT_S_RESP_PENDING;
-               hfi1_schedule_send(qp);
-
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-               if (is_fecn)
-                       goto send_ack;
-               return;
-       }
-
-       case OP(COMPARE_SWAP):
-       case OP(FETCH_ADD): {
-               struct ib_atomic_eth *ateth;
-               struct rvt_ack_entry *e;
-               u64 vaddr;
-               atomic64_t *maddr;
-               u64 sdata;
-               u32 rkey;
-               u8 next;
-
-               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
-                       goto nack_inv;
-               next = qp->r_head_ack_queue + 1;
-               if (next > HFI1_MAX_RDMA_ATOMIC)
-                       next = 0;
-               spin_lock_irqsave(&qp->s_lock, flags);
-               if (unlikely(next == qp->s_tail_ack_queue)) {
-                       if (!qp->s_ack_queue[next].sent)
-                               goto nack_inv_unlck;
-                       update_ack_queue(qp, next);
-               }
-               e = &qp->s_ack_queue[qp->r_head_ack_queue];
-               if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
-                       rvt_put_mr(e->rdma_sge.mr);
-                       e->rdma_sge.mr = NULL;
-               }
-               ateth = &ohdr->u.atomic_eth;
-               vaddr = ((u64)be32_to_cpu(ateth->vaddr[0]) << 32) |
-                       be32_to_cpu(ateth->vaddr[1]);
-               if (unlikely(vaddr & (sizeof(u64) - 1)))
-                       goto nack_inv_unlck;
-               rkey = be32_to_cpu(ateth->rkey);
-               /* Check rkey & NAK */
-               if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
-                                         vaddr, rkey,
-                                         IB_ACCESS_REMOTE_ATOMIC)))
-                       goto nack_acc_unlck;
-               /* Perform atomic OP and save result. */
-               maddr = (atomic64_t *)qp->r_sge.sge.vaddr;
-               sdata = be64_to_cpu(ateth->swap_data);
-               e->atomic_data = (opcode == OP(FETCH_ADD)) ?
-                       (u64)atomic64_add_return(sdata, maddr) - sdata :
-                       (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr,
-                                     be64_to_cpu(ateth->compare_data),
-                                     sdata);
-               rvt_put_mr(qp->r_sge.sge.mr);
-               qp->r_sge.num_sge = 0;
-               e->opcode = opcode;
-               e->sent = 0;
-               e->psn = psn;
-               e->lpsn = psn;
-               qp->r_msn++;
-               qp->r_psn++;
-               qp->r_state = opcode;
-               qp->r_nak_state = 0;
-               qp->r_head_ack_queue = next;
-
-               /* Schedule the send tasklet. */
-               qp->s_flags |= RVT_S_RESP_PENDING;
-               hfi1_schedule_send(qp);
-
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-               if (is_fecn)
-                       goto send_ack;
-               return;
-       }
-
-       default:
-               /* NAK unknown opcodes. */
-               goto nack_inv;
-       }
-       qp->r_psn++;
-       qp->r_state = opcode;
-       qp->r_ack_psn = psn;
-       qp->r_nak_state = 0;
-       /*
-        * Send an ACK if requested or required.  The ACK goes out right
-        * away when packet->numpkt is 0, when HFI1_PSN_CREDIT ACKs have
-        * already been deferred, or when FECN was set; otherwise it is
-        * counted in r_adefered and handed to rc_defered_ack().
-        */
-       if (psn & IB_BTH_REQ_ACK) {
-               struct hfi1_qp_priv *priv = qp->priv;
-
-               if (packet->numpkt == 0) {
-                       rc_cancel_ack(qp);
-                       goto send_ack;
-               }
-               if (priv->r_adefered >= HFI1_PSN_CREDIT) {
-                       rc_cancel_ack(qp);
-                       goto send_ack;
-               }
-               if (unlikely(is_fecn)) {
-                       rc_cancel_ack(qp);
-                       goto send_ack;
-               }
-               priv->r_adefered++;
-               rc_defered_ack(rcd, qp);
-       }
-       return;
-
-rnr_nak:
-       qp->r_nak_state = qp->r_min_rnr_timer | IB_RNR_NAK;
-       qp->r_ack_psn = qp->r_psn;
-       /* Queue RNR NAK for later */
-       rc_defered_ack(rcd, qp);
-       return;
-
-nack_op_err:
-       hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
-       qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
-       qp->r_ack_psn = qp->r_psn;
-       /* Queue NAK for later */
-       rc_defered_ack(rcd, qp);
-       return;
-
-nack_inv_unlck:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-nack_inv:
-       hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
-       qp->r_nak_state = IB_NAK_INVALID_REQUEST;
-       qp->r_ack_psn = qp->r_psn;
-       /* Queue NAK for later */
-       rc_defered_ack(rcd, qp);
-       return;
-
-nack_acc_unlck:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-nack_acc:
-       hfi1_rc_error(qp, IB_WC_LOC_PROT_ERR);
-       qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
-       qp->r_ack_psn = qp->r_psn;
-send_ack:
-       hfi1_send_rc_ack(rcd, qp, is_fecn);
-}
-
-void hfi1_rc_hdrerr(
-       struct hfi1_ctxtdata *rcd,
-       struct hfi1_ib_header *hdr,
-       u32 rcv_flags,
-       struct rvt_qp *qp)
-{      /*
-        * Queue a deferred PSN-error NAK on qp, if warranted, for the
-        * packet whose header is hdr.  rcv_flags is only examined for
-        * HFI1_HAS_GRH, which selects where the BTH lives inside hdr.
-        * Nothing is done when hfi1_ruc_check_hdr() returns non-zero.
-        */
-       int has_grh = rcv_flags & HFI1_HAS_GRH;
-       struct hfi1_other_headers *ohdr;
-       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
-       int diff;
-       u32 opcode;
-       u32 psn, bth0;
-
-       /* Check for GRH */
-       ohdr = &hdr->u.oth;
-       if (has_grh)
-               ohdr = &hdr->u.l.oth;
-
-       bth0 = be32_to_cpu(ohdr->bth[0]);
-       if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, bth0))
-               return;
-
-       psn = be32_to_cpu(ohdr->bth[2]);
-       opcode = (bth0 >> 24) & 0xff;
-
-       /*
-        * Only deal with RDMA Writes for now
-        *
-        * NOTE(review): the test below is just an upper bound on the
-        * opcode value (everything below RDMA_READ_RESPONSE_FIRST), which
-        * looks broader than "RDMA Writes" - confirm the intent.
-        */
-       if (opcode < IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) {
-               diff = delta_psn(psn, qp->r_psn);
-               /* act only if no NAK is pending and the PSN is not behind r_psn */
-               if (!qp->r_nak_state && diff >= 0) {
-                       ibp->rvp.n_rc_seqnak++;
-                       qp->r_nak_state = IB_NAK_PSN_ERROR;
-                       /* Use the expected PSN. */
-                       qp->r_ack_psn = qp->r_psn;
-                       /*
-                        * Wait to send the sequence
-                        * NAK until all packets
-                        * in the receive queue have
-                        * been processed.
-                        * Otherwise, we end up
-                        * propagating congestion.
-                        */
-                       rc_defered_ack(rcd, qp);
-               } /* Out of sequence NAK */
-       } /* QP Request NAKs */
-}
diff --git a/drivers/staging/rdma/hfi1/ruc.c b/drivers/staging/rdma/hfi1/ruc.c
deleted file mode 100644 (file)
index a659aec..0000000
+++ /dev/null
@@ -1,979 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/spinlock.h>
-
-#include "hfi.h"
-#include "mad.h"
-#include "qp.h"
-#include "verbs_txreq.h"
-#include "trace.h"
-
-/*
- * Convert the AETH RNR timeout code into the number of microseconds.
- * The table is indexed by the timeout code (0x00-0x1F); the comment on
- * each entry gives the code and the same delay in milliseconds.
- */
-const u32 ib_hfi1_rnr_table[32] = {
-       655360, /* 00: 655.36 */
-       10,     /* 01:    .01 */
-       20,     /* 02:    .02 */
-       30,     /* 03:    .03 */
-       40,     /* 04:    .04 */
-       60,     /* 05:    .06 */
-       80,     /* 06:    .08 */
-       120,    /* 07:    .12 */
-       160,    /* 08:    .16 */
-       240,    /* 09:    .24 */
-       320,    /* 0A:    .32 */
-       480,    /* 0B:    .48 */
-       640,    /* 0C:    .64 */
-       960,    /* 0D:    .96 */
-       1280,   /* 0E:   1.28 */
-       1920,   /* 0F:   1.92 */
-       2560,   /* 10:   2.56 */
-       3840,   /* 11:   3.84 */
-       5120,   /* 12:   5.12 */
-       7680,   /* 13:   7.68 */
-       10240,  /* 14:  10.24 */
-       15360,  /* 15:  15.36 */
-       20480,  /* 16:  20.48 */
-       30720,  /* 17:  30.72 */
-       40960,  /* 18:  40.96 */
-       61440,  /* 19:  61.44 */
-       81920,  /* 1A:  81.92 */
-       122880, /* 1B: 122.88 */
-       163840, /* 1C: 163.84 */
-       245760, /* 1D: 245.76 */
-       327680, /* 1E: 327.68 */
-       491520  /* 1F: 491.52 */
-};
-
-/*
- * Validate a RWQE and fill in the SGE state.
- *
- * Zero-length SGEs in the RWQE are skipped.  Every other SGE is checked
- * with rvt_lkey_ok() for IB_ACCESS_LOCAL_WRITE against the SRQ's PD when
- * the QP uses an SRQ, otherwise against the QP's PD.  The first accepted
- * SGE is placed in qp->r_sge.sge and the following ones in
- * qp->r_sg_list.  qp->r_len and qp->r_sge.total_len are set to the sum
- * of the accepted lengths.
- *
- * Return 1 if OK.  If a check fails, rvt_put_mr() is called for each SGE
- * accepted so far, a receive completion with status IB_WC_LOC_PROT_ERR
- * and the RWQE's wr_id is entered on the receive CQ, and 0 is returned.
- */
-static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe)
-{
-       int i, j, ret;  /* i walks the RWQE's SGEs, j counts accepted ones */
-       struct ib_wc wc;
-       struct rvt_lkey_table *rkt;
-       struct rvt_pd *pd;
-       struct rvt_sge_state *ss;
-
-       rkt = &to_idev(qp->ibqp.device)->rdi.lkey_table;
-       pd = ibpd_to_rvtpd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd);
-       ss = &qp->r_sge;
-       ss->sg_list = qp->r_sg_list;
-       qp->r_len = 0;
-       for (i = j = 0; i < wqe->num_sge; i++) {
-               if (wqe->sg_list[i].length == 0)
-                       continue;
-               /* Check LKEY */
-               if (!rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge,
-                                &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE))
-                       goto bad_lkey;
-               qp->r_len += wqe->sg_list[i].length;
-               j++;
-       }
-       ss->num_sge = j;
-       ss->total_len = qp->r_len;
-       ret = 1;
-       goto bail;
-
-bad_lkey:
-       while (j) {     /* undo in reverse order: sg_list[j - 2] ... then sge */
-               struct rvt_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge;
-
-               rvt_put_mr(sge->mr);
-       }
-       ss->num_sge = 0;
-       memset(&wc, 0, sizeof(wc));
-       wc.wr_id = wqe->wr_id;
-       wc.status = IB_WC_LOC_PROT_ERR;
-       wc.opcode = IB_WC_RECV;
-       wc.qp = &qp->ibqp;
-       /* Signal solicited completion event. */
-       rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
-       ret = 0;
-bail:
-       return ret;
-}
-
-/**
- * hfi1_rvt_get_rwqe - copy the next RWQE into the QP's RWQE
- * @qp: the QP
- * @wr_id_only: update qp->r_wr_id only, not qp->r_sge
- *
- * The RWQE is taken from the SRQ's receive queue when the QP is attached
- * to one, otherwise from the QP's own r_rq.  The queue's tail index is
- * advanced before the SGEs are validated, so the entry is consumed even
- * when -1 is returned.  When an SRQ with an event handler is left with
- * fewer than srq->limit entries, the limit is cleared and the handler is
- * called with IB_EVENT_SRQ_LIMIT_REACHED after the queue lock has been
- * dropped.
- *
- * Return -1 if there is a local error, 0 if no RWQE is available,
- * otherwise return 1.  0 is also returned when the QP state does not
- * have RVT_PROCESS_RECV_OK set.
- *
- * Can be called from interrupt level.
- */
-int hfi1_rvt_get_rwqe(struct rvt_qp *qp, int wr_id_only)
-{
-       unsigned long flags;
-       struct rvt_rq *rq;
-       struct rvt_rwq *wq;
-       struct rvt_srq *srq;
-       struct rvt_rwqe *wqe;
-       void (*handler)(struct ib_event *, void *);
-       u32 tail;
-       int ret;
-
-       if (qp->ibqp.srq) {
-               srq = ibsrq_to_rvtsrq(qp->ibqp.srq);
-               handler = srq->ibsrq.event_handler;
-               rq = &srq->rq;
-       } else {
-               srq = NULL;
-               handler = NULL;
-               rq = &qp->r_rq;
-       }
-
-       spin_lock_irqsave(&rq->lock, flags);
-       if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
-               ret = 0;
-               goto unlock;
-       }
-
-       wq = rq->wq;
-       tail = wq->tail;
-       /* Validate tail before using it since it is user writable. */
-       if (tail >= rq->size)
-               tail = 0;
-       if (unlikely(tail == wq->head)) {       /* queue is empty */
-               ret = 0;
-               goto unlock;
-       }
-       /* Make sure entry is read after head index is read. */
-       smp_rmb();
-       wqe = rvt_get_rwqe_ptr(rq, tail);
-       /*
-        * Even though we update the tail index in memory, the verbs
-        * consumer is not supposed to post more entries until a
-        * completion is generated.
-        */
-       if (++tail >= rq->size)
-               tail = 0;
-       wq->tail = tail;
-       if (!wr_id_only && !init_sge(qp, wqe)) {
-               ret = -1;
-               goto unlock;
-       }
-       qp->r_wr_id = wqe->wr_id;
-
-       ret = 1;
-       set_bit(RVT_R_WRID_VALID, &qp->r_aflags);
-       if (handler) {  /* only set for an SRQ, see above */
-               u32 n;
-
-               /*
-                * Validate head pointer value and compute
-                * the number of remaining WQEs.
-                */
-               n = wq->head;
-               if (n >= rq->size)
-                       n = 0;
-               if (n < tail)
-                       n += rq->size - tail;
-               else
-                       n -= tail;
-               if (n < srq->limit) {
-                       struct ib_event ev;
-
-                       srq->limit = 0;
-                       spin_unlock_irqrestore(&rq->lock, flags);
-                       ev.device = qp->ibqp.device;
-                       ev.element.srq = qp->ibqp.srq;
-                       ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
-                       handler(&ev, srq->ibsrq.srq_context);
-                       goto bail;      /* lock already released */
-               }
-       }
-unlock:
-       spin_unlock_irqrestore(&rq->lock, flags);
-bail:
-       return ret;
-}
-
-static __be64 get_sguid(struct hfi1_ibport *ibp, unsigned index)
-{      /*
-        * Return the GUID for the given index: index 0 is the port's own
-        * guid converted to big-endian, index n > 0 is ibp->guids[n - 1].
-        */
-       if (!index) {
-               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-
-               return cpu_to_be64(ppd->guid);
-       }
-       return ibp->guids[index - 1];
-}
-
-static int gid_ok(union ib_gid *gid, __be64 gid_prefix, __be64 id)
-{      /*
-        * Non-zero when gid's interface ID equals id and its subnet prefix
-        * is either gid_prefix or IB_DEFAULT_GID_PREFIX.
-        */
-       return (gid->global.interface_id == id &&
-               (gid->global.subnet_prefix == gid_prefix ||
-                gid->global.subnet_prefix == IB_DEFAULT_GID_PREFIX));
-}
-
-/*
- *
- * This should be called with the QP r_lock held.
- *
- * The s_lock will be acquired around the hfi1_migrate_qp() call.
- */
-int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr,
-                      int has_grh, struct rvt_qp *qp, u32 bth0)
-{
-       __be64 guid;
-       unsigned long flags;
-       u8 sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
-
-       if (qp->s_mig_state == IB_MIG_ARMED && (bth0 & IB_BTH_MIG_REQ)) {
-               if (!has_grh) {
-                       if (qp->alt_ah_attr.ah_flags & IB_AH_GRH)
-                               goto err;
-               } else {
-                       if (!(qp->alt_ah_attr.ah_flags & IB_AH_GRH))
-                               goto err;
-                       guid = get_sguid(ibp, qp->alt_ah_attr.grh.sgid_index);
-                       if (!gid_ok(&hdr->u.l.grh.dgid, ibp->rvp.gid_prefix,
-                                   guid))
-                               goto err;
-                       if (!gid_ok(
-                               &hdr->u.l.grh.sgid,
-                               qp->alt_ah_attr.grh.dgid.global.subnet_prefix,
-                               qp->alt_ah_attr.grh.dgid.global.interface_id))
-                               goto err;
-               }
-               if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0,
-                                           sc5, be16_to_cpu(hdr->lrh[3])))) {
-                       hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY,
-                                      (u16)bth0,
-                                      (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
-                                      0, qp->ibqp.qp_num,
-                                      be16_to_cpu(hdr->lrh[3]),
-                                      be16_to_cpu(hdr->lrh[1]));
-                       goto err;
-               }
-               /* Validate the SLID. See Ch. 9.6.1.5 and 17.2.8 */
-               if (be16_to_cpu(hdr->lrh[3]) != qp->alt_ah_attr.dlid ||
-                   ppd_from_ibp(ibp)->port != qp->alt_ah_attr.port_num)
-                       goto err;
-               spin_lock_irqsave(&qp->s_lock, flags);
-               hfi1_migrate_qp(qp);
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-       } else {
-               if (!has_grh) {
-                       if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
-                               goto err;
-               } else {
-                       if (!(qp->remote_ah_attr.ah_flags & IB_AH_GRH))
-                               goto err;
-                       guid = get_sguid(ibp,
-                                        qp->remote_ah_attr.grh.sgid_index);
-                       if (!gid_ok(&hdr->u.l.grh.dgid, ibp->rvp.gid_prefix,
-                                   guid))
-                               goto err;
-                       if (!gid_ok(
-                            &hdr->u.l.grh.sgid,
-                            qp->remote_ah_attr.grh.dgid.global.subnet_prefix,
-                            qp->remote_ah_attr.grh.dgid.global.interface_id))
-                               goto err;
-               }
-               if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0,
-                                           sc5, be16_to_cpu(hdr->lrh[3])))) {
-                       hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY,
-                                      (u16)bth0,
-                                      (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
-                                      0, qp->ibqp.qp_num,
-                                      be16_to_cpu(hdr->lrh[3]),
-                                      be16_to_cpu(hdr->lrh[1]));
-                       goto err;
-               }
-               /* Validate the SLID. See Ch. 9.6.1.5 */
-               if (be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid ||
-                   ppd_from_ibp(ibp)->port != qp->port_num)
-                       goto err;
-               if (qp->s_mig_state == IB_MIG_REARM &&
-                   !(bth0 & IB_BTH_MIG_REQ))
-                       qp->s_mig_state = IB_MIG_ARMED;
-       }
-
-       return 0;
-
-err:
-       return 1;
-}
-
-/**
- * ruc_loopback - handle UC and RC loopback requests
- * @sqp: the sending QP
- *
- * This is called from hfi1_do_send() to
- * forward a WQE addressed to the same HFI.
- * Note that although we are single threaded due to the tasklet, we still
- * have to protect against post_send().  We don't have to worry about
- * receive interrupts since this is a connected protocol and all packets
- * will pass through here.
- */
-static void ruc_loopback(struct rvt_qp *sqp)
-{
-       struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num);
-       struct rvt_qp *qp;
-       struct rvt_swqe *wqe;
-       struct rvt_sge *sge;
-       unsigned long flags;
-       struct ib_wc wc;
-       u64 sdata;
-       atomic64_t *maddr;
-       enum ib_wc_status send_status;
-       int release;
-       int ret;
-       int copy_last = 0;
-       u32 to;
-
-       rcu_read_lock();
-
-       /*
-        * Note that we check the responder QP state after
-        * checking the requester's state.
-        */
-       qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp,
-                           sqp->remote_qpn);
-
-       spin_lock_irqsave(&sqp->s_lock, flags);
-
-       /* Return if we are already busy processing a work request. */
-       if ((sqp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT)) ||
-           !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND))
-               goto unlock;
-
-       sqp->s_flags |= RVT_S_BUSY;
-
-again:
-       smp_read_barrier_depends(); /* see post_one_send() */
-       if (sqp->s_last == ACCESS_ONCE(sqp->s_head))
-               goto clr_busy;
-       wqe = rvt_get_swqe_ptr(sqp, sqp->s_last);
-
-       /* Return if it is not OK to start a new work request. */
-       if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) {
-               if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND))
-                       goto clr_busy;
-               /* We are in the error state, flush the work request. */
-               send_status = IB_WC_WR_FLUSH_ERR;
-               goto flush_send;
-       }
-
-       /*
-        * We can rely on the entry not changing without the s_lock
-        * being held until we update s_last.
-        * We increment s_cur to indicate s_last is in progress.
-        */
-       if (sqp->s_last == sqp->s_cur) {
-               if (++sqp->s_cur >= sqp->s_size)
-                       sqp->s_cur = 0;
-       }
-       spin_unlock_irqrestore(&sqp->s_lock, flags);
-
-       if (!qp || !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) ||
-           qp->ibqp.qp_type != sqp->ibqp.qp_type) {
-               ibp->rvp.n_pkt_drops++;
-               /*
-                * For RC, the requester would timeout and retry so
-                * shortcut the timeouts and just signal too many retries.
-                */
-               if (sqp->ibqp.qp_type == IB_QPT_RC)
-                       send_status = IB_WC_RETRY_EXC_ERR;
-               else
-                       send_status = IB_WC_SUCCESS;
-               goto serr;
-       }
-
-       memset(&wc, 0, sizeof(wc));
-       send_status = IB_WC_SUCCESS;
-
-       release = 1;
-       sqp->s_sge.sge = wqe->sg_list[0];
-       sqp->s_sge.sg_list = wqe->sg_list + 1;
-       sqp->s_sge.num_sge = wqe->wr.num_sge;
-       sqp->s_len = wqe->length;
-       switch (wqe->wr.opcode) {
-       case IB_WR_SEND_WITH_IMM:
-               wc.wc_flags = IB_WC_WITH_IMM;
-               wc.ex.imm_data = wqe->wr.ex.imm_data;
-               /* FALLTHROUGH */
-       case IB_WR_SEND:
-               ret = hfi1_rvt_get_rwqe(qp, 0);
-               if (ret < 0)
-                       goto op_err;
-               if (!ret)
-                       goto rnr_nak;
-               break;
-
-       case IB_WR_RDMA_WRITE_WITH_IMM:
-               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
-                       goto inv_err;
-               wc.wc_flags = IB_WC_WITH_IMM;
-               wc.ex.imm_data = wqe->wr.ex.imm_data;
-               ret = hfi1_rvt_get_rwqe(qp, 1);
-               if (ret < 0)
-                       goto op_err;
-               if (!ret)
-                       goto rnr_nak;
-               /* skip copy_last set and qp_access_flags recheck */
-               goto do_write;
-       case IB_WR_RDMA_WRITE:
-               copy_last = ibpd_to_rvtpd(qp->ibqp.pd)->user;
-               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
-                       goto inv_err;
-do_write:
-               if (wqe->length == 0)
-                       break;
-               if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length,
-                                         wqe->rdma_wr.remote_addr,
-                                         wqe->rdma_wr.rkey,
-                                         IB_ACCESS_REMOTE_WRITE)))
-                       goto acc_err;
-               qp->r_sge.sg_list = NULL;
-               qp->r_sge.num_sge = 1;
-               qp->r_sge.total_len = wqe->length;
-               break;
-
-       case IB_WR_RDMA_READ:
-               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
-                       goto inv_err;
-               if (unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length,
-                                         wqe->rdma_wr.remote_addr,
-                                         wqe->rdma_wr.rkey,
-                                         IB_ACCESS_REMOTE_READ)))
-                       goto acc_err;
-               release = 0;
-               sqp->s_sge.sg_list = NULL;
-               sqp->s_sge.num_sge = 1;
-               qp->r_sge.sge = wqe->sg_list[0];
-               qp->r_sge.sg_list = wqe->sg_list + 1;
-               qp->r_sge.num_sge = wqe->wr.num_sge;
-               qp->r_sge.total_len = wqe->length;
-               break;
-
-       case IB_WR_ATOMIC_CMP_AND_SWP:
-       case IB_WR_ATOMIC_FETCH_AND_ADD:
-               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
-                       goto inv_err;
-               if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
-                                         wqe->atomic_wr.remote_addr,
-                                         wqe->atomic_wr.rkey,
-                                         IB_ACCESS_REMOTE_ATOMIC)))
-                       goto acc_err;
-               /* Perform atomic OP and save result. */
-               maddr = (atomic64_t *)qp->r_sge.sge.vaddr;
-               sdata = wqe->atomic_wr.compare_add;
-               *(u64 *)sqp->s_sge.sge.vaddr =
-                       (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
-                       (u64)atomic64_add_return(sdata, maddr) - sdata :
-                       (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr,
-                                     sdata, wqe->atomic_wr.swap);
-               rvt_put_mr(qp->r_sge.sge.mr);
-               qp->r_sge.num_sge = 0;
-               goto send_comp;
-
-       default:
-               send_status = IB_WC_LOC_QP_OP_ERR;
-               goto serr;
-       }
-
-       sge = &sqp->s_sge.sge;
-       while (sqp->s_len) {
-               u32 len = sqp->s_len;
-
-               if (len > sge->length)
-                       len = sge->length;
-               if (len > sge->sge_length)
-                       len = sge->sge_length;
-               WARN_ON_ONCE(len == 0);
-               hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, release, copy_last);
-               sge->vaddr += len;
-               sge->length -= len;
-               sge->sge_length -= len;
-               if (sge->sge_length == 0) {
-                       if (!release)
-                               rvt_put_mr(sge->mr);
-                       if (--sqp->s_sge.num_sge)
-                               *sge = *sqp->s_sge.sg_list++;
-               } else if (sge->length == 0 && sge->mr->lkey) {
-                       if (++sge->n >= RVT_SEGSZ) {
-                               if (++sge->m >= sge->mr->mapsz)
-                                       break;
-                               sge->n = 0;
-                       }
-                       sge->vaddr =
-                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
-                       sge->length =
-                               sge->mr->map[sge->m]->segs[sge->n].length;
-               }
-               sqp->s_len -= len;
-       }
-       if (release)
-               rvt_put_ss(&qp->r_sge);
-
-       if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
-               goto send_comp;
-
-       if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
-               wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
-       else
-               wc.opcode = IB_WC_RECV;
-       wc.wr_id = qp->r_wr_id;
-       wc.status = IB_WC_SUCCESS;
-       wc.byte_len = wqe->length;
-       wc.qp = &qp->ibqp;
-       wc.src_qp = qp->remote_qpn;
-       wc.slid = qp->remote_ah_attr.dlid;
-       wc.sl = qp->remote_ah_attr.sl;
-       wc.port_num = 1;
-       /* Signal completion event if the solicited bit is set. */
-       rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-                    wqe->wr.send_flags & IB_SEND_SOLICITED);
-
-send_comp:
-       spin_lock_irqsave(&sqp->s_lock, flags);
-       ibp->rvp.n_loop_pkts++;
-flush_send:
-       sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
-       hfi1_send_complete(sqp, wqe, send_status);
-       goto again;
-
-rnr_nak:
-       /* Handle RNR NAK */
-       if (qp->ibqp.qp_type == IB_QPT_UC)
-               goto send_comp;
-       ibp->rvp.n_rnr_naks++;
-       /*
-        * Note: we don't need the s_lock held since the BUSY flag
-        * makes this single threaded.
-        */
-       if (sqp->s_rnr_retry == 0) {
-               send_status = IB_WC_RNR_RETRY_EXC_ERR;
-               goto serr;
-       }
-       if (sqp->s_rnr_retry_cnt < 7)
-               sqp->s_rnr_retry--;
-       spin_lock_irqsave(&sqp->s_lock, flags);
-       if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK))
-               goto clr_busy;
-       to = ib_hfi1_rnr_table[qp->r_min_rnr_timer];
-       hfi1_add_rnr_timer(sqp, to);
-       goto clr_busy;
-
-op_err:
-       send_status = IB_WC_REM_OP_ERR;
-       wc.status = IB_WC_LOC_QP_OP_ERR;
-       goto err;
-
-inv_err:
-       send_status = IB_WC_REM_INV_REQ_ERR;
-       wc.status = IB_WC_LOC_QP_OP_ERR;
-       goto err;
-
-acc_err:
-       send_status = IB_WC_REM_ACCESS_ERR;
-       wc.status = IB_WC_LOC_PROT_ERR;
-err:
-       /* responder goes to error state */
-       hfi1_rc_error(qp, wc.status);
-
-serr:
-       spin_lock_irqsave(&sqp->s_lock, flags);
-       hfi1_send_complete(sqp, wqe, send_status);
-       if (sqp->ibqp.qp_type == IB_QPT_RC) {
-               int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
-
-               sqp->s_flags &= ~RVT_S_BUSY;
-               spin_unlock_irqrestore(&sqp->s_lock, flags);
-               if (lastwqe) {
-                       struct ib_event ev;
-
-                       ev.device = sqp->ibqp.device;
-                       ev.element.qp = &sqp->ibqp;
-                       ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
-                       sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
-               }
-               goto done;
-       }
-clr_busy:
-       sqp->s_flags &= ~RVT_S_BUSY;
-unlock:
-       spin_unlock_irqrestore(&sqp->s_lock, flags);
-done:
-       rcu_read_unlock();
-}
-
-/**
- * hfi1_make_grh - construct a GRH header
- * @ibp: a pointer to the IB port
- * @hdr: a pointer to the GRH header being constructed
- * @grh: the global route address to send to
- * @hwords: the number of 32 bit words of header being sent
- * @nwords: the number of 32 bit words of data being sent
- *
- * Return the size of the header in 32 bit words.
- */
-u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr,
-                 struct ib_global_route *grh, u32 hwords, u32 nwords)
-{
-       hdr->version_tclass_flow =
-               cpu_to_be32((IB_GRH_VERSION << IB_GRH_VERSION_SHIFT) |
-                           (grh->traffic_class << IB_GRH_TCLASS_SHIFT) |
-                           (grh->flow_label << IB_GRH_FLOW_SHIFT));
-       hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2);
-       /* next_hdr is defined by C8-7 in ch. 8.4.1 */
-       hdr->next_hdr = IB_GRH_NEXT_HDR;
-       hdr->hop_limit = grh->hop_limit;
-       /* The SGID is 32-bit aligned. */
-       hdr->sgid.global.subnet_prefix = ibp->rvp.gid_prefix;
-       hdr->sgid.global.interface_id =
-               grh->sgid_index && grh->sgid_index < ARRAY_SIZE(ibp->guids) ?
-               ibp->guids[grh->sgid_index - 1] :
-                       cpu_to_be64(ppd_from_ibp(ibp)->guid);
-       hdr->dgid = grh->dgid;
-
-       /* GRH header size in 32-bit words. */
-       return sizeof(struct ib_grh) / sizeof(u32);
-}
-
-#define BTH2_OFFSET (offsetof(struct hfi1_pio_header, hdr.u.oth.bth[2]) / 4)
-
-/**
- * build_ahg - create ahg in s_hdr
- * @qp: a pointer to QP
- * @npsn: the next PSN for the request/response
- *
- * This routine handles the AHG by allocating an ahg entry and causing the
- * copy of the first middle.
- *
- * Subsequent middles use the copied entry, editing the
- * PSN with 1 or 2 edits.
- */
-static inline void build_ahg(struct rvt_qp *qp, u32 npsn)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       if (unlikely(qp->s_flags & RVT_S_AHG_CLEAR))
-               clear_ahg(qp);
-       if (!(qp->s_flags & RVT_S_AHG_VALID)) {
-               /* first middle that needs copy  */
-               if (qp->s_ahgidx < 0)
-                       qp->s_ahgidx = sdma_ahg_alloc(priv->s_sde);
-               if (qp->s_ahgidx >= 0) {
-                       qp->s_ahgpsn = npsn;
-                       priv->s_hdr->tx_flags |= SDMA_TXREQ_F_AHG_COPY;
-                       /* save to protect a change in another thread */
-                       priv->s_hdr->sde = priv->s_sde;
-                       priv->s_hdr->ahgidx = qp->s_ahgidx;
-                       qp->s_flags |= RVT_S_AHG_VALID;
-               }
-       } else {
-               /* subsequent middle after valid */
-               if (qp->s_ahgidx >= 0) {
-                       priv->s_hdr->tx_flags |= SDMA_TXREQ_F_USE_AHG;
-                       priv->s_hdr->ahgidx = qp->s_ahgidx;
-                       priv->s_hdr->ahgcount++;
-                       priv->s_hdr->ahgdesc[0] =
-                               sdma_build_ahg_descriptor(
-                                       (__force u16)cpu_to_be16((u16)npsn),
-                                       BTH2_OFFSET,
-                                       16,
-                                       16);
-                       if ((npsn & 0xffff0000) !=
-                                       (qp->s_ahgpsn & 0xffff0000)) {
-                               priv->s_hdr->ahgcount++;
-                               priv->s_hdr->ahgdesc[1] =
-                                       sdma_build_ahg_descriptor(
-                                               (__force u16)cpu_to_be16(
-                                                       (u16)(npsn >> 16)),
-                                               BTH2_OFFSET,
-                                               0,
-                                               16);
-                       }
-               }
-       }
-}
-
-void hfi1_make_ruc_header(struct rvt_qp *qp, struct hfi1_other_headers *ohdr,
-                         u32 bth0, u32 bth2, int middle,
-                         struct hfi1_pkt_state *ps)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-       struct hfi1_ibport *ibp = ps->ibp;
-       u16 lrh0;
-       u32 nwords;
-       u32 extra_bytes;
-       u32 bth1;
-
-       /* Construct the header. */
-       extra_bytes = -qp->s_cur_size & 3;
-       nwords = (qp->s_cur_size + extra_bytes) >> 2;
-       lrh0 = HFI1_LRH_BTH;
-       if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
-               qp->s_hdrwords += hfi1_make_grh(ibp,
-                                               &ps->s_txreq->phdr.hdr.u.l.grh,
-                                               &qp->remote_ah_attr.grh,
-                                               qp->s_hdrwords, nwords);
-               lrh0 = HFI1_LRH_GRH;
-               middle = 0;
-       }
-       lrh0 |= (priv->s_sc & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4;
-       /*
-        * reset s_hdr/AHG fields
-        *
-        * This insures that the ahgentry/ahgcount
-        * are at a non-AHG default to protect
-        * build_verbs_tx_desc() from using
-        * an include ahgidx.
-        *
-        * build_ahg() will modify as appropriate
-        * to use the AHG feature.
-        */
-       priv->s_hdr->tx_flags = 0;
-       priv->s_hdr->ahgcount = 0;
-       priv->s_hdr->ahgidx = 0;
-       priv->s_hdr->sde = NULL;
-       if (qp->s_mig_state == IB_MIG_MIGRATED)
-               bth0 |= IB_BTH_MIG_REQ;
-       else
-               middle = 0;
-       if (middle)
-               build_ahg(qp, bth2);
-       else
-               qp->s_flags &= ~RVT_S_AHG_VALID;
-       ps->s_txreq->phdr.hdr.lrh[0] = cpu_to_be16(lrh0);
-       ps->s_txreq->phdr.hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
-       ps->s_txreq->phdr.hdr.lrh[2] =
-               cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
-       ps->s_txreq->phdr.hdr.lrh[3] = cpu_to_be16(ppd_from_ibp(ibp)->lid |
-                                      qp->remote_ah_attr.src_path_bits);
-       bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index);
-       bth0 |= extra_bytes << 20;
-       ohdr->bth[0] = cpu_to_be32(bth0);
-       bth1 = qp->remote_qpn;
-       if (qp->s_flags & RVT_S_ECN) {
-               qp->s_flags &= ~RVT_S_ECN;
-               /* we recently received a FECN, so return a BECN */
-               bth1 |= (HFI1_BECN_MASK << HFI1_BECN_SHIFT);
-       }
-       ohdr->bth[1] = cpu_to_be32(bth1);
-       ohdr->bth[2] = cpu_to_be32(bth2);
-}
-
-/* when sending, force a reschedule every one of these periods */
-#define SEND_RESCHED_TIMEOUT (5 * HZ)  /* 5s in jiffies */
-
-void _hfi1_do_send(struct work_struct *work)
-{
-       struct iowait *wait = container_of(work, struct iowait, iowork);
-       struct rvt_qp *qp = iowait_to_qp(wait);
-
-       hfi1_do_send(qp);
-}
-
-/**
- * hfi1_do_send - perform a send on a QP
- * @work: contains a pointer to the QP
- *
- * Process entries in the send work queue until credit or queue is
- * exhausted.  Only allow one CPU to send a packet per QP (tasklet).
- * Otherwise, two threads could send packets out of order.
- */
-void hfi1_do_send(struct rvt_qp *qp)
-{
-       struct hfi1_pkt_state ps;
-       struct hfi1_qp_priv *priv = qp->priv;
-       int (*make_req)(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
-       unsigned long timeout;
-       unsigned long timeout_int;
-       int cpu;
-
-       ps.dev = to_idev(qp->ibqp.device);
-       ps.ibp = to_iport(qp->ibqp.device, qp->port_num);
-       ps.ppd = ppd_from_ibp(ps.ibp);
-
-       switch (qp->ibqp.qp_type) {
-       case IB_QPT_RC:
-               if (!loopback && ((qp->remote_ah_attr.dlid & ~((1 << ps.ppd->lmc
-                                                               ) - 1)) ==
-                                ps.ppd->lid)) {
-                       ruc_loopback(qp);
-                       return;
-               }
-               make_req = hfi1_make_rc_req;
-               timeout_int = (qp->timeout_jiffies);
-               break;
-       case IB_QPT_UC:
-               if (!loopback && ((qp->remote_ah_attr.dlid & ~((1 << ps.ppd->lmc
-                                                               ) - 1)) ==
-                                ps.ppd->lid)) {
-                       ruc_loopback(qp);
-                       return;
-               }
-               make_req = hfi1_make_uc_req;
-               timeout_int = SEND_RESCHED_TIMEOUT;
-               break;
-       default:
-               make_req = hfi1_make_ud_req;
-               timeout_int = SEND_RESCHED_TIMEOUT;
-       }
-
-       spin_lock_irqsave(&qp->s_lock, ps.flags);
-
-       /* Return if we are already busy processing a work request. */
-       if (!hfi1_send_ok(qp)) {
-               spin_unlock_irqrestore(&qp->s_lock, ps.flags);
-               return;
-       }
-
-       qp->s_flags |= RVT_S_BUSY;
-
-       timeout = jiffies + (timeout_int) / 8;
-       cpu = priv->s_sde ? priv->s_sde->cpu :
-                       cpumask_first(cpumask_of_node(ps.ppd->dd->node));
-       /* insure a pre-built packet is handled  */
-       ps.s_txreq = get_waiting_verbs_txreq(qp);
-       do {
-               /* Check for a constructed packet to be sent. */
-               if (qp->s_hdrwords != 0) {
-                       spin_unlock_irqrestore(&qp->s_lock, ps.flags);
-                       /*
-                        * If the packet cannot be sent now, return and
-                        * the send tasklet will be woken up later.
-                        */
-                       if (hfi1_verbs_send(qp, &ps))
-                               return;
-                       /* Record that s_hdr is empty. */
-                       qp->s_hdrwords = 0;
-                       /* allow other tasks to run */
-                       if (unlikely(time_after(jiffies, timeout))) {
-                               if (workqueue_congested(cpu,
-                                                       ps.ppd->hfi1_wq)) {
-                                       spin_lock_irqsave(
-                                               &qp->s_lock,
-                                               ps.flags);
-                                       qp->s_flags &= ~RVT_S_BUSY;
-                                       hfi1_schedule_send(qp);
-                                       spin_unlock_irqrestore(
-                                               &qp->s_lock,
-                                               ps.flags);
-                                       this_cpu_inc(
-                                               *ps.ppd->dd->send_schedule);
-                                       return;
-                               }
-                               if (!irqs_disabled()) {
-                                       cond_resched();
-                                       this_cpu_inc(
-                                          *ps.ppd->dd->send_schedule);
-                               }
-                               timeout = jiffies + (timeout_int) / 8;
-                       }
-                       spin_lock_irqsave(&qp->s_lock, ps.flags);
-               }
-       } while (make_req(qp, &ps));
-
-       spin_unlock_irqrestore(&qp->s_lock, ps.flags);
-}
-
-/*
- * This should be called with s_lock held.
- */
-void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
-                       enum ib_wc_status status)
-{
-       u32 old_last, last;
-       unsigned i;
-
-       if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND))
-               return;
-
-       last = qp->s_last;
-       old_last = last;
-       if (++last >= qp->s_size)
-               last = 0;
-       qp->s_last = last;
-       /* See post_send() */
-       barrier();
-       for (i = 0; i < wqe->wr.num_sge; i++) {
-               struct rvt_sge *sge = &wqe->sg_list[i];
-
-               rvt_put_mr(sge->mr);
-       }
-       if (qp->ibqp.qp_type == IB_QPT_UD ||
-           qp->ibqp.qp_type == IB_QPT_SMI ||
-           qp->ibqp.qp_type == IB_QPT_GSI)
-               atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount);
-
-       /* See ch. 11.2.4.1 and 10.7.3.1 */
-       if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
-           (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
-           status != IB_WC_SUCCESS) {
-               struct ib_wc wc;
-
-               memset(&wc, 0, sizeof(wc));
-               wc.wr_id = wqe->wr.wr_id;
-               wc.status = status;
-               wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
-               wc.qp = &qp->ibqp;
-               if (status == IB_WC_SUCCESS)
-                       wc.byte_len = wqe->length;
-               rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc,
-                            status != IB_WC_SUCCESS);
-       }
-
-       if (qp->s_acked == old_last)
-               qp->s_acked = last;
-       if (qp->s_cur == old_last)
-               qp->s_cur = last;
-       if (qp->s_tail == old_last)
-               qp->s_tail = last;
-       if (qp->state == IB_QPS_SQD && last == qp->s_cur)
-               qp->s_draining = 0;
-}
diff --git a/drivers/staging/rdma/hfi1/sdma.c b/drivers/staging/rdma/hfi1/sdma.c
deleted file mode 100644 (file)
index abb8ebc..0000000
+++ /dev/null
@@ -1,3052 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/spinlock.h>
-#include <linux/seqlock.h>
-#include <linux/netdevice.h>
-#include <linux/moduleparam.h>
-#include <linux/bitops.h>
-#include <linux/timer.h>
-#include <linux/vmalloc.h>
-#include <linux/highmem.h>
-
-#include "hfi.h"
-#include "common.h"
-#include "qp.h"
-#include "sdma.h"
-#include "iowait.h"
-#include "trace.h"
-
-/* must be a power of 2 >= 64 <= 32768 */
-#define SDMA_DESCQ_CNT 2048
-#define SDMA_DESC_INTR 64
-#define INVALID_TAIL 0xffff
-
-static uint sdma_descq_cnt = SDMA_DESCQ_CNT;
-module_param(sdma_descq_cnt, uint, S_IRUGO);
-MODULE_PARM_DESC(sdma_descq_cnt, "Number of SDMA descq entries");
-
-static uint sdma_idle_cnt = 250;
-module_param(sdma_idle_cnt, uint, S_IRUGO);
-MODULE_PARM_DESC(sdma_idle_cnt, "sdma interrupt idle delay (ns,default 250)");
-
-uint mod_num_sdma;
-module_param_named(num_sdma, mod_num_sdma, uint, S_IRUGO);
-MODULE_PARM_DESC(num_sdma, "Set max number SDMA engines to use");
-
-static uint sdma_desct_intr = SDMA_DESC_INTR;
-module_param_named(desct_intr, sdma_desct_intr, uint, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(desct_intr, "Number of SDMA descriptor before interrupt");
-
-#define SDMA_WAIT_BATCH_SIZE 20
-/* max wait time for a SDMA engine to indicate it has halted */
-#define SDMA_ERR_HALT_TIMEOUT 10 /* ms */
-/* all SDMA engine errors that cause a halt */
-
-#define SD(name) SEND_DMA_##name
-#define ALL_SDMA_ENG_HALT_ERRS \
-       (SD(ENG_ERR_STATUS_SDMA_WRONG_DW_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_GEN_MISMATCH_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_TOO_LONG_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_TAIL_OUT_OF_BOUNDS_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_FIRST_DESC_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_MEM_READ_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_LENGTH_MISMATCH_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_PACKET_DESC_OVERFLOW_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_HEADER_SELECT_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_HEADER_ADDRESS_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_HEADER_LENGTH_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_TIMEOUT_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_DESC_TABLE_UNC_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_ASSEMBLY_UNC_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_PACKET_TRACKING_UNC_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_HEADER_STORAGE_UNC_ERR_SMASK) \
-       | SD(ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK))
-
-/* sdma_sendctrl operations */
-#define SDMA_SENDCTRL_OP_ENABLE    BIT(0)
-#define SDMA_SENDCTRL_OP_INTENABLE BIT(1)
-#define SDMA_SENDCTRL_OP_HALT      BIT(2)
-#define SDMA_SENDCTRL_OP_CLEANUP   BIT(3)
-
-/* handle long defines */
-#define SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \
-SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SMASK
-#define SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT \
-SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT
-
-static const char * const sdma_state_names[] = {
-       [sdma_state_s00_hw_down]                = "s00_HwDown",
-       [sdma_state_s10_hw_start_up_halt_wait]  = "s10_HwStartUpHaltWait",
-       [sdma_state_s15_hw_start_up_clean_wait] = "s15_HwStartUpCleanWait",
-       [sdma_state_s20_idle]                   = "s20_Idle",
-       [sdma_state_s30_sw_clean_up_wait]       = "s30_SwCleanUpWait",
-       [sdma_state_s40_hw_clean_up_wait]       = "s40_HwCleanUpWait",
-       [sdma_state_s50_hw_halt_wait]           = "s50_HwHaltWait",
-       [sdma_state_s60_idle_halt_wait]         = "s60_IdleHaltWait",
-       [sdma_state_s80_hw_freeze]              = "s80_HwFreeze",
-       [sdma_state_s82_freeze_sw_clean]        = "s82_FreezeSwClean",
-       [sdma_state_s99_running]                = "s99_Running",
-};
-
-static const char * const sdma_event_names[] = {
-       [sdma_event_e00_go_hw_down]   = "e00_GoHwDown",
-       [sdma_event_e10_go_hw_start]  = "e10_GoHwStart",
-       [sdma_event_e15_hw_halt_done] = "e15_HwHaltDone",
-       [sdma_event_e25_hw_clean_up_done] = "e25_HwCleanUpDone",
-       [sdma_event_e30_go_running]   = "e30_GoRunning",
-       [sdma_event_e40_sw_cleaned]   = "e40_SwCleaned",
-       [sdma_event_e50_hw_cleaned]   = "e50_HwCleaned",
-       [sdma_event_e60_hw_halted]    = "e60_HwHalted",
-       [sdma_event_e70_go_idle]      = "e70_GoIdle",
-       [sdma_event_e80_hw_freeze]    = "e80_HwFreeze",
-       [sdma_event_e81_hw_frozen]    = "e81_HwFrozen",
-       [sdma_event_e82_hw_unfreeze]  = "e82_HwUnfreeze",
-       [sdma_event_e85_link_down]    = "e85_LinkDown",
-       [sdma_event_e90_sw_halted]    = "e90_SwHalted",
-};
-
-static const struct sdma_set_state_action sdma_action_table[] = {
-       [sdma_state_s00_hw_down] = {
-               .go_s99_running_tofalse = 1,
-               .op_enable = 0,
-               .op_intenable = 0,
-               .op_halt = 0,
-               .op_cleanup = 0,
-       },
-       [sdma_state_s10_hw_start_up_halt_wait] = {
-               .op_enable = 0,
-               .op_intenable = 0,
-               .op_halt = 1,
-               .op_cleanup = 0,
-       },
-       [sdma_state_s15_hw_start_up_clean_wait] = {
-               .op_enable = 0,
-               .op_intenable = 1,
-               .op_halt = 0,
-               .op_cleanup = 1,
-       },
-       [sdma_state_s20_idle] = {
-               .op_enable = 0,
-               .op_intenable = 1,
-               .op_halt = 0,
-               .op_cleanup = 0,
-       },
-       [sdma_state_s30_sw_clean_up_wait] = {
-               .op_enable = 0,
-               .op_intenable = 0,
-               .op_halt = 0,
-               .op_cleanup = 0,
-       },
-       [sdma_state_s40_hw_clean_up_wait] = {
-               .op_enable = 0,
-               .op_intenable = 0,
-               .op_halt = 0,
-               .op_cleanup = 1,
-       },
-       [sdma_state_s50_hw_halt_wait] = {
-               .op_enable = 0,
-               .op_intenable = 0,
-               .op_halt = 0,
-               .op_cleanup = 0,
-       },
-       [sdma_state_s60_idle_halt_wait] = {
-               .go_s99_running_tofalse = 1,
-               .op_enable = 0,
-               .op_intenable = 0,
-               .op_halt = 1,
-               .op_cleanup = 0,
-       },
-       [sdma_state_s80_hw_freeze] = {
-               .op_enable = 0,
-               .op_intenable = 0,
-               .op_halt = 0,
-               .op_cleanup = 0,
-       },
-       [sdma_state_s82_freeze_sw_clean] = {
-               .op_enable = 0,
-               .op_intenable = 0,
-               .op_halt = 0,
-               .op_cleanup = 0,
-       },
-       [sdma_state_s99_running] = {
-               .op_enable = 1,
-               .op_intenable = 1,
-               .op_halt = 0,
-               .op_cleanup = 0,
-               .go_s99_running_totrue = 1,
-       },
-};
-
-#define SDMA_TAIL_UPDATE_THRESH 0x1F
-
-/* declare all statics here rather than keep sorting */
-static void sdma_complete(struct kref *);
-static void sdma_finalput(struct sdma_state *);
-static void sdma_get(struct sdma_state *);
-static void sdma_hw_clean_up_task(unsigned long);
-static void sdma_put(struct sdma_state *);
-static void sdma_set_state(struct sdma_engine *, enum sdma_states);
-static void sdma_start_hw_clean_up(struct sdma_engine *);
-static void sdma_sw_clean_up_task(unsigned long);
-static void sdma_sendctrl(struct sdma_engine *, unsigned);
-static void init_sdma_regs(struct sdma_engine *, u32, uint);
-static void sdma_process_event(
-       struct sdma_engine *sde,
-       enum sdma_events event);
-static void __sdma_process_event(
-       struct sdma_engine *sde,
-       enum sdma_events event);
-static void dump_sdma_state(struct sdma_engine *sde);
-static void sdma_make_progress(struct sdma_engine *sde, u64 status);
-static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail);
-static void sdma_flush_descq(struct sdma_engine *sde);
-
-/**
- * sdma_state_name() - return state string from enum
- * @state: state
- */
-static const char *sdma_state_name(enum sdma_states state)
-{
-       return sdma_state_names[state];
-}
-
-static void sdma_get(struct sdma_state *ss)
-{
-       kref_get(&ss->kref);
-}
-
-static void sdma_complete(struct kref *kref)
-{
-       struct sdma_state *ss =
-               container_of(kref, struct sdma_state, kref);
-
-       complete(&ss->comp);
-}
-
-static void sdma_put(struct sdma_state *ss)
-{
-       kref_put(&ss->kref, sdma_complete);
-}
-
-static void sdma_finalput(struct sdma_state *ss)
-{
-       sdma_put(ss);
-       wait_for_completion(&ss->comp);
-}
-
-static inline void write_sde_csr(
-       struct sdma_engine *sde,
-       u32 offset0,
-       u64 value)
-{
-       write_kctxt_csr(sde->dd, sde->this_idx, offset0, value);
-}
-
-static inline u64 read_sde_csr(
-       struct sdma_engine *sde,
-       u32 offset0)
-{
-       return read_kctxt_csr(sde->dd, sde->this_idx, offset0);
-}
-
-/*
- * sdma_wait_for_packet_egress() - wait for the VL FIFO occupancy for
- * sdma engine 'sde' to drop to 0.
- */
-static void sdma_wait_for_packet_egress(struct sdma_engine *sde,
-                                       int pause)
-{
-       u64 off = 8 * sde->this_idx;
-       struct hfi1_devdata *dd = sde->dd;
-       int lcnt = 0;
-       u64 reg_prev;
-       u64 reg = 0;
-
-       while (1) {
-               reg_prev = reg;
-               reg = read_csr(dd, off + SEND_EGRESS_SEND_DMA_STATUS);
-
-               reg &= SDMA_EGRESS_PACKET_OCCUPANCY_SMASK;
-               reg >>= SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT;
-               if (reg == 0)
-                       break;
-               /* counter is reest if accupancy count changes */
-               if (reg != reg_prev)
-                       lcnt = 0;
-               if (lcnt++ > 500) {
-                       /* timed out - bounce the link */
-                       dd_dev_err(dd, "%s: engine %u timeout waiting for packets to egress, remaining count %u, bouncing link\n",
-                                  __func__, sde->this_idx, (u32)reg);
-                       queue_work(dd->pport->hfi1_wq,
-                                  &dd->pport->link_bounce_work);
-                       break;
-               }
-               udelay(1);
-       }
-}
-
-/*
- * sdma_wait() - wait for packet egress to complete for all SDMA engines,
- * and pause for credit return.
- */
-void sdma_wait(struct hfi1_devdata *dd)
-{
-       int i;
-
-       for (i = 0; i < dd->num_sdma; i++) {
-               struct sdma_engine *sde = &dd->per_sdma[i];
-
-               sdma_wait_for_packet_egress(sde, 0);
-       }
-}
-
-static inline void sdma_set_desc_cnt(struct sdma_engine *sde, unsigned cnt)
-{
-       u64 reg;
-
-       if (!(sde->dd->flags & HFI1_HAS_SDMA_TIMEOUT))
-               return;
-       reg = cnt;
-       reg &= SD(DESC_CNT_CNT_MASK);
-       reg <<= SD(DESC_CNT_CNT_SHIFT);
-       write_sde_csr(sde, SD(DESC_CNT), reg);
-}
-
-static inline void complete_tx(struct sdma_engine *sde,
-                              struct sdma_txreq *tx,
-                              int res)
-{
-       /* protect against complete modifying */
-       struct iowait *wait = tx->wait;
-       callback_t complete = tx->complete;
-
-#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
-       trace_hfi1_sdma_out_sn(sde, tx->sn);
-       if (WARN_ON_ONCE(sde->head_sn != tx->sn))
-               dd_dev_err(sde->dd, "expected %llu got %llu\n",
-                          sde->head_sn, tx->sn);
-       sde->head_sn++;
-#endif
-       sdma_txclean(sde->dd, tx);
-       if (complete)
-               (*complete)(tx, res);
-       if (iowait_sdma_dec(wait) && wait)
-               iowait_drain_wakeup(wait);
-}
-
-/*
- * Complete all the sdma requests with a SDMA_TXREQ_S_ABORTED status
- *
- * Depending on timing there can be txreqs in two places:
- * - in the descq ring
- * - in the flush list
- *
- * To avoid ordering issues the descq ring needs to be flushed
- * first followed by the flush list.
- *
- * This routine is called from two places
- * - From a work queue item
- * - Directly from the state machine just before setting the
- *   state to running
- *
- * Must be called with head_lock held
- *
- */
-static void sdma_flush(struct sdma_engine *sde)
-{
-       struct sdma_txreq *txp, *txp_next;
-       LIST_HEAD(flushlist);
-       unsigned long flags;
-
-       /* flush from head to tail */
-       sdma_flush_descq(sde);
-       spin_lock_irqsave(&sde->flushlist_lock, flags);
-       /* copy flush list */
-       list_for_each_entry_safe(txp, txp_next, &sde->flushlist, list) {
-               list_del_init(&txp->list);
-               list_add_tail(&txp->list, &flushlist);
-       }
-       spin_unlock_irqrestore(&sde->flushlist_lock, flags);
-       /* flush from flush list */
-       list_for_each_entry_safe(txp, txp_next, &flushlist, list)
-               complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED);
-}
-
-/*
- * Fields a work request for flushing the descq ring
- * and the flush list
- *
- * If the engine has been brought to running during
- * the scheduling delay, the flush is ignored, assuming
- * that the process of bringing the engine to running
- * would have done this flush prior to going to running.
- *
- */
-static void sdma_field_flush(struct work_struct *work)
-{
-       unsigned long flags;
-       struct sdma_engine *sde =
-               container_of(work, struct sdma_engine, flush_worker);
-
-       write_seqlock_irqsave(&sde->head_lock, flags);
-       if (!__sdma_running(sde))
-               sdma_flush(sde);
-       write_sequnlock_irqrestore(&sde->head_lock, flags);
-}
-
-static void sdma_err_halt_wait(struct work_struct *work)
-{
-       struct sdma_engine *sde = container_of(work, struct sdma_engine,
-                                               err_halt_worker);
-       u64 statuscsr;
-       unsigned long timeout;
-
-       timeout = jiffies + msecs_to_jiffies(SDMA_ERR_HALT_TIMEOUT);
-       while (1) {
-               statuscsr = read_sde_csr(sde, SD(STATUS));
-               statuscsr &= SD(STATUS_ENG_HALTED_SMASK);
-               if (statuscsr)
-                       break;
-               if (time_after(jiffies, timeout)) {
-                       dd_dev_err(sde->dd,
-                                  "SDMA engine %d - timeout waiting for engine to halt\n",
-                                  sde->this_idx);
-                       /*
-                        * Continue anyway.  This could happen if there was
-                        * an uncorrectable error in the wrong spot.
-                        */
-                       break;
-               }
-               usleep_range(80, 120);
-       }
-
-       sdma_process_event(sde, sdma_event_e15_hw_halt_done);
-}
-
-static void sdma_err_progress_check_schedule(struct sdma_engine *sde)
-{
-       if (!is_bx(sde->dd) && HFI1_CAP_IS_KSET(SDMA_AHG)) {
-               unsigned index;
-               struct hfi1_devdata *dd = sde->dd;
-
-               for (index = 0; index < dd->num_sdma; index++) {
-                       struct sdma_engine *curr_sdma = &dd->per_sdma[index];
-
-                       if (curr_sdma != sde)
-                               curr_sdma->progress_check_head =
-                                                       curr_sdma->descq_head;
-               }
-               dd_dev_err(sde->dd,
-                          "SDMA engine %d - check scheduled\n",
-                               sde->this_idx);
-               mod_timer(&sde->err_progress_check_timer, jiffies + 10);
-       }
-}
-
-static void sdma_err_progress_check(unsigned long data)
-{
-       unsigned index;
-       struct sdma_engine *sde = (struct sdma_engine *)data;
-
-       dd_dev_err(sde->dd, "SDE progress check event\n");
-       for (index = 0; index < sde->dd->num_sdma; index++) {
-               struct sdma_engine *curr_sde = &sde->dd->per_sdma[index];
-               unsigned long flags;
-
-               /* check progress on each engine except the current one */
-               if (curr_sde == sde)
-                       continue;
-               /*
-                * We must lock interrupts when acquiring sde->lock,
-                * to avoid a deadlock if interrupt triggers and spins on
-                * the same lock on same CPU
-                */
-               spin_lock_irqsave(&curr_sde->tail_lock, flags);
-               write_seqlock(&curr_sde->head_lock);
-
-               /* skip non-running queues */
-               if (curr_sde->state.current_state != sdma_state_s99_running) {
-                       write_sequnlock(&curr_sde->head_lock);
-                       spin_unlock_irqrestore(&curr_sde->tail_lock, flags);
-                       continue;
-               }
-
-               if ((curr_sde->descq_head != curr_sde->descq_tail) &&
-                   (curr_sde->descq_head ==
-                               curr_sde->progress_check_head))
-                       __sdma_process_event(curr_sde,
-                                            sdma_event_e90_sw_halted);
-               write_sequnlock(&curr_sde->head_lock);
-               spin_unlock_irqrestore(&curr_sde->tail_lock, flags);
-       }
-       schedule_work(&sde->err_halt_worker);
-}
-
-static void sdma_hw_clean_up_task(unsigned long opaque)
-{
-       struct sdma_engine *sde = (struct sdma_engine *)opaque;
-       u64 statuscsr;
-
-       while (1) {
-#ifdef CONFIG_SDMA_VERBOSITY
-               dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
-                          sde->this_idx, slashstrip(__FILE__), __LINE__,
-                       __func__);
-#endif
-               statuscsr = read_sde_csr(sde, SD(STATUS));
-               statuscsr &= SD(STATUS_ENG_CLEANED_UP_SMASK);
-               if (statuscsr)
-                       break;
-               udelay(10);
-       }
-
-       sdma_process_event(sde, sdma_event_e25_hw_clean_up_done);
-}
-
-static inline struct sdma_txreq *get_txhead(struct sdma_engine *sde)
-{
-       smp_read_barrier_depends(); /* see sdma_update_tail() */
-       return sde->tx_ring[sde->tx_head & sde->sdma_mask];
-}
-
-/*
- * flush ring for recovery
- */
-static void sdma_flush_descq(struct sdma_engine *sde)
-{
-       u16 head, tail;
-       int progress = 0;
-       struct sdma_txreq *txp = get_txhead(sde);
-
-       /* The reason for some of the complexity of this code is that
-        * not all descriptors have corresponding txps.  So, we have to
-        * be able to skip over descs until we wander into the range of
-        * the next txp on the list.
-        */
-       head = sde->descq_head & sde->sdma_mask;
-       tail = sde->descq_tail & sde->sdma_mask;
-       while (head != tail) {
-               /* advance head, wrap if needed */
-               head = ++sde->descq_head & sde->sdma_mask;
-               /* if now past this txp's descs, do the callback */
-               if (txp && txp->next_descq_idx == head) {
-                       /* remove from list */
-                       sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
-                       complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED);
-                       trace_hfi1_sdma_progress(sde, head, tail, txp);
-                       txp = get_txhead(sde);
-               }
-               progress++;
-       }
-       if (progress)
-               sdma_desc_avail(sde, sdma_descq_freecnt(sde));
-}
-
-static void sdma_sw_clean_up_task(unsigned long opaque)
-{
-       struct sdma_engine *sde = (struct sdma_engine *)opaque;
-       unsigned long flags;
-
-       spin_lock_irqsave(&sde->tail_lock, flags);
-       write_seqlock(&sde->head_lock);
-
-       /*
-        * At this point, the following should always be true:
-        * - We are halted, so no more descriptors are getting retired.
-        * - We are not running, so no one is submitting new work.
-        * - Only we can send the e40_sw_cleaned, so we can't start
-        *   running again until we say so.  So, the active list and
-        *   descq are ours to play with.
-        */
-
-       /*
-        * In the error clean up sequence, software clean must be called
-        * before the hardware clean so we can use the hardware head in
-        * the progress routine.  A hardware clean or SPC unfreeze will
-        * reset the hardware head.
-        *
-        * Process all retired requests. The progress routine will use the
-        * latest physical hardware head - we are not running so speed does
-        * not matter.
-        */
-       sdma_make_progress(sde, 0);
-
-       sdma_flush(sde);
-
-       /*
-        * Reset our notion of head and tail.
-        * Note that the HW registers have been reset via an earlier
-        * clean up.
-        */
-       sde->descq_tail = 0;
-       sde->descq_head = 0;
-       sde->desc_avail = sdma_descq_freecnt(sde);
-       *sde->head_dma = 0;
-
-       __sdma_process_event(sde, sdma_event_e40_sw_cleaned);
-
-       write_sequnlock(&sde->head_lock);
-       spin_unlock_irqrestore(&sde->tail_lock, flags);
-}
-
-static void sdma_sw_tear_down(struct sdma_engine *sde)
-{
-       struct sdma_state *ss = &sde->state;
-
-       /* Releasing this reference means the state machine has stopped. */
-       sdma_put(ss);
-
-       /* stop waiting for all unfreeze events to complete */
-       atomic_set(&sde->dd->sdma_unfreeze_count, -1);
-       wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
-}
-
-static void sdma_start_hw_clean_up(struct sdma_engine *sde)
-{
-       tasklet_hi_schedule(&sde->sdma_hw_clean_up_task);
-}
-
-static void sdma_set_state(struct sdma_engine *sde,
-                          enum sdma_states next_state)
-{
-       struct sdma_state *ss = &sde->state;
-       const struct sdma_set_state_action *action = sdma_action_table;
-       unsigned op = 0;
-
-       trace_hfi1_sdma_state(
-               sde,
-               sdma_state_names[ss->current_state],
-               sdma_state_names[next_state]);
-
-       /* debugging bookkeeping */
-       ss->previous_state = ss->current_state;
-       ss->previous_op = ss->current_op;
-       ss->current_state = next_state;
-
-       if (ss->previous_state != sdma_state_s99_running &&
-           next_state == sdma_state_s99_running)
-               sdma_flush(sde);
-
-       if (action[next_state].op_enable)
-               op |= SDMA_SENDCTRL_OP_ENABLE;
-
-       if (action[next_state].op_intenable)
-               op |= SDMA_SENDCTRL_OP_INTENABLE;
-
-       if (action[next_state].op_halt)
-               op |= SDMA_SENDCTRL_OP_HALT;
-
-       if (action[next_state].op_cleanup)
-               op |= SDMA_SENDCTRL_OP_CLEANUP;
-
-       if (action[next_state].go_s99_running_tofalse)
-               ss->go_s99_running = 0;
-
-       if (action[next_state].go_s99_running_totrue)
-               ss->go_s99_running = 1;
-
-       ss->current_op = op;
-       sdma_sendctrl(sde, ss->current_op);
-}
-
-/**
- * sdma_get_descq_cnt() - called when device probed
- *
- * Return a validated descq count.
- *
- * This is currently only used in the verbs initialization to build the tx
- * list.
- *
- * This will probably be deleted in favor of a more scalable approach to
- * alloc tx's.
- *
- */
-u16 sdma_get_descq_cnt(void)
-{
-       u16 count = sdma_descq_cnt;
-
-       if (!count)
-               return SDMA_DESCQ_CNT;
-       /* count must be a power of 2 greater than 64 and less than
-        * 32768.   Otherwise return default.
-        */
-       if (!is_power_of_2(count))
-               return SDMA_DESCQ_CNT;
-       if (count < 64 || count > 32768)
-               return SDMA_DESCQ_CNT;
-       return count;
-}
-
-/**
- * sdma_select_engine_vl() - select sdma engine
- * @dd: devdata
- * @selector: a spreading factor
- * @vl: this vl
- *
- *
- * This function returns an engine based on the selector and a vl.  The
- * mapping fields are protected by RCU.
- */
-struct sdma_engine *sdma_select_engine_vl(
-       struct hfi1_devdata *dd,
-       u32 selector,
-       u8 vl)
-{
-       struct sdma_vl_map *m;
-       struct sdma_map_elem *e;
-       struct sdma_engine *rval;
-
-       /* NOTE This should only happen if SC->VL changed after the initial
-        *      checks on the QP/AH
-        *      Default will return engine 0 below
-        */
-       if (vl >= num_vls) {
-               rval = NULL;
-               goto done;
-       }
-
-       rcu_read_lock();
-       m = rcu_dereference(dd->sdma_map);
-       if (unlikely(!m)) {
-               rcu_read_unlock();
-               return &dd->per_sdma[0];
-       }
-       e = m->map[vl & m->mask];
-       rval = e->sde[selector & e->mask];
-       rcu_read_unlock();
-
-done:
-       rval =  !rval ? &dd->per_sdma[0] : rval;
-       trace_hfi1_sdma_engine_select(dd, selector, vl, rval->this_idx);
-       return rval;
-}
-
-/**
- * sdma_select_engine_sc() - select sdma engine
- * @dd: devdata
- * @selector: a spreading factor
- * @sc5: the 5 bit sc
- *
- *
- * This function returns an engine based on the selector and an sc.
- */
-struct sdma_engine *sdma_select_engine_sc(
-       struct hfi1_devdata *dd,
-       u32 selector,
-       u8 sc5)
-{
-       u8 vl = sc_to_vlt(dd, sc5);
-
-       return sdma_select_engine_vl(dd, selector, vl);
-}
-
-/*
- * Free the indicated map struct
- */
-static void sdma_map_free(struct sdma_vl_map *m)
-{
-       int i;
-
-       for (i = 0; m && i < m->actual_vls; i++)
-               kfree(m->map[i]);
-       kfree(m);
-}
-
-/*
- * Handle RCU callback
- */
-static void sdma_map_rcu_callback(struct rcu_head *list)
-{
-       struct sdma_vl_map *m = container_of(list, struct sdma_vl_map, list);
-
-       sdma_map_free(m);
-}
-
-/**
- * sdma_map_init - called when # vls change
- * @dd: hfi1_devdata
- * @port: port number
- * @num_vls: number of vls
- * @vl_engines: per vl engine mapping (optional)
- *
- * This routine changes the mapping based on the number of vls.
- *
- * vl_engines is used to specify a non-uniform vl/engine loading. NULL
- * implies auto computing the loading and giving each VLs a uniform
- * distribution of engines per VL.
- *
- * The auto algorithm computes the sde_per_vl and the number of extra
- * engines.  Any extra engines are added from the last VL on down.
- *
- * rcu locking is used here to control access to the mapping fields.
- *
- * If either the num_vls or num_sdma are non-power of 2, the array sizes
- * in the struct sdma_vl_map and the struct sdma_map_elem are rounded
- * up to the next highest power of 2 and the first entry is reused
- * in a round robin fashion.
- *
- * If an error occurs the map change is not done and the mapping is
- * not changed.
- *
- */
-int sdma_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_engines)
-{
-       int i, j;
-       int extra, sde_per_vl;
-       int engine = 0;
-       u8 lvl_engines[OPA_MAX_VLS];
-       struct sdma_vl_map *oldmap, *newmap;
-
-       if (!(dd->flags & HFI1_HAS_SEND_DMA))
-               return 0;
-
-       if (!vl_engines) {
-               /* truncate divide */
-               sde_per_vl = dd->num_sdma / num_vls;
-               /* extras */
-               extra = dd->num_sdma % num_vls;
-               vl_engines = lvl_engines;
-               /* add extras from last vl down */
-               for (i = num_vls - 1; i >= 0; i--, extra--)
-                       vl_engines[i] = sde_per_vl + (extra > 0 ? 1 : 0);
-       }
-       /* build new map */
-       newmap = kzalloc(
-               sizeof(struct sdma_vl_map) +
-                       roundup_pow_of_two(num_vls) *
-                       sizeof(struct sdma_map_elem *),
-               GFP_KERNEL);
-       if (!newmap)
-               goto bail;
-       newmap->actual_vls = num_vls;
-       newmap->vls = roundup_pow_of_two(num_vls);
-       newmap->mask = (1 << ilog2(newmap->vls)) - 1;
-       /* initialize back-map */
-       for (i = 0; i < TXE_NUM_SDMA_ENGINES; i++)
-               newmap->engine_to_vl[i] = -1;
-       for (i = 0; i < newmap->vls; i++) {
-               /* save for wrap around */
-               int first_engine = engine;
-
-               if (i < newmap->actual_vls) {
-                       int sz = roundup_pow_of_two(vl_engines[i]);
-
-                       /* only allocate once */
-                       newmap->map[i] = kzalloc(
-                               sizeof(struct sdma_map_elem) +
-                                       sz * sizeof(struct sdma_engine *),
-                               GFP_KERNEL);
-                       if (!newmap->map[i])
-                               goto bail;
-                       newmap->map[i]->mask = (1 << ilog2(sz)) - 1;
-                       /* assign engines */
-                       for (j = 0; j < sz; j++) {
-                               newmap->map[i]->sde[j] =
-                                       &dd->per_sdma[engine];
-                               if (++engine >= first_engine + vl_engines[i])
-                                       /* wrap back to first engine */
-                                       engine = first_engine;
-                       }
-                       /* assign back-map */
-                       for (j = 0; j < vl_engines[i]; j++)
-                               newmap->engine_to_vl[first_engine + j] = i;
-               } else {
-                       /* just re-use entry without allocating */
-                       newmap->map[i] = newmap->map[i % num_vls];
-               }
-               engine = first_engine + vl_engines[i];
-       }
-       /* newmap in hand, save old map */
-       spin_lock_irq(&dd->sde_map_lock);
-       oldmap = rcu_dereference_protected(dd->sdma_map,
-                                          lockdep_is_held(&dd->sde_map_lock));
-
-       /* publish newmap */
-       rcu_assign_pointer(dd->sdma_map, newmap);
-
-       spin_unlock_irq(&dd->sde_map_lock);
-       /* success, free any old map after grace period */
-       if (oldmap)
-               call_rcu(&oldmap->list, sdma_map_rcu_callback);
-       return 0;
-bail:
-       /* free any partial allocation */
-       sdma_map_free(newmap);
-       return -ENOMEM;
-}
-
-/*
- * Clean up allocated memory.
- *
- * This routine can be called regardless of the success of sdma_init():
- * the DMA buffers are only released when they were actually allocated,
- * and per-engine state is only walked while dd->per_sdma exists.
- *
- * @dd: device data owning the SDMA resources
- * @num_engines: number of dd->per_sdma entries to tear down
- */
-static void sdma_clean(struct hfi1_devdata *dd, size_t num_engines)
-{
-       size_t i;
-       struct sdma_engine *sde;
-       struct sdma_vl_map *oldmap;
-
-       if (dd->sdma_pad_dma) {
-               /* free with the same size the pad was allocated with */
-               dma_free_coherent(&dd->pcidev->dev, sizeof(u32),
-                                 (void *)dd->sdma_pad_dma,
-                                 dd->sdma_pad_phys);
-               dd->sdma_pad_dma = NULL;
-               dd->sdma_pad_phys = 0;
-       }
-       if (dd->sdma_heads_dma) {
-               dma_free_coherent(&dd->pcidev->dev, dd->sdma_heads_size,
-                                 (void *)dd->sdma_heads_dma,
-                                 dd->sdma_heads_phys);
-               dd->sdma_heads_dma = NULL;
-               dd->sdma_heads_phys = 0;
-       }
-       for (i = 0; dd->per_sdma && i < num_engines; ++i) {
-               sde = &dd->per_sdma[i];
-
-               /* these pointed into the shared heads area freed above */
-               sde->head_dma = NULL;
-               sde->head_phys = 0;
-
-               if (sde->descq) {
-                       dma_free_coherent(
-                               &dd->pcidev->dev,
-                               sde->descq_cnt * sizeof(u64[2]),
-                               sde->descq,
-                               sde->descq_phys
-                       );
-                       sde->descq = NULL;
-                       sde->descq_phys = 0;
-               }
-               /* tx_ring may come from kcalloc or vzalloc, hence kvfree */
-               kvfree(sde->tx_ring);
-               sde->tx_ring = NULL;
-       }
-       /* unpublish the map under the lock, but do not free it yet */
-       spin_lock_irq(&dd->sde_map_lock);
-       oldmap = rcu_dereference_protected(dd->sdma_map,
-                                          lockdep_is_held(&dd->sde_map_lock));
-       RCU_INIT_POINTER(dd->sdma_map, NULL);
-       spin_unlock_irq(&dd->sde_map_lock);
-       /*
-        * Wait for a grace period first: only then can no RCU reader
-        * still hold the old map or the engines it points at.
-        */
-       synchronize_rcu();
-       sdma_map_free(oldmap);
-       kfree(dd->per_sdma);
-       dd->per_sdma = NULL;
-}
-
-/**
- * sdma_init() - called when device probed
- * @dd: hfi1_devdata
- * @port: port number (currently only zero)
- *
- * sdma_init initializes the specified number of engines.
- *
- * The code initializes each sde, its csrs.  Interrupts
- * are not required to be enabled.
- *
- * When the SDMA capability is not set, the SDMA_AHG capability is cleared
- * and 0 is returned without allocating anything.
- *
- * The engine count defaults to dd->chip_sdma_engines; the mod_num_sdma
- * value replaces it only when it is non-zero, does not exceed the chip
- * count and is at least num_vls.
- *
- * Allocation order: the per-engine array; then, per engine, a coherent
- * descriptor queue and a tx_ring (kcalloc with a vzalloc fallback); then
- * the shared area for DMA of head registers (one L1_CACHE_BYTES slot per
- * engine) and the sizeof(u32) pad buffer.  Any failure after the
- * per-engine array exists, including a failing sdma_map_init(), releases
- * everything through sdma_clean() and returns -ENOMEM.
- *
- * Returns:
- * 0 - success, -errno on failure
- */
-int sdma_init(struct hfi1_devdata *dd, u8 port)
-{
-       unsigned this_idx;
-       struct sdma_engine *sde;
-       u16 descq_cnt;
-       void *curr_head;
-       struct hfi1_pportdata *ppd = dd->pport + port;
-       u32 per_sdma_credits;
-       uint idle_cnt = sdma_idle_cnt; /* converted by ns_to_cclock() below */
-       size_t num_engines = dd->chip_sdma_engines;
-
-       if (!HFI1_CAP_IS_KSET(SDMA)) { /* SDMA off: nothing to set up */
-               HFI1_CAP_CLEAR(SDMA_AHG);
-               return 0;
-       }
-       if (mod_num_sdma &&
-           /* can't exceed chip support */
-           mod_num_sdma <= dd->chip_sdma_engines &&
-           /* count must be >= vls */
-           mod_num_sdma >= num_vls)
-               num_engines = mod_num_sdma;
-
-       dd_dev_info(dd, "SDMA mod_num_sdma: %u\n", mod_num_sdma);
-       dd_dev_info(dd, "SDMA chip_sdma_engines: %u\n", dd->chip_sdma_engines);
-       dd_dev_info(dd, "SDMA chip_sdma_mem_size: %u\n",
-                   dd->chip_sdma_mem_size);
-
-       per_sdma_credits = /* equal share of the chip SDMA memory, in blocks */
-               dd->chip_sdma_mem_size / (num_engines * SDMA_BLOCK_SIZE);
-
-       /* set up freeze waitqueue */
-       init_waitqueue_head(&dd->sdma_unfreeze_wq);
-       atomic_set(&dd->sdma_unfreeze_count, 0);
-
-       descq_cnt = sdma_get_descq_cnt();
-       dd_dev_info(dd, "SDMA engines %zu descq_cnt %u\n",
-                   num_engines, descq_cnt);
-
-       /* alloc memory for array of send engines */
-       dd->per_sdma = kcalloc(num_engines, sizeof(*dd->per_sdma), GFP_KERNEL);
-       if (!dd->per_sdma)
-               return -ENOMEM;
-
-       idle_cnt = ns_to_cclock(dd, idle_cnt);
-       if (!sdma_desct_intr)
-               sdma_desct_intr = SDMA_DESC_INTR;
-
-       /* Allocate memory for SendDMA descriptor FIFOs */
-       for (this_idx = 0; this_idx < num_engines; ++this_idx) {
-               sde = &dd->per_sdma[this_idx];
-               sde->dd = dd;
-               sde->ppd = ppd;
-               sde->this_idx = this_idx;
-               sde->descq_cnt = descq_cnt;
-               sde->desc_avail = sdma_descq_freecnt(sde);
-               sde->sdma_shift = ilog2(descq_cnt);
-               sde->sdma_mask = (1 << sde->sdma_shift) - 1;
-
-               /*
-                * Create a mask specifically for each interrupt source:
-                * the three sources occupy consecutive groups of
-                * TXE_NUM_SDMA_ENGINES bits, indexed by engine.
-                */
-               sde->int_mask = (u64)1 << (0 * TXE_NUM_SDMA_ENGINES +
-                                          this_idx);
-               sde->progress_mask = (u64)1 << (1 * TXE_NUM_SDMA_ENGINES +
-                                               this_idx);
-               sde->idle_mask = (u64)1 << (2 * TXE_NUM_SDMA_ENGINES +
-                                           this_idx);
-               /* Create a combined mask to cover all 3 interrupt sources */
-               sde->imask = sde->int_mask | sde->progress_mask |
-                            sde->idle_mask;
-
-               spin_lock_init(&sde->tail_lock);
-               seqlock_init(&sde->head_lock);
-               spin_lock_init(&sde->senddmactrl_lock);
-               spin_lock_init(&sde->flushlist_lock);
-               /* ensure there is always a zero bit */
-               sde->ahg_bits = 0xfffffffe00000000ULL;
-
-               sdma_set_state(sde, sdma_state_s00_hw_down);
-
-               /* set up reference counting */
-               kref_init(&sde->state.kref);
-               init_completion(&sde->state.comp);
-
-               INIT_LIST_HEAD(&sde->flushlist);
-               INIT_LIST_HEAD(&sde->dmawait);
-
-               sde->tail_csr =
-                       get_kctxt_csr_addr(dd, this_idx, SD(TAIL));
-
-               if (idle_cnt) /* NOTE(review): device-wide, yet set per engine */
-                       dd->default_desc1 =
-                               SDMA_DESC1_HEAD_TO_HOST_FLAG;
-               else
-                       dd->default_desc1 =
-                               SDMA_DESC1_INT_REQ_FLAG;
-
-               tasklet_init(&sde->sdma_hw_clean_up_task, sdma_hw_clean_up_task,
-                            (unsigned long)sde);
-
-               tasklet_init(&sde->sdma_sw_clean_up_task, sdma_sw_clean_up_task,
-                            (unsigned long)sde);
-               INIT_WORK(&sde->err_halt_worker, sdma_err_halt_wait);
-               INIT_WORK(&sde->flush_worker, sdma_field_flush);
-
-               sde->progress_check_head = 0;
-
-               setup_timer(&sde->err_progress_check_timer,
-                           sdma_err_progress_check, (unsigned long)sde);
-
-               sde->descq = dma_zalloc_coherent(
-                       &dd->pcidev->dev,
-                       descq_cnt * sizeof(u64[2]),
-                       &sde->descq_phys,
-                       GFP_KERNEL
-               );
-               if (!sde->descq)
-                       goto bail;
-               sde->tx_ring =
-                       kcalloc(descq_cnt, sizeof(struct sdma_txreq *),
-                               GFP_KERNEL);
-               if (!sde->tx_ring) /* fall back to vzalloc for the ring */
-                       sde->tx_ring =
-                               vzalloc(
-                                       sizeof(struct sdma_txreq *) *
-                                       descq_cnt);
-               if (!sde->tx_ring)
-                       goto bail;
-       }
-
-       dd->sdma_heads_size = L1_CACHE_BYTES * num_engines;
-       /* Allocate memory for DMA of head registers to memory */
-       dd->sdma_heads_dma = dma_zalloc_coherent(
-               &dd->pcidev->dev,
-               dd->sdma_heads_size,
-               &dd->sdma_heads_phys,
-               GFP_KERNEL
-       );
-       if (!dd->sdma_heads_dma) {
-               dd_dev_err(dd, "failed to allocate SendDMA head memory\n");
-               goto bail;
-       }
-
-       /* Allocate memory for pad */
-       dd->sdma_pad_dma = dma_zalloc_coherent(
-               &dd->pcidev->dev,
-               sizeof(u32),
-               &dd->sdma_pad_phys,
-               GFP_KERNEL
-       );
-       if (!dd->sdma_pad_dma) {
-               dd_dev_err(dd, "failed to allocate SendDMA pad memory\n");
-               goto bail;
-       }
-
-       /* assign each engine to different cacheline and init registers */
-       curr_head = (void *)dd->sdma_heads_dma;
-       for (this_idx = 0; this_idx < num_engines; ++this_idx) {
-               unsigned long phys_offset;
-
-               sde = &dd->per_sdma[this_idx];
-
-               sde->head_dma = curr_head;
-               curr_head += L1_CACHE_BYTES;
-               phys_offset = (unsigned long)sde->head_dma -
-                             (unsigned long)dd->sdma_heads_dma;
-               sde->head_phys = dd->sdma_heads_phys + phys_offset;
-               init_sdma_regs(sde, per_sdma_credits, idle_cnt);
-       }
-       dd->flags |= HFI1_HAS_SEND_DMA;
-       dd->flags |= idle_cnt ? HFI1_HAS_SDMA_TIMEOUT : 0;
-       dd->num_sdma = num_engines;
-       if (sdma_map_init(dd, port, ppd->vls_operational, NULL))
-               goto bail;
-       dd_dev_info(dd, "SDMA num_sdma: %u\n", dd->num_sdma);
-       return 0;
-
-bail:
-       sdma_clean(dd, num_engines); /* copes with partial initialization */
-       return -ENOMEM;
-}
-
-/**
- * sdma_all_running() - called when the link goes up
- * @dd: hfi1_devdata
- *
- * Delivers the go-running event to every engine of the device, in
- * engine-index order.
- */
-void sdma_all_running(struct hfi1_devdata *dd)
-{
-       unsigned int idx = 0;
-
-       /* feed each engine's state machine in turn */
-       while (idx < dd->num_sdma) {
-               sdma_process_event(&dd->per_sdma[idx],
-                                  sdma_event_e30_go_running);
-               idx++;
-       }
-}
-
-/**
- * sdma_all_idle() - called when the link goes down
- * @dd: hfi1_devdata
- *
- * Delivers the go-idle event to every engine of the device, in
- * engine-index order.
- */
-void sdma_all_idle(struct hfi1_devdata *dd)
-{
-       unsigned int idx = 0;
-
-       /* feed each engine's state machine in turn */
-       while (idx < dd->num_sdma) {
-               sdma_process_event(&dd->per_sdma[idx],
-                                  sdma_event_e70_go_idle);
-               idx++;
-       }
-}
-
-/**
- * sdma_start() - called to kick off state processing for all engines
- * @dd: hfi1_devdata
- *
- * Delivers the hw-start event to every required sdma engine so that its
- * state processing begins.  Interrupts need to be working at this point.
- *
- */
-void sdma_start(struct hfi1_devdata *dd)
-{
-       unsigned idx = 0;
-
-       /* start each engine's state machine in turn */
-       while (idx < dd->num_sdma) {
-               sdma_process_event(&dd->per_sdma[idx],
-                                  sdma_event_e10_go_hw_start);
-               idx++;
-       }
-}
-
-/**
- * sdma_exit() - used when module is removed
- * @dd: hfi1_devdata
- *
- * Takes every engine down, waits for its state machine to finish and
- * then releases all SDMA memory through sdma_clean().
- */
-void sdma_exit(struct hfi1_devdata *dd)
-{
-       unsigned n;
-
-       for (n = 0; dd->per_sdma && n < dd->num_sdma; n++) {
-               struct sdma_engine *engine = &dd->per_sdma[n];
-
-               /* leftover waiters are reported but do not stop teardown */
-               if (!list_empty(&engine->dmawait))
-                       dd_dev_err(dd, "sde %u: dmawait list not empty!\n",
-                                  engine->this_idx);
-               sdma_process_event(engine, sdma_event_e00_go_hw_down);
-
-               del_timer_sync(&engine->err_progress_check_timer);
-
-               /*
-                * sdma_finalput() waits for the state machine to exit, so
-                * the sdma_sw_clean_up_task does not have to be killed
-                * separately to be sure it is no longer running.
-                */
-               sdma_finalput(&engine->state);
-       }
-       sdma_clean(dd, dd->num_sdma);
-}
-
-/*
- * Undo the DMA mapping recorded in one descriptor.
- *
- * Only single and page mappings are unmapped; any other mapping type
- * leaves the descriptor untouched.
- */
-static inline void sdma_unmap_desc(
-       struct hfi1_devdata *dd,
-       struct sdma_desc *descp)
-{
-       switch (sdma_mapping_type(descp)) {
-       case SDMA_MAP_PAGE:
-               dma_unmap_page(&dd->pcidev->dev,
-                              sdma_mapping_addr(descp),
-                              sdma_mapping_len(descp),
-                              DMA_TO_DEVICE);
-               break;
-       case SDMA_MAP_SINGLE:
-               dma_unmap_single(&dd->pcidev->dev,
-                                sdma_mapping_addr(descp),
-                                sdma_mapping_len(descp),
-                                DMA_TO_DEVICE);
-               break;
-       default:
-               /* nothing was mapped for this descriptor */
-               break;
-       }
-}
-
-/*
- * Extract the header-mode field from the second quadword of the first
- * descriptor of the tx.
- */
-static inline u8 ahg_mode(struct sdma_txreq *tx)
-{
-       const struct sdma_desc *first = tx->descp;
-
-       return (first->qw[1] & SDMA_DESC1_HEADER_MODE_SMASK) >>
-               SDMA_DESC1_HEADER_MODE_SHIFT;
-}
-
-/**
- * sdma_txclean() - clean tx of mappings, descp *kmalloc's
- * @dd: hfi1_devdata for unmapping
- * @tx: tx request to clean
- *
- * Used by the progress routine to clean a tx, and by the ULP to toss
- * an in-process tx build.
- *
- * Calling it again on the same tx is harmless: num_desc, coalesce_buf
- * and desc_limit are all reset on the first pass.
- *
- */
-void sdma_txclean(
-       struct hfi1_devdata *dd,
-       struct sdma_txreq *tx)
-{
-       if (tx->num_desc) {
-               u8 mode = ahg_mode(tx);
-               /* modes above APPLY_UPDATE1 carry AHG descriptors to skip */
-               u8 skip = (mode > SDMA_AHG_APPLY_UPDATE1) ? mode >> 1 : 0;
-               u16 idx = 1 + skip;
-
-               /* the first descriptor is always unmapped */
-               sdma_unmap_desc(dd, &tx->descp[0]);
-               while (idx < tx->num_desc) {
-                       sdma_unmap_desc(dd, &tx->descp[idx]);
-                       idx++;
-               }
-               tx->num_desc = 0;
-       }
-
-       kfree(tx->coalesce_buf);
-       tx->coalesce_buf = NULL;
-
-       /* a limit beyond the embedded array means descp was kmalloc'ed */
-       if (unlikely(tx->desc_limit > ARRAY_SIZE(tx->descs))) {
-               kfree(tx->descp);
-               tx->desc_limit = ARRAY_SIZE(tx->descs);
-       }
-}
-
-/*
- * Return the hardware head index of the engine's descriptor queue.
- *
- * The head is taken from the host-memory copy (*sde->head_dma) when
- * USE_SDMA_HEAD is set, the engine is running and the device has
- * HFI1_HAS_SDMA_TIMEOUT; otherwise it is read from the HEAD CSR.
- *
- * With SDMA_HEAD_CHECK set the value is validated against the software
- * head and tail.  A bad value obtained from host memory is retried
- * exactly once from the CSR; if the CSR value is bad as well, the
- * software head is returned so the caller sees no progress.
- */
-static inline u16 sdma_gethead(struct sdma_engine *sde)
-{
-       struct hfi1_devdata *dd = sde->dd;
-       int use_dmahead;
-       u16 hwhead;
-
-#ifdef CONFIG_SDMA_VERBOSITY
-       dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
-                  sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
-#endif
-
-       use_dmahead = HFI1_CAP_IS_KSET(USE_SDMA_HEAD) && __sdma_running(sde) &&
-                                       (dd->flags & HFI1_HAS_SDMA_TIMEOUT);
-       /*
-        * The label sits after the source selection on purpose: the retry
-        * path clears use_dmahead and it must not be recomputed, or the
-        * CSR fallback would never be taken and the loop could spin.
-        */
-retry:
-       hwhead = use_dmahead ?
-               (u16)le64_to_cpu(*sde->head_dma) :
-               (u16)read_sde_csr(sde, SD(HEAD));
-
-       if (unlikely(HFI1_CAP_IS_KSET(SDMA_HEAD_CHECK))) {
-               u16 cnt;
-               u16 swtail;
-               u16 swhead;
-               int sane;
-
-               swhead = sde->descq_head & sde->sdma_mask;
-               /* this code is really bad for cache line trading */
-               swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
-               cnt = sde->descq_cnt;
-
-               if (swhead < swtail)
-                       /* not wrapped */
-                       sane = (hwhead >= swhead) && (hwhead <= swtail);
-               else if (swhead > swtail)
-                       /* wrapped around */
-                       sane = ((hwhead >= swhead) && (hwhead < cnt)) ||
-                               (hwhead <= swtail);
-               else
-                       /* empty */
-                       sane = (hwhead == swhead);
-
-               if (unlikely(!sane)) {
-                       dd_dev_err(dd, "SDMA(%u) bad head (%s) hwhd=%hu swhd=%hu swtl=%hu cnt=%hu\n",
-                                  sde->this_idx,
-                                  use_dmahead ? "dma" : "kreg",
-                                  hwhead, swhead, swtail, cnt);
-                       if (use_dmahead) {
-                               /* try one more time, using csr */
-                               use_dmahead = 0;
-                               goto retry;
-                       }
-                       /* proceed as if no progress */
-                       hwhead = swhead;
-               }
-       }
-       return hwhead;
-}
-
-/*
- * This is called when there are send DMA descriptors that might be
- * available.
- *
- * Waiters on sde->dmawait are harvested, in list order, into a local
- * batch of at most SDMA_WAIT_BATCH_SIZE entries while the device-wide
- * iowait_lock is write-held.  A waiter without a wakeup callback is
- * skipped.  Harvesting stops when the batch is full or when the first
- * tx queued on a waiter needs more descriptors than remain in @avail.
- * The wakeup callbacks are invoked only after iowait_lock is released.
- *
- * @sde: engine whose waiters are examined
- * @avail: number of descriptors that may be handed out
- *
- * This is called with head_lock held.
- */
-static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail)
-{
-       struct iowait *wait, *nw;
-       struct iowait *waits[SDMA_WAIT_BATCH_SIZE]; /* harvested waiters */
-       unsigned i, n = 0, seq;
-       struct sdma_txreq *stx;
-       struct hfi1_ibdev *dev = &sde->dd->verbs_dev;
-
-#ifdef CONFIG_SDMA_VERBOSITY
-       dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
-                  slashstrip(__FILE__), __LINE__, __func__);
-       dd_dev_err(sde->dd, "avail: %u\n", avail);
-#endif
-
-       do {
-               seq = read_seqbegin(&dev->iowait_lock);
-               if (!list_empty(&sde->dmawait)) {
-                       /* at least one item */
-                       write_seqlock(&dev->iowait_lock);
-                       /* Harvest waiters wanting DMA descriptors */
-                       list_for_each_entry_safe(
-                                       wait,
-                                       nw,
-                                       &sde->dmawait,
-                                       list) {
-                               u16 num_desc = 0; /* 0 when nothing is queued */
-
-                               if (!wait->wakeup)
-                                       continue;
-                               if (n == ARRAY_SIZE(waits))
-                                       break;
-                               if (!list_empty(&wait->tx_head)) {
-                                       stx = list_first_entry(
-                                               &wait->tx_head,
-                                               struct sdma_txreq,
-                                               list);
-                                       num_desc = stx->num_desc;
-                               }
-                               if (num_desc > avail)
-                                       break;
-                               avail -= num_desc;
-                               list_del_init(&wait->list);
-                               waits[n++] = wait;
-                       }
-                       write_sequnlock(&dev->iowait_lock);
-                       break; /* harvested once; no seqretry needed */
-               }
-       } while (read_seqretry(&dev->iowait_lock, seq));
-
-       for (i = 0; i < n; i++)
-               waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON);
-}
-
-/*
- * Advance the software head up to the hardware head, completing every
- * tx whose next_descq_idx is reached on the way, record @status in
- * sde->last_status and call sdma_desc_avail() if any descriptor was
- * retired.
- *
- * @sde: engine to make progress on
- * @status: interrupt status bits; only the idle bit is examined here
- *
- * head_lock must be held
- */
-static void sdma_make_progress(struct sdma_engine *sde, u64 status)
-{
-       struct sdma_txreq *txp = NULL;
-       int progress = 0; /* descriptors retired in this call */
-       u16 hwhead, swhead;
-       int idle_check_done = 0; /* limits the CSR re-read to once */
-
-       hwhead = sdma_gethead(sde);
-
-       /* The reason for some of the complexity of this code is that
-        * not all descriptors have corresponding txps.  So, we have to
-        * be able to skip over descs until we wander into the range of
-        * the next txp on the list.
-        */
-
-retry:
-       txp = get_txhead(sde);
-       swhead = sde->descq_head & sde->sdma_mask;
-       trace_hfi1_sdma_progress(sde, hwhead, swhead, txp);
-       while (swhead != hwhead) {
-               /* advance head, wrap if needed */
-               swhead = ++sde->descq_head & sde->sdma_mask;
-
-               /* if now past this txp's descs, do the callback */
-               if (txp && txp->next_descq_idx == swhead) {
-                       /* remove from list */
-                       sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
-                       complete_tx(sde, txp, SDMA_TXREQ_S_OK);
-                       /* see if there is another txp */
-                       txp = get_txhead(sde);
-               }
-               trace_hfi1_sdma_progress(sde, hwhead, swhead, txp);
-               progress++;
-       }
-
-       /*
-        * The SDMA idle interrupt is not guaranteed to be ordered with respect
-        * to updates to the dma_head location in host memory. The head
-        * value read might not be fully up to date. If there are pending
-        * descriptors and the SDMA idle interrupt fired then read from the
-        * CSR SDMA head instead to get the latest value from the hardware.
-        * The hardware SDMA head should be read at most once in this invocation
-        * of sdma_make_progress(..) which is ensured by idle_check_done flag
-        */
-       if ((status & sde->idle_mask) && !idle_check_done) {
-               u16 swtail;
-
-               swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
-               if (swtail != hwhead) { /* descriptors still pending */
-                       hwhead = (u16)read_sde_csr(sde, SD(HEAD));
-                       idle_check_done = 1;
-                       goto retry;
-               }
-       }
-
-       sde->last_status = status;
-       if (progress)
-               sdma_desc_avail(sde, sdma_descq_freecnt(sde));
-}
-
-/*
- * sdma_engine_interrupt() - interrupt handler for engine
- * @sde: sdma engine
- * @status: sdma interrupt reason
- *
- * Status is a mask of the 3 possible interrupts for this engine.  It
- * holds bits _only_ for this SDMA engine: at least one, possibly more.
- *
- * Exactly one statistics counter is bumped per call, with idle taking
- * precedence over progress and progress over the plain sdma interrupt.
- */
-void sdma_engine_interrupt(struct sdma_engine *sde, u64 status)
-{
-       trace_hfi1_sdma_engine_interrupt(sde, status);
-
-       write_seqlock(&sde->head_lock);
-       sdma_set_desc_cnt(sde, sdma_desct_intr);
-
-       if (status & sde->idle_mask) {
-               sde->idle_int_cnt += 1;
-       } else if (status & sde->progress_mask) {
-               sde->progress_int_cnt += 1;
-       } else if (status & sde->int_mask) {
-               sde->sdma_int_cnt += 1;
-       }
-
-       sdma_make_progress(sde, status);
-       write_sequnlock(&sde->head_lock);
-}
-
-/**
- * sdma_engine_error() - error handler for engine
- * @sde: sdma engine
- * @status: sdma interrupt reason
- *
- * Runs with tail_lock and head_lock both taken.  Halt errors are fed to
- * the state machine as a hw-halted event; any bit other than the plain
- * halt error is logged together with a dump of the descriptor ring.
- */
-void sdma_engine_error(struct sdma_engine *sde, u64 status)
-{
-       const u64 halt_bits = status & ALL_SDMA_ENG_HALT_ERRS;
-       const u64 other_bits = status &
-                              ~SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK);
-       unsigned long irq_state;
-
-#ifdef CONFIG_SDMA_VERBOSITY
-       dd_dev_err(sde->dd, "CONFIG SDMA(%u) error status 0x%llx state %s\n",
-                  sde->this_idx,
-                  (unsigned long long)status,
-                  sdma_state_names[sde->state.current_state]);
-#endif
-       spin_lock_irqsave(&sde->tail_lock, irq_state);
-       write_seqlock(&sde->head_lock);
-
-       if (halt_bits)
-               __sdma_process_event(sde, sdma_event_e60_hw_halted);
-
-       /* the state name is read after the event above was processed */
-       if (other_bits) {
-               dd_dev_err(sde->dd,
-                          "SDMA (%u) engine error: 0x%llx state %s\n",
-                          sde->this_idx,
-                          (unsigned long long)status,
-                          sdma_state_names[sde->state.current_state]);
-               dump_sdma_state(sde);
-       }
-
-       write_sequnlock(&sde->head_lock);
-       spin_unlock_irqrestore(&sde->tail_lock, irq_state);
-}
-
-/*
- * Apply the requested control operations to the engine's CTRL CSR.
- *
- * Each of the ENABLE, INTENABLE and HALT bits is set in the shadow copy
- * (sde->p_senddmactrl) when its op flag is present in @op and cleared
- * otherwise.  The CLEANUP bit is written to the CSR when requested but
- * is never stored in the shadow copy.  The shadow update and the CSR
- * write happen under senddmactrl_lock.
- */
-static void sdma_sendctrl(struct sdma_engine *sde, unsigned op)
-{
-       u64 bits_on = 0;
-       u64 bits_off = 0;
-       u64 ctrl;
-       unsigned long irq_state;
-
-#ifdef CONFIG_SDMA_VERBOSITY
-       dd_dev_err(sde->dd, "CONFIG SDMA(%u) senddmactrl E=%d I=%d H=%d C=%d\n",
-                  sde->this_idx,
-                  !!(op & SDMA_SENDCTRL_OP_ENABLE),
-                  !!(op & SDMA_SENDCTRL_OP_INTENABLE),
-                  !!(op & SDMA_SENDCTRL_OP_HALT),
-                  !!(op & SDMA_SENDCTRL_OP_CLEANUP));
-#endif
-
-       /* sort every managed bit into the "on" or the "off" mask */
-       if (op & SDMA_SENDCTRL_OP_ENABLE)
-               bits_on |= SD(CTRL_SDMA_ENABLE_SMASK);
-       else
-               bits_off |= SD(CTRL_SDMA_ENABLE_SMASK);
-       if (op & SDMA_SENDCTRL_OP_INTENABLE)
-               bits_on |= SD(CTRL_SDMA_INT_ENABLE_SMASK);
-       else
-               bits_off |= SD(CTRL_SDMA_INT_ENABLE_SMASK);
-       if (op & SDMA_SENDCTRL_OP_HALT)
-               bits_on |= SD(CTRL_SDMA_HALT_SMASK);
-       else
-               bits_off |= SD(CTRL_SDMA_HALT_SMASK);
-
-       spin_lock_irqsave(&sde->senddmactrl_lock, irq_state);
-
-       sde->p_senddmactrl = (sde->p_senddmactrl | bits_on) & ~bits_off;
-
-       /* cleanup goes to the hardware only, not into the shadow */
-       ctrl = sde->p_senddmactrl;
-       if (op & SDMA_SENDCTRL_OP_CLEANUP)
-               ctrl |= SD(CTRL_SDMA_CLEANUP_SMASK);
-       write_sde_csr(sde, SD(CTRL), ctrl);
-
-       spin_unlock_irqrestore(&sde->senddmactrl_lock, irq_state);
-
-#ifdef CONFIG_SDMA_VERBOSITY
-       sdma_dumpstate(sde);
-#endif
-}
-
-/*
- * Program the engine's LEN_GEN CSR from the descriptor queue size.
- */
-static void sdma_setlengen(struct sdma_engine *sde)
-{
-       u64 length_field;
-
-#ifdef CONFIG_SDMA_VERBOSITY
-       dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
-                  sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
-#endif
-
-       /* the length is programmed in units of 64 descriptors */
-       length_field = (sde->descq_cnt / 64) << SD(LEN_GEN_LENGTH_SHIFT);
-
-       /*
-        * Two writes: first the length with the generation MSB clear, then
-        * the same length with the MSB set.  This clear-then-set sequence
-        * enables generation checking and loads the internal generation
-        * counter.
-        */
-       write_sde_csr(sde, SD(LEN_GEN), length_field);
-       write_sde_csr(sde, SD(LEN_GEN),
-                     length_field | (4ULL << SD(LEN_GEN_GENERATION_SHIFT)));
-}
-
-/*
- * Publish a new tail index to the engine's tail CSR.
- */
-static inline void sdma_update_tail(struct sdma_engine *sde, u16 tail)
-{
-       /*
-        * Earlier memory writes must be committed before the chip is told
-        * about the new tail.
-        */
-       smp_wmb(); /* see get_txhead() */
-
-       /* advance the tail on the chip */
-       writeq(tail, sde->tail_csr);
-}
-
-/*
- * Called on the transition to state s10_hw_start_up_halt_wait, which is
- * entered as a result of send buffer errors or send DMA descriptor
- * errors.
- *
- * Reprograms LEN_GEN, zeroes the chip tail and the host-memory head
- * copy, and clears the header request FIFO uncorrectable error.
- */
-static void sdma_hw_start_up(struct sdma_engine *sde)
-{
-       const u64 clear_fifo_err =
-               SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_MASK) <<
-               SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SHIFT);
-
-#ifdef CONFIG_SDMA_VERBOSITY
-       dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
-                  sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
-#endif
-
-       sdma_setlengen(sde);
-       /* Set SendDmaTail */
-       sdma_update_tail(sde, 0);
-       *sde->head_dma = 0;
-
-       write_sde_csr(sde, SD(ENG_ERR_CLEAR), clear_fifo_err);
-}
-
-/*
- * Clear/set the "disallow PBC static rate control" bit in the lvalue r.
- * The argument is parenthesized so that any lvalue expression is safe.
- */
-#define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
-((r) &= ~SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
-
-#define SET_STATIC_RATE_CONTROL_SMASK(r) \
-((r) |= SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
-/*
- * set_sdma_integrity
- *
- * Set the SEND_DMA_CHECK_ENABLE register for send DMA engine 'sde'.
- *
- * Nothing is written when the NO_INTEGRITY capability is set.  Otherwise
- * the base value from hfi1_pkt_base_sdma_integrity() is used, with the
- * static rate control disallow bit cleared when STATIC_RATE_CTRL is set
- * and set when it is not.
- */
-static void set_sdma_integrity(struct sdma_engine *sde)
-{
-       struct hfi1_devdata *dd = sde->dd;
-       u64 reg;
-
-       if (unlikely(HFI1_CAP_IS_KSET(NO_INTEGRITY)))
-               return;
-
-       reg = hfi1_pkt_base_sdma_integrity(dd);
-
-       if (HFI1_CAP_IS_KSET(STATIC_RATE_CTRL))
-               CLEAR_STATIC_RATE_CONTROL_SMASK(reg);
-       else
-               SET_STATIC_RATE_CONTROL_SMASK(reg);
-
-       write_sde_csr(sde, SD(CHECK_ENABLE), reg);
-}
-
-/*
- * Program the initial CSR state of one engine.
- *
- * @sde: engine to program
- * @credits: SDMA memory credits given to this engine
- * @idle_cnt: value written to the RELOAD_CNT CSR
- *
- * The engine's memory window starts at credits * this_idx.
- */
-static void init_sdma_regs(
-       struct sdma_engine *sde,
-       u32 credits,
-       uint idle_cnt)
-{
-       u8 opval = OPCODE_CHECK_VAL_DISABLED;
-       u8 opmask = OPCODE_CHECK_MASK_DISABLED;
-       u64 mem_cnt, mem_index;
-#ifdef CONFIG_SDMA_VERBOSITY
-       struct hfi1_devdata *dd = sde->dd;
-
-       dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n",
-                  sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
-#endif
-
-       mem_cnt = (u64)credits << SD(MEMORY_SDMA_MEMORY_CNT_SHIFT);
-       mem_index = (u64)(credits * sde->this_idx) <<
-                   SD(MEMORY_SDMA_MEMORY_INDEX_SHIFT);
-
-       /* descriptor queue location and size */
-       write_sde_csr(sde, SD(BASE_ADDR), sde->descq_phys);
-       sdma_setlengen(sde);
-       /* Set SendDmaTail */
-       sdma_update_tail(sde, 0);
-
-       /* counters and head location */
-       write_sde_csr(sde, SD(RELOAD_CNT), idle_cnt);
-       write_sde_csr(sde, SD(DESC_CNT), 0);
-       write_sde_csr(sde, SD(HEAD_ADDR), sde->head_phys);
-
-       write_sde_csr(sde, SD(MEMORY), mem_cnt | mem_index);
-       write_sde_csr(sde, SD(ENG_ERR_MASK), ~0ull);
-       set_sdma_integrity(sde);
-
-       /* opcode checking is programmed with the "disabled" mask/value */
-       write_sde_csr(sde, SD(CHECK_OPCODE),
-                     (opmask << SEND_CTXT_CHECK_OPCODE_MASK_SHIFT) |
-                     (opval << SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT));
-}
-
-#ifdef CONFIG_SDMA_VERBOSITY /* CSR dump helpers, verbose builds only */
-
-#define sdma_dumpstate_helper0(reg) do { \
-               csr = read_csr(sde->dd, reg); \
-               dd_dev_err(sde->dd, "%36s     0x%016llx\n", #reg, csr); \
-       } while (0)
-
-#define sdma_dumpstate_helper(reg) do { \
-               csr = read_sde_csr(sde, reg); \
-               dd_dev_err(sde->dd, "%36s[%02u] 0x%016llx\n", \
-                       #reg, sde->this_idx, csr); \
-       } while (0)
-
-#define sdma_dumpstate_helper2(reg) do { \
-               csr = read_csr(sde->dd, reg + (8 * i)); \
-               dd_dev_err(sde->dd, "%33s_%02u     0x%016llx\n", \
-                               #reg, i, csr); \
-       } while (0)
-
-void sdma_dumpstate(struct sdma_engine *sde)
-{
-       u64 csr; /* scratch written by every helper macro above */
-       unsigned i; /* CSR index used by sdma_dumpstate_helper2 */
-
-       sdma_dumpstate_helper(SD(CTRL));
-       sdma_dumpstate_helper(SD(STATUS));
-       sdma_dumpstate_helper0(SD(ERR_STATUS));
-       sdma_dumpstate_helper0(SD(ERR_MASK));
-       sdma_dumpstate_helper(SD(ENG_ERR_STATUS));
-       sdma_dumpstate_helper(SD(ENG_ERR_MASK));
-
-       for (i = 0; i < CCE_NUM_INT_CSRS; ++i) { /* CSRs are 8 bytes apart */
-               sdma_dumpstate_helper2(CCE_INT_STATUS);
-               sdma_dumpstate_helper2(CCE_INT_MASK);
-               sdma_dumpstate_helper2(CCE_INT_BLOCKED);
-       }
-
-       sdma_dumpstate_helper(SD(TAIL));
-       sdma_dumpstate_helper(SD(HEAD));
-       sdma_dumpstate_helper(SD(PRIORITY_THLD));
-       sdma_dumpstate_helper(SD(IDLE_CNT));
-       sdma_dumpstate_helper(SD(RELOAD_CNT));
-       sdma_dumpstate_helper(SD(DESC_CNT));
-       sdma_dumpstate_helper(SD(DESC_FETCHED_CNT));
-       sdma_dumpstate_helper(SD(MEMORY));
-       sdma_dumpstate_helper0(SD(ENGINES));
-       sdma_dumpstate_helper0(SD(MEM_SIZE));
-       /* sdma_dumpstate_helper(SEND_EGRESS_SEND_DMA_STATUS);  */
-       sdma_dumpstate_helper(SD(BASE_ADDR));
-       sdma_dumpstate_helper(SD(LEN_GEN));
-       sdma_dumpstate_helper(SD(HEAD_ADDR));
-       sdma_dumpstate_helper(SD(CHECK_ENABLE));
-       sdma_dumpstate_helper(SD(CHECK_VL));
-       sdma_dumpstate_helper(SD(CHECK_JOB_KEY));
-       sdma_dumpstate_helper(SD(CHECK_PARTITION_KEY));
-       sdma_dumpstate_helper(SD(CHECK_SLID));
-       sdma_dumpstate_helper(SD(CHECK_OPCODE));
-}
-#endif /* CONFIG_SDMA_VERBOSITY */
-
-/*
- * Log the software view of the engine's descriptor ring.
- *
- * Prints the masked head and tail, the free count and whether the flush
- * list is non-empty, then decodes every descriptor from head up to (not
- * including) tail: flag letters, address, generation, byte count and the
- * raw quadwords.  For a descriptor with the FIRST flag the header index,
- * mode and dws fields are printed as well.
- */
-static void dump_sdma_state(struct sdma_engine *sde)
-{
-       struct hw_sdma_desc *descqp;
-       u64 desc[2];
-       u64 addr;
-       u8 gen;
-       u16 len;
-       u16 head, tail, cnt;
-
-       head = sde->descq_head & sde->sdma_mask;
-       tail = sde->descq_tail & sde->sdma_mask;
-       cnt = sdma_descq_freecnt(sde);
-
-       dd_dev_err(sde->dd,
-                  "SDMA (%u) descq_head: %u descq_tail: %u freecnt: %u FLE %d\n",
-                  sde->this_idx, head, tail, cnt,
-                  !list_empty(&sde->flushlist));
-
-       /* print info for each entry in the descriptor queue */
-       while (head != tail) {
-               /* I/H/F/L letters, '-' when the flag is clear */
-               char flags[6] = { 'x', 'x', 'x', 'x', 0 };
-
-               descqp = &sde->descq[head];
-               desc[0] = le64_to_cpu(descqp->qw[0]);
-               desc[1] = le64_to_cpu(descqp->qw[1]);
-               flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
-               flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ?
-                               'H' : '-';
-               flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
-               flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
-               addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT)
-                       & SDMA_DESC0_PHY_ADDR_MASK;
-               gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT)
-                       & SDMA_DESC1_GENERATION_MASK;
-               len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT)
-                       & SDMA_DESC0_BYTE_COUNT_MASK;
-               dd_dev_err(sde->dd,
-                          "SDMA sdmadesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n",
-                          head, flags, addr, gen, len);
-               dd_dev_err(sde->dd,
-                          "\tdesc0:0x%016llx desc1 0x%016llx\n",
-                          desc[0], desc[1]);
-               if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG)
-                       dd_dev_err(sde->dd,
-                                  "\taidx: %u amode: %u alen: %u\n",
-                                  (u8)((desc[1] &
-                                        SDMA_DESC1_HEADER_INDEX_SMASK) >>
-                                       SDMA_DESC1_HEADER_INDEX_SHIFT),
-                                  (u8)((desc[1] &
-                                        SDMA_DESC1_HEADER_MODE_SMASK) >>
-                                       SDMA_DESC1_HEADER_MODE_SHIFT),
-                                  (u8)((desc[1] &
-                                        SDMA_DESC1_HEADER_DWS_SMASK) >>
-                                       SDMA_DESC1_HEADER_DWS_SHIFT));
-               /* step to the next ring slot, wrapping at the ring size */
-               head++;
-               head &= sde->sdma_mask;
-       }
-}
-
-#define SDE_FMT \
-       "SDE %u CPU %d STE %s C 0x%llx S 0x%016llx E 0x%llx T(HW) 0x%llx T(SW) 0x%x H(HW) 0x%llx H(SW) 0x%x H(D) 0x%llx DM 0x%llx GL 0x%llx R 0x%llx LIS 0x%llx AHGI 0x%llx TXT %u TXH %u DT %u DH %u FLNE %d DQF %u SLC 0x%llx\n"
-/**
- * sdma_seqfile_dump_sde() - debugfs dump of sde
- * @s: seq file
- * @sde: send dma engine to dump
- *
- * This routine dumps the sde to the indicated seq file.
- *
- * The first line is formatted with SDE_FMT and mixes values read from the
- * engine CSRs (through read_sde_csr()) with fields of the software
- * structure.  It is followed by one line per descriptor between the masked
- * software head and tail, plus an extra line with the header index and
- * mode for descriptors that carry SDMA_DESC0_FIRST_DESC_FLAG.
- *
- * No lock is taken in this function; the tail is sampled once with
- * ACCESS_ONCE() before the walk starts.
- */
-void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *sde)
-{
-       u16 head, tail;
-       struct hw_sdma_desc *descqp;
-       u64 desc[2];
-       u64 addr;
-       u8 gen;
-       u16 len;
-
-       head = sde->descq_head & sde->sdma_mask;
-       tail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
-       seq_printf(s, SDE_FMT, sde->this_idx,
-                  sde->cpu,
-                  sdma_state_name(sde->state.current_state),
-                  (unsigned long long)read_sde_csr(sde, SD(CTRL)),
-                  (unsigned long long)read_sde_csr(sde, SD(STATUS)),
-                  (unsigned long long)read_sde_csr(sde, SD(ENG_ERR_STATUS)),
-                  (unsigned long long)read_sde_csr(sde, SD(TAIL)), tail,
-                  (unsigned long long)read_sde_csr(sde, SD(HEAD)), head,
-                  (unsigned long long)le64_to_cpu(*sde->head_dma),
-                  (unsigned long long)read_sde_csr(sde, SD(MEMORY)),
-                  (unsigned long long)read_sde_csr(sde, SD(LEN_GEN)),
-                  (unsigned long long)read_sde_csr(sde, SD(RELOAD_CNT)),
-                  (unsigned long long)sde->last_status,
-                  (unsigned long long)sde->ahg_bits,
-                  sde->tx_tail,
-                  sde->tx_head,
-                  sde->descq_tail, /* unmasked, unlike the local tail above */
-                  sde->descq_head, /* unmasked, unlike the local head above */
-                  !list_empty(&sde->flushlist),
-                  sde->descq_full_count,
-                  (unsigned long long)read_sde_csr(sde, SEND_DMA_CHECK_SLID));
-
-       /*
-        * print info for each entry in the descriptor queue
-        *
-        * flags[] is shown as four characters, each '-' when the bit is
-        * clear:
-        *   I = SDMA_DESC1_INT_REQ_FLAG,   H = SDMA_DESC1_HEAD_TO_HOST_FLAG,
-        *   F = SDMA_DESC0_FIRST_DESC_FLAG, L = SDMA_DESC0_LAST_DESC_FLAG
-        */
-       while (head != tail) {
-               char flags[6] = { 'x', 'x', 'x', 'x', 0 };
-
-               descqp = &sde->descq[head];
-               desc[0] = le64_to_cpu(descqp->qw[0]);
-               desc[1] = le64_to_cpu(descqp->qw[1]);
-               flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
-               flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ?
-                               'H' : '-';
-               flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
-               flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
-               addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT)
-                       & SDMA_DESC0_PHY_ADDR_MASK;
-               gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT)
-                       & SDMA_DESC1_GENERATION_MASK;
-               len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT)
-                       & SDMA_DESC0_BYTE_COUNT_MASK;
-               seq_printf(s,
-                          "\tdesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n",
-                          head, flags, addr, gen, len);
-               if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG)
-                       seq_printf(s, "\t\tahgidx: %u ahgmode: %u\n",
-                                  (u8)((desc[1] &
-                                        SDMA_DESC1_HEADER_INDEX_SMASK) >>
-                                       SDMA_DESC1_HEADER_INDEX_SHIFT),
-                                  (u8)((desc[1] &
-                                        SDMA_DESC1_HEADER_MODE_SMASK) >>
-                                       SDMA_DESC1_HEADER_MODE_SHIFT));
-               head = (head + 1) & sde->sdma_mask;
-       }
-}
-
-/*
- * add_gen() - add the generation number into the qw1 and return
- * @sde: sdma engine supplying the software tail
- * @qw1: second quadword of a descriptor
- *
- * The generation is the two bits of sde->descq_tail that sit directly
- * above sde->sdma_shift.  The bits selected by SDMA_DESC1_GENERATION_SMASK
- * are cleared in @qw1 before the new value is shifted into place, so the
- * caller's previous generation value does not matter.
- *
- * Return: @qw1 with the generation field replaced.
- */
-static inline u64 add_gen(struct sdma_engine *sde, u64 qw1)
-{
-       u8 generation = (sde->descq_tail >> sde->sdma_shift) & 3;
-
-       qw1 &= ~SDMA_DESC1_GENERATION_SMASK;
-       qw1 |= ((u64)generation & SDMA_DESC1_GENERATION_MASK)
-                       << SDMA_DESC1_GENERATION_SHIFT;
-       return qw1;
-}
-
-/*
- * This routine submits the indicated tx
- * @sde: sdma engine whose ring receives the descriptors
- * @tx: request whose tx->num_desc descriptors are copied from tx->descp
- *
- * Space has already been guaranteed and
- * tail side of ring is locked.
- *
- * The hardware tail update is done
- * in the caller and that is facilitated
- * by returning the new tail.
- *
- * There is special case logic for ahg
- * to not add the generation number for
- * up to 2 descriptors that follow the
- * first descriptor.
- *
- * Besides filling the ring, the routine advances sde->descq_tail once per
- * descriptor, records the resulting masked tail in tx->next_descq_idx,
- * stores @tx in sde->tx_ring at the slot indexed by sde->tx_tail, and
- * subtracts tx->num_desc from sde->desc_avail.
- *
- * Return: the new masked software tail.
- */
-static inline u16 submit_tx(struct sdma_engine *sde, struct sdma_txreq *tx)
-{
-       int i;
-       u16 tail;
-       struct sdma_desc *descp = tx->descp;
-       u8 skip = 0, mode = ahg_mode(tx);
-
-       tail = sde->descq_tail & sde->sdma_mask;
-       sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]);
-       sde->descq[tail].qw[1] = cpu_to_le64(add_gen(sde, descp->qw[1])); /* first descriptor always gets a generation */
-       trace_hfi1_sdma_descriptor(sde, descp->qw[0], descp->qw[1],
-                                  tail, &sde->descq[tail]);
-       tail = ++sde->descq_tail & sde->sdma_mask;
-       descp++;
-       if (mode > SDMA_AHG_APPLY_UPDATE1)
-               skip = mode >> 1; /* count of following descriptors copied without add_gen() */
-       for (i = 1; i < tx->num_desc; i++, descp++) {
-               u64 qw1;
-
-               sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]);
-               if (skip) {
-                       /* edits don't have generation */
-                       qw1 = descp->qw[1];
-                       skip--;
-               } else {
-                       /* replace generation with real one for non-edits */
-                       qw1 = add_gen(sde, descp->qw[1]);
-               }
-               sde->descq[tail].qw[1] = cpu_to_le64(qw1);
-               trace_hfi1_sdma_descriptor(sde, descp->qw[0], qw1,
-                                          tail, &sde->descq[tail]);
-               tail = ++sde->descq_tail & sde->sdma_mask;
-       }
-       tx->next_descq_idx = tail;
-#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
-       tx->sn = sde->tail_sn++;
-       trace_hfi1_sdma_in_sn(sde, tx->sn);
-       WARN_ON_ONCE(sde->tx_ring[sde->tx_tail & sde->sdma_mask]); /* warn if the tx_ring slot is still occupied */
-#endif
-       sde->tx_ring[sde->tx_tail++ & sde->sdma_mask] = tx;
-       sde->desc_avail -= tx->num_desc;
-       return tail;
-}
-
-/*
- * Check for progress
- * @sde: sdma engine that appeared to be out of descriptors
- * @wait: iowait structure of the submitter (may be NULL)
- * @tx: request that did not fit
- *
- * Refreshes sde->desc_avail from sdma_descq_freecnt() and decides what the
- * submitter should do next.
- *
- * Return:
- * -EAGAIN - @tx now fits in the refreshed free count, or the sleep
- *           callback returned -EAGAIN (desc_avail is refreshed again in
- *           that case); the callers in this file retry on this value,
- * -EBUSY  - @wait is NULL or has no sleep callback,
- * otherwise whatever wait->sleep() returned.
- */
-static int sdma_check_progress(
-       struct sdma_engine *sde,
-       struct iowait *wait,
-       struct sdma_txreq *tx)
-{
-       int ret;
-
-       sde->desc_avail = sdma_descq_freecnt(sde);
-       if (tx->num_desc <= sde->desc_avail)
-               return -EAGAIN;
-       /*
-        * pulse the head_lock
-        *
-        * The sequence value sampled here is handed to the sleep callback
-        * as its last argument.
-        */
-       if (wait && wait->sleep) {
-               unsigned seq;
-
-               seq = raw_seqcount_begin(
-                       (const seqcount_t *)&sde->head_lock.seqcount);
-               ret = wait->sleep(sde, wait, tx, seq);
-               if (ret == -EAGAIN)
-                       sde->desc_avail = sdma_descq_freecnt(sde);
-       } else {
-               ret = -EBUSY;
-       }
-       return ret;
-}
-
-/**
- * sdma_send_txreq() - submit a tx req to ring
- * @sde: sdma engine to use
- * @wait: wait structure to use when full (may be NULL)
- * @tx: sdma_txreq to submit
- *
- * The call submits the tx into the ring.  If an iowait structure is non-NULL
- * the packet will be queued to the list in wait.
- *
- * The whole operation after the tlen check runs under sde->tail_lock with
- * interrupts saved and disabled.  When the engine is not running, @tx is
- * appended to sde->flushlist (under sde->flushlist_lock), the flush worker
- * is scheduled and -ECOMM is returned.  When the ring has too few free
- * descriptors, sdma_check_progress() decides between retrying and
- * returning an error; sde->descq_full_count is incremented when an error is
- * returned from that path.
- *
- * Return:
- * 0 - Success, -EINVAL - sdma_txreq incomplete, -EBUSY - no space in
- * ring (wait == NULL)
- * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state
- */
-int sdma_send_txreq(struct sdma_engine *sde,
-                   struct iowait *wait,
-                   struct sdma_txreq *tx)
-{
-       int ret = 0;
-       u16 tail;
-       unsigned long flags;
-
-       /* user should have supplied entire packet */
-       if (unlikely(tx->tlen))
-               return -EINVAL; /* returned before the lock is taken and before tx->wait is set */
-       tx->wait = wait;
-       spin_lock_irqsave(&sde->tail_lock, flags);
-retry:
-       if (unlikely(!__sdma_running(sde)))
-               goto unlock_noconn;
-       if (unlikely(tx->num_desc > sde->desc_avail))
-               goto nodesc;
-       tail = submit_tx(sde, tx);
-       if (wait)
-               iowait_sdma_inc(wait);
-       sdma_update_tail(sde, tail);
-unlock:
-       spin_unlock_irqrestore(&sde->tail_lock, flags);
-       return ret;
-unlock_noconn:
-       if (wait)
-               iowait_sdma_inc(wait);
-       tx->next_descq_idx = 0;
-#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
-       tx->sn = sde->tail_sn++;
-       trace_hfi1_sdma_in_sn(sde, tx->sn);
-#endif
-       spin_lock(&sde->flushlist_lock);
-       list_add_tail(&tx->list, &sde->flushlist);
-       spin_unlock(&sde->flushlist_lock);
-       if (wait) {
-               wait->tx_count++;
-               wait->count += tx->num_desc;
-       }
-       schedule_work(&sde->flush_worker);
-       ret = -ECOMM;
-       goto unlock;
-nodesc:
-       ret = sdma_check_progress(sde, wait, tx);
-       if (ret == -EAGAIN) {
-               ret = 0; /* reset so a successful retry returns 0 */
-               goto retry;
-       }
-       sde->descq_full_count++;
-       goto unlock;
-}
-
-/**
- * sdma_send_txlist() - submit a list of tx req to ring
- * @sde: sdma engine to use
- * @wait: wait structure to use when full (may be NULL)
- * @tx_list: list of sdma_txreqs to submit
- *
- * The call submits the list into the ring.
- *
- * If the iowait structure is non-NULL and not equal to the iowait list
- * the unprocessed part of the list  will be appended to the list in wait.
- *
- * In all cases, the tx_list will be updated so the head of the tx_list is
- * the list of descriptors that have yet to be transmitted.
- *
- * The intent of this call is to provide a more efficient
- * way of submitting multiple packets to SDMA while holding the tail
- * side locking.
- *
- * The hardware tail is written each time the running count of submitted
- * requests has no bits in common with SDMA_TAIL_UPDATE_THRESH, and once
- * more before returning if an update is still pending.  When the engine is
- * found not running, every request still on @tx_list is moved to
- * sde->flushlist and the flush worker is scheduled.
- *
- * Return:
- * > 0 - Success (value is number of sdma_txreq's submitted),
- * -EINVAL - sdma_txreq incomplete, -EBUSY - no space in ring (wait == NULL)
- * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state
- *
- * An empty @tx_list returns 0.  On any error the negative value is
- * returned in place of the count, even when some requests had already been
- * submitted.
- */
-int sdma_send_txlist(struct sdma_engine *sde, struct iowait *wait,
-                    struct list_head *tx_list)
-{
-       struct sdma_txreq *tx, *tx_next;
-       int ret = 0;
-       unsigned long flags;
-       u16 tail = INVALID_TAIL; /* INVALID_TAIL means no hardware tail update is pending */
-       int count = 0;
-
-       spin_lock_irqsave(&sde->tail_lock, flags);
-retry:
-       list_for_each_entry_safe(tx, tx_next, tx_list, list) {
-               tx->wait = wait;
-               if (unlikely(!__sdma_running(sde)))
-                       goto unlock_noconn;
-               if (unlikely(tx->num_desc > sde->desc_avail))
-                       goto nodesc;
-               if (unlikely(tx->tlen)) {
-                       ret = -EINVAL;
-                       goto update_tail;
-               }
-               list_del_init(&tx->list);
-               tail = submit_tx(sde, tx);
-               count++;
-               if (tail != INVALID_TAIL &&
-                   (count & SDMA_TAIL_UPDATE_THRESH) == 0) {
-                       sdma_update_tail(sde, tail);
-                       tail = INVALID_TAIL;
-               }
-       }
-update_tail:
-       if (wait)
-               iowait_sdma_add(wait, count);
-       if (tail != INVALID_TAIL)
-               sdma_update_tail(sde, tail);
-       spin_unlock_irqrestore(&sde->tail_lock, flags);
-       return ret == 0 ? count : ret;
-unlock_noconn:
-       spin_lock(&sde->flushlist_lock);
-       list_for_each_entry_safe(tx, tx_next, tx_list, list) {
-               tx->wait = wait;
-               list_del_init(&tx->list);
-               if (wait)
-                       iowait_sdma_inc(wait);
-               tx->next_descq_idx = 0;
-#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
-               tx->sn = sde->tail_sn++;
-               trace_hfi1_sdma_in_sn(sde, tx->sn);
-#endif
-               list_add_tail(&tx->list, &sde->flushlist);
-               if (wait) {
-                       wait->tx_count++;
-                       wait->count += tx->num_desc;
-               }
-       }
-       spin_unlock(&sde->flushlist_lock);
-       schedule_work(&sde->flush_worker);
-       ret = -ECOMM;
-       goto update_tail;
-nodesc:
-       ret = sdma_check_progress(sde, wait, tx);
-       if (ret == -EAGAIN) {
-               ret = 0; /* reset so a successful retry returns the count */
-               goto retry;
-       }
-       sde->descq_full_count++;
-       goto update_tail;
-}
-/*
- * sdma_process_event() - feed one event to the engine state machine
- * @sde: sdma engine
- * @event: event to process
- *
- * Takes sde->tail_lock (interrupts saved and disabled) and then the write
- * side of sde->head_lock before calling __sdma_process_event().  If the
- * engine is in sdma_state_s99_running afterwards, sdma_desc_avail() is
- * called with the current free descriptor count while both locks are still
- * held.  The locks are released in the reverse order.
- */
-static void sdma_process_event(struct sdma_engine *sde, enum sdma_events event)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&sde->tail_lock, flags);
-       write_seqlock(&sde->head_lock);
-
-       __sdma_process_event(sde, event);
-
-       if (sde->state.current_state == sdma_state_s99_running)
-               sdma_desc_avail(sde, sdma_descq_freecnt(sde));
-
-       write_sequnlock(&sde->head_lock);
-       spin_unlock_irqrestore(&sde->tail_lock, flags);
-}
-
-static void __sdma_process_event(struct sdma_engine *sde,
-                                enum sdma_events event)
-{
-       struct sdma_state *ss = &sde->state;
-       int need_progress = 0;
-
-       /* CONFIG SDMA temporary */
-#ifdef CONFIG_SDMA_VERBOSITY
-       dd_dev_err(sde->dd, "CONFIG SDMA(%u) [%s] %s\n", sde->this_idx,
-                  sdma_state_names[ss->current_state],
-                  sdma_event_names[event]);
-#endif
-
-       switch (ss->current_state) {
-       case sdma_state_s00_hw_down:
-               switch (event) {
-               case sdma_event_e00_go_hw_down:
-                       break;
-               case sdma_event_e30_go_running:
-                       /*
-                        * If down, but running requested (usually result
-                        * of link up, then we need to start up.
-                        * This can happen when hw down is requested while
-                        * bringing the link up with traffic active on
-                        * 7220, e.g.
-                        */
-                       ss->go_s99_running = 1;
-                       /* fall through and start dma engine */
-               case sdma_event_e10_go_hw_start:
-                       /* This reference means the state machine is started */
-                       sdma_get(&sde->state);
-                       sdma_set_state(sde,
-                                      sdma_state_s10_hw_start_up_halt_wait);
-                       break;
-               case sdma_event_e15_hw_halt_done:
-                       break;
-               case sdma_event_e25_hw_clean_up_done:
-                       break;
-               case sdma_event_e40_sw_cleaned:
-                       sdma_sw_tear_down(sde);
-                       break;
-               case sdma_event_e50_hw_cleaned:
-                       break;
-               case sdma_event_e60_hw_halted:
-                       break;
-               case sdma_event_e70_go_idle:
-                       break;
-               case sdma_event_e80_hw_freeze:
-                       break;
-               case sdma_event_e81_hw_frozen:
-                       break;
-               case sdma_event_e82_hw_unfreeze:
-                       break;
-               case sdma_event_e85_link_down:
-                       break;
-               case sdma_event_e90_sw_halted:
-                       break;
-               }
-               break;
-
-       case sdma_state_s10_hw_start_up_halt_wait:
-               switch (event) {
-               case sdma_event_e00_go_hw_down:
-                       sdma_set_state(sde, sdma_state_s00_hw_down);
-                       sdma_sw_tear_down(sde);
-                       break;
-               case sdma_event_e10_go_hw_start:
-                       break;
-               case sdma_event_e15_hw_halt_done:
-                       sdma_set_state(sde,
-                                      sdma_state_s15_hw_start_up_clean_wait);
-                       sdma_start_hw_clean_up(sde);
-                       break;
-               case sdma_event_e25_hw_clean_up_done:
-                       break;
-               case sdma_event_e30_go_running:
-                       ss->go_s99_running = 1;
-                       break;
-               case sdma_event_e40_sw_cleaned:
-                       break;
-               case sdma_event_e50_hw_cleaned:
-                       break;
-               case sdma_event_e60_hw_halted:
-                       schedule_work(&sde->err_halt_worker);
-                       break;
-               case sdma_event_e70_go_idle:
-                       ss->go_s99_running = 0;
-                       break;
-               case sdma_event_e80_hw_freeze:
-                       break;
-               case sdma_event_e81_hw_frozen:
-                       break;
-               case sdma_event_e82_hw_unfreeze:
-                       break;
-               case sdma_event_e85_link_down:
-                       break;
-               case sdma_event_e90_sw_halted:
-                       break;
-               }
-               break;
-
-       case sdma_state_s15_hw_start_up_clean_wait:
-               switch (event) {
-               case sdma_event_e00_go_hw_down:
-                       sdma_set_state(sde, sdma_state_s00_hw_down);
-                       sdma_sw_tear_down(sde);
-                       break;
-               case sdma_event_e10_go_hw_start:
-                       break;
-               case sdma_event_e15_hw_halt_done:
-                       break;
-               case sdma_event_e25_hw_clean_up_done:
-                       sdma_hw_start_up(sde);
-                       sdma_set_state(sde, ss->go_s99_running ?
-                                      sdma_state_s99_running :
-                                      sdma_state_s20_idle);
-                       break;
-               case sdma_event_e30_go_running:
-                       ss->go_s99_running = 1;
-                       break;
-               case sdma_event_e40_sw_cleaned:
-                       break;
-               case sdma_event_e50_hw_cleaned:
-                       break;
-               case sdma_event_e60_hw_halted:
-                       break;
-               case sdma_event_e70_go_idle:
-                       ss->go_s99_running = 0;
-                       break;
-               case sdma_event_e80_hw_freeze:
-                       break;
-               case sdma_event_e81_hw_frozen:
-                       break;
-               case sdma_event_e82_hw_unfreeze:
-                       break;
-               case sdma_event_e85_link_down:
-                       break;
-               case sdma_event_e90_sw_halted:
-                       break;
-               }
-               break;
-
-       case sdma_state_s20_idle:
-               switch (event) {
-               case sdma_event_e00_go_hw_down:
-                       sdma_set_state(sde, sdma_state_s00_hw_down);
-                       sdma_sw_tear_down(sde);
-                       break;
-               case sdma_event_e10_go_hw_start:
-                       break;
-               case sdma_event_e15_hw_halt_done:
-                       break;
-               case sdma_event_e25_hw_clean_up_done:
-                       break;
-               case sdma_event_e30_go_running:
-                       sdma_set_state(sde, sdma_state_s99_running);
-                       ss->go_s99_running = 1;
-                       break;
-               case sdma_event_e40_sw_cleaned:
-                       break;
-               case sdma_event_e50_hw_cleaned:
-                       break;
-               case sdma_event_e60_hw_halted:
-                       sdma_set_state(sde, sdma_state_s50_hw_halt_wait);
-                       schedule_work(&sde->err_halt_worker);
-                       break;
-               case sdma_event_e70_go_idle:
-                       break;
-               case sdma_event_e85_link_down:
-                       /* fall through */
-               case sdma_event_e80_hw_freeze:
-                       sdma_set_state(sde, sdma_state_s80_hw_freeze);
-                       atomic_dec(&sde->dd->sdma_unfreeze_count);
-                       wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
-                       break;
-               case sdma_event_e81_hw_frozen:
-                       break;
-               case sdma_event_e82_hw_unfreeze:
-                       break;
-               case sdma_event_e90_sw_halted:
-                       break;
-               }
-               break;
-
-       case sdma_state_s30_sw_clean_up_wait:
-               switch (event) {
-               case sdma_event_e00_go_hw_down:
-                       sdma_set_state(sde, sdma_state_s00_hw_down);
-                       break;
-               case sdma_event_e10_go_hw_start:
-                       break;
-               case sdma_event_e15_hw_halt_done:
-                       break;
-               case sdma_event_e25_hw_clean_up_done:
-                       break;
-               case sdma_event_e30_go_running:
-                       ss->go_s99_running = 1;
-                       break;
-               case sdma_event_e40_sw_cleaned:
-                       sdma_set_state(sde, sdma_state_s40_hw_clean_up_wait);
-                       sdma_start_hw_clean_up(sde);
-                       break;
-               case sdma_event_e50_hw_cleaned:
-                       break;
-               case sdma_event_e60_hw_halted:
-                       break;
-               case sdma_event_e70_go_idle:
-                       ss->go_s99_running = 0;
-                       break;
-               case sdma_event_e80_hw_freeze:
-                       break;
-               case sdma_event_e81_hw_frozen:
-                       break;
-               case sdma_event_e82_hw_unfreeze:
-                       break;
-               case sdma_event_e85_link_down:
-                       ss->go_s99_running = 0;
-                       break;
-               case sdma_event_e90_sw_halted:
-                       break;
-               }
-               break;
-
-       case sdma_state_s40_hw_clean_up_wait:
-               switch (event) {
-               case sdma_event_e00_go_hw_down:
-                       sdma_set_state(sde, sdma_state_s00_hw_down);
-                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
-                       break;
-               case sdma_event_e10_go_hw_start:
-                       break;
-               case sdma_event_e15_hw_halt_done:
-                       break;
-               case sdma_event_e25_hw_clean_up_done:
-                       sdma_hw_start_up(sde);
-                       sdma_set_state(sde, ss->go_s99_running ?
-                                      sdma_state_s99_running :
-                                      sdma_state_s20_idle);
-                       break;
-               case sdma_event_e30_go_running:
-                       ss->go_s99_running = 1;
-                       break;
-               case sdma_event_e40_sw_cleaned:
-                       break;
-               case sdma_event_e50_hw_cleaned:
-                       break;
-               case sdma_event_e60_hw_halted:
-                       break;
-               case sdma_event_e70_go_idle:
-                       ss->go_s99_running = 0;
-                       break;
-               case sdma_event_e80_hw_freeze:
-                       break;
-               case sdma_event_e81_hw_frozen:
-                       break;
-               case sdma_event_e82_hw_unfreeze:
-                       break;
-               case sdma_event_e85_link_down:
-                       ss->go_s99_running = 0;
-                       break;
-               case sdma_event_e90_sw_halted:
-                       break;
-               }
-               break;
-
-       case sdma_state_s50_hw_halt_wait:
-               switch (event) {
-               case sdma_event_e00_go_hw_down:
-                       sdma_set_state(sde, sdma_state_s00_hw_down);
-                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
-                       break;
-               case sdma_event_e10_go_hw_start:
-                       break;
-               case sdma_event_e15_hw_halt_done:
-                       sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait);
-                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
-                       break;
-               case sdma_event_e25_hw_clean_up_done:
-                       break;
-               case sdma_event_e30_go_running:
-                       ss->go_s99_running = 1;
-                       break;
-               case sdma_event_e40_sw_cleaned:
-                       break;
-               case sdma_event_e50_hw_cleaned:
-                       break;
-               case sdma_event_e60_hw_halted:
-                       schedule_work(&sde->err_halt_worker);
-                       break;
-               case sdma_event_e70_go_idle:
-                       ss->go_s99_running = 0;
-                       break;
-               case sdma_event_e80_hw_freeze:
-                       break;
-               case sdma_event_e81_hw_frozen:
-                       break;
-               case sdma_event_e82_hw_unfreeze:
-                       break;
-               case sdma_event_e85_link_down:
-                       ss->go_s99_running = 0;
-                       break;
-               case sdma_event_e90_sw_halted:
-                       break;
-               }
-               break;
-
-       case sdma_state_s60_idle_halt_wait:
-               switch (event) {
-               case sdma_event_e00_go_hw_down:
-                       sdma_set_state(sde, sdma_state_s00_hw_down);
-                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
-                       break;
-               case sdma_event_e10_go_hw_start:
-                       break;
-               case sdma_event_e15_hw_halt_done:
-                       sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait);
-                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
-                       break;
-               case sdma_event_e25_hw_clean_up_done:
-                       break;
-               case sdma_event_e30_go_running:
-                       ss->go_s99_running = 1;
-                       break;
-               case sdma_event_e40_sw_cleaned:
-                       break;
-               case sdma_event_e50_hw_cleaned:
-                       break;
-               case sdma_event_e60_hw_halted:
-                       schedule_work(&sde->err_halt_worker);
-                       break;
-               case sdma_event_e70_go_idle:
-                       ss->go_s99_running = 0;
-                       break;
-               case sdma_event_e80_hw_freeze:
-                       break;
-               case sdma_event_e81_hw_frozen:
-                       break;
-               case sdma_event_e82_hw_unfreeze:
-                       break;
-               case sdma_event_e85_link_down:
-                       break;
-               case sdma_event_e90_sw_halted:
-                       break;
-               }
-               break;
-
-       case sdma_state_s80_hw_freeze:
-               switch (event) {
-               case sdma_event_e00_go_hw_down:
-                       sdma_set_state(sde, sdma_state_s00_hw_down);
-                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
-                       break;
-               case sdma_event_e10_go_hw_start:
-                       break;
-               case sdma_event_e15_hw_halt_done:
-                       break;
-               case sdma_event_e25_hw_clean_up_done:
-                       break;
-               case sdma_event_e30_go_running:
-                       ss->go_s99_running = 1;
-                       break;
-               case sdma_event_e40_sw_cleaned:
-                       break;
-               case sdma_event_e50_hw_cleaned:
-                       break;
-               case sdma_event_e60_hw_halted:
-                       break;
-               case sdma_event_e70_go_idle:
-                       ss->go_s99_running = 0;
-                       break;
-               case sdma_event_e80_hw_freeze:
-                       break;
-               case sdma_event_e81_hw_frozen:
-                       sdma_set_state(sde, sdma_state_s82_freeze_sw_clean);
-                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
-                       break;
-               case sdma_event_e82_hw_unfreeze:
-                       break;
-               case sdma_event_e85_link_down:
-                       break;
-               case sdma_event_e90_sw_halted:
-                       break;
-               }
-               break;
-
-       case sdma_state_s82_freeze_sw_clean:
-               switch (event) {
-               case sdma_event_e00_go_hw_down:
-                       sdma_set_state(sde, sdma_state_s00_hw_down);
-                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
-                       break;
-               case sdma_event_e10_go_hw_start:
-                       break;
-               case sdma_event_e15_hw_halt_done:
-                       break;
-               case sdma_event_e25_hw_clean_up_done:
-                       break;
-               case sdma_event_e30_go_running:
-                       ss->go_s99_running = 1;
-                       break;
-               case sdma_event_e40_sw_cleaned:
-                       /* notify caller this engine is done cleaning */
-                       atomic_dec(&sde->dd->sdma_unfreeze_count);
-                       wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
-                       break;
-               case sdma_event_e50_hw_cleaned:
-                       break;
-               case sdma_event_e60_hw_halted:
-                       break;
-               case sdma_event_e70_go_idle:
-                       ss->go_s99_running = 0;
-                       break;
-               case sdma_event_e80_hw_freeze:
-                       break;
-               case sdma_event_e81_hw_frozen:
-                       break;
-               case sdma_event_e82_hw_unfreeze:
-                       sdma_hw_start_up(sde);
-                       sdma_set_state(sde, ss->go_s99_running ?
-                                      sdma_state_s99_running :
-                                      sdma_state_s20_idle);
-                       break;
-               case sdma_event_e85_link_down:
-                       break;
-               case sdma_event_e90_sw_halted:
-                       break;
-               }
-               break;
-
-       case sdma_state_s99_running:
-               switch (event) {
-               case sdma_event_e00_go_hw_down:
-                       sdma_set_state(sde, sdma_state_s00_hw_down);
-                       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
-                       break;
-               case sdma_event_e10_go_hw_start:
-                       break;
-               case sdma_event_e15_hw_halt_done:
-                       break;
-               case sdma_event_e25_hw_clean_up_done:
-                       break;
-               case sdma_event_e30_go_running:
-                       break;
-               case sdma_event_e40_sw_cleaned:
-                       break;
-               case sdma_event_e50_hw_cleaned:
-                       break;
-               case sdma_event_e60_hw_halted:
-                       need_progress = 1;
-                       sdma_err_progress_check_schedule(sde);
-               case sdma_event_e90_sw_halted:
-                       /*
-                       * SW initiated halt does not perform engines
-                       * progress check
-                       */
-                       sdma_set_state(sde, sdma_state_s50_hw_halt_wait);
-                       schedule_work(&sde->err_halt_worker);
-                       break;
-               case sdma_event_e70_go_idle:
-                       sdma_set_state(sde, sdma_state_s60_idle_halt_wait);
-                       break;
-               case sdma_event_e85_link_down:
-                       ss->go_s99_running = 0;
-                       /* fall through */
-               case sdma_event_e80_hw_freeze:
-                       sdma_set_state(sde, sdma_state_s80_hw_freeze);
-                       atomic_dec(&sde->dd->sdma_unfreeze_count);
-                       wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
-                       break;
-               case sdma_event_e81_hw_frozen:
-                       break;
-               case sdma_event_e82_hw_unfreeze:
-                       break;
-               }
-               break;
-       }
-
-       ss->last_event = event;
-       if (need_progress)
-               sdma_make_progress(sde, 0);
-}
-
-/*
- * _extend_sdma_tx_descs() - helper to extend txreq
- *
- * This is called once the initial nominal allocation
- * of descriptors in the sdma_txreq is exhausted.
- *
- * The code will bump the allocation up to the max
- * of MAX_DESC (64) descriptors. There doesn't seem
- * much point in an interim step. The last descriptor
- * is reserved for coalesce buffer in order to support
- * cases where input packet has >MAX_DESC iovecs.
- *
- */
-static int _extend_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
-{
-       int i;
-
-       /* Handle last descriptor */
-       if (unlikely((tx->num_desc == (MAX_DESC - 1)))) {
-               /* if tlen is 0, it is for padding, release last descriptor */
-               if (!tx->tlen) {
-                       tx->desc_limit = MAX_DESC;
-               } else if (!tx->coalesce_buf) {
-                       /* allocate coalesce buffer with space for padding */
-                       tx->coalesce_buf = kmalloc(tx->tlen + sizeof(u32),
-                                                  GFP_ATOMIC);
-                       if (!tx->coalesce_buf)
-                               goto enomem;
-                       tx->coalesce_idx = 0;
-               }
-               return 0;
-       }
-
-       if (unlikely(tx->num_desc == MAX_DESC))
-               goto enomem;
-
-       tx->descp = kmalloc_array(
-                       MAX_DESC,
-                       sizeof(struct sdma_desc),
-                       GFP_ATOMIC);
-       if (!tx->descp)
-               goto enomem;
-
-       /* reserve last descriptor for coalescing */
-       tx->desc_limit = MAX_DESC - 1;
-       /* copy ones already built */
-       for (i = 0; i < tx->num_desc; i++)
-               tx->descp[i] = tx->descs[i];
-       return 0;
-enomem:
-       sdma_txclean(dd, tx);
-       return -ENOMEM;
-}
-
-/*
- * ext_coal_sdma_tx_descs() - extend or coalesce sdma tx descriptors
- *
- * This is called once the initial nominal allocation of descriptors
- * in the sdma_txreq is exhausted.
- *
- * This function calls _extend_sdma_tx_descs to extend or allocate
- * coalesce buffer. If there is a allocated coalesce buffer, it will
- * copy the input packet data into the coalesce buffer. It also adds
- * coalesce buffer descriptor once when whole packet is received.
- *
- * Return:
- * <0 - error
- * 0 - coalescing, don't populate descriptor
- * 1 - continue with populating descriptor
- */
-int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx,
-                          int type, void *kvaddr, struct page *page,
-                          unsigned long offset, u16 len)
-{
-       int pad_len, rval;
-       dma_addr_t addr;
-
-       rval = _extend_sdma_tx_descs(dd, tx);
-       if (rval) {
-               sdma_txclean(dd, tx);
-               return rval;
-       }
-
-       /* If coalesce buffer is allocated, copy data into it */
-       if (tx->coalesce_buf) {
-               if (type == SDMA_MAP_NONE) {
-                       sdma_txclean(dd, tx);
-                       return -EINVAL;
-               }
-
-               if (type == SDMA_MAP_PAGE) {
-                       kvaddr = kmap(page);
-                       kvaddr += offset;
-               } else if (WARN_ON(!kvaddr)) {
-                       sdma_txclean(dd, tx);
-                       return -EINVAL;
-               }
-
-               memcpy(tx->coalesce_buf + tx->coalesce_idx, kvaddr, len);
-               tx->coalesce_idx += len;
-               if (type == SDMA_MAP_PAGE)
-                       kunmap(page);
-
-               /* If there is more data, return */
-               if (tx->tlen - tx->coalesce_idx)
-                       return 0;
-
-               /* Whole packet is received; add any padding */
-               pad_len = tx->packet_len & (sizeof(u32) - 1);
-               if (pad_len) {
-                       pad_len = sizeof(u32) - pad_len;
-                       memset(tx->coalesce_buf + tx->coalesce_idx, 0, pad_len);
-                       /* padding is taken care of for coalescing case */
-                       tx->packet_len += pad_len;
-                       tx->tlen += pad_len;
-               }
-
-               /* dma map the coalesce buffer */
-               addr = dma_map_single(&dd->pcidev->dev,
-                                     tx->coalesce_buf,
-                                     tx->tlen,
-                                     DMA_TO_DEVICE);
-
-               if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
-                       sdma_txclean(dd, tx);
-                       return -ENOSPC;
-               }
-
-               /* Add descriptor for coalesce buffer */
-               tx->desc_limit = MAX_DESC;
-               return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, tx,
-                                        addr, tx->tlen);
-       }
-
-       return 1;
-}
-
-/* Update sdes when the lmc changes */
-void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid)
-{
-       struct sdma_engine *sde;
-       int i;
-       u64 sreg;
-
-       sreg = ((mask & SD(CHECK_SLID_MASK_MASK)) <<
-               SD(CHECK_SLID_MASK_SHIFT)) |
-               (((lid & mask) & SD(CHECK_SLID_VALUE_MASK)) <<
-               SD(CHECK_SLID_VALUE_SHIFT));
-
-       for (i = 0; i < dd->num_sdma; i++) {
-               hfi1_cdbg(LINKVERB, "SendDmaEngine[%d].SLID_CHECK = 0x%x",
-                         i, (u32)sreg);
-               sde = &dd->per_sdma[i];
-               write_sde_csr(sde, SD(CHECK_SLID), sreg);
-       }
-}
-
-/* tx not dword sized - pad */
-int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
-{
-       int rval = 0;
-
-       tx->num_desc++;
-       if ((unlikely(tx->num_desc == tx->desc_limit))) {
-               rval = _extend_sdma_tx_descs(dd, tx);
-               if (rval) {
-                       sdma_txclean(dd, tx);
-                       return rval;
-               }
-       }
-       /* finish the one just added */
-       make_tx_sdma_desc(
-               tx,
-               SDMA_MAP_NONE,
-               dd->sdma_pad_phys,
-               sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1)));
-       _sdma_close_tx(dd, tx);
-       return rval;
-}
-
-/*
- * Add ahg to the sdma_txreq
- *
- * The logic will consume up to 3
- * descriptors at the beginning of
- * sdma_txreq.
- */
-void _sdma_txreq_ahgadd(
-       struct sdma_txreq *tx,
-       u8 num_ahg,
-       u8 ahg_entry,
-       u32 *ahg,
-       u8 ahg_hlen)
-{
-       u32 i, shift = 0, desc = 0;
-       u8 mode;
-
-       WARN_ON_ONCE(num_ahg > 9 || (ahg_hlen & 3) || ahg_hlen == 4);
-       /* compute mode */
-       if (num_ahg == 1)
-               mode = SDMA_AHG_APPLY_UPDATE1;
-       else if (num_ahg <= 5)
-               mode = SDMA_AHG_APPLY_UPDATE2;
-       else
-               mode = SDMA_AHG_APPLY_UPDATE3;
-       tx->num_desc++;
-       /* initialize to consumed descriptors to zero */
-       switch (mode) {
-       case SDMA_AHG_APPLY_UPDATE3:
-               tx->num_desc++;
-               tx->descs[2].qw[0] = 0;
-               tx->descs[2].qw[1] = 0;
-               /* FALLTHROUGH */
-       case SDMA_AHG_APPLY_UPDATE2:
-               tx->num_desc++;
-               tx->descs[1].qw[0] = 0;
-               tx->descs[1].qw[1] = 0;
-               break;
-       }
-       ahg_hlen >>= 2;
-       tx->descs[0].qw[1] |=
-               (((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK)
-                       << SDMA_DESC1_HEADER_INDEX_SHIFT) |
-               (((u64)ahg_hlen & SDMA_DESC1_HEADER_DWS_MASK)
-                       << SDMA_DESC1_HEADER_DWS_SHIFT) |
-               (((u64)mode & SDMA_DESC1_HEADER_MODE_MASK)
-                       << SDMA_DESC1_HEADER_MODE_SHIFT) |
-               (((u64)ahg[0] & SDMA_DESC1_HEADER_UPDATE1_MASK)
-                       << SDMA_DESC1_HEADER_UPDATE1_SHIFT);
-       for (i = 0; i < (num_ahg - 1); i++) {
-               if (!shift && !(i & 2))
-                       desc++;
-               tx->descs[desc].qw[!!(i & 2)] |=
-                       (((u64)ahg[i + 1])
-                               << shift);
-               shift = (shift + 32) & 63;
-       }
-}
-
-/**
- * sdma_ahg_alloc - allocate an AHG entry
- * @sde: engine to allocate from
- *
- * Return:
- * 0-31 when successful, -EOPNOTSUPP if AHG is not enabled,
- * -ENOSPC if an entry is not available
- */
-int sdma_ahg_alloc(struct sdma_engine *sde)
-{
-       int nr;
-       int oldbit;
-
-       if (!sde) {
-               trace_hfi1_ahg_allocate(sde, -EINVAL);
-               return -EINVAL;
-       }
-       while (1) {
-               nr = ffz(ACCESS_ONCE(sde->ahg_bits));
-               if (nr > 31) {
-                       trace_hfi1_ahg_allocate(sde, -ENOSPC);
-                       return -ENOSPC;
-               }
-               oldbit = test_and_set_bit(nr, &sde->ahg_bits);
-               if (!oldbit)
-                       break;
-               cpu_relax();
-       }
-       trace_hfi1_ahg_allocate(sde, nr);
-       return nr;
-}
-
-/**
- * sdma_ahg_free - free an AHG entry
- * @sde: engine to return AHG entry
- * @ahg_index: index to free
- *
- * This routine frees the indicate AHG entry.
- */
-void sdma_ahg_free(struct sdma_engine *sde, int ahg_index)
-{
-       if (!sde)
-               return;
-       trace_hfi1_ahg_deallocate(sde, ahg_index);
-       if (ahg_index < 0 || ahg_index > 31)
-               return;
-       clear_bit(ahg_index, &sde->ahg_bits);
-}
-
-/*
- * SPC freeze handling for SDMA engines.  Called when the driver knows
- * the SPC is going into a freeze but before the freeze is fully
- * settled.  Generally an error interrupt.
- *
- * This event will pull the engine out of running so no more entries can be
- * added to the engine's queue.
- */
-void sdma_freeze_notify(struct hfi1_devdata *dd, int link_down)
-{
-       int i;
-       enum sdma_events event = link_down ? sdma_event_e85_link_down :
-                                            sdma_event_e80_hw_freeze;
-
-       /* set up the wait but do not wait here */
-       atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma);
-
-       /* tell all engines to stop running and wait */
-       for (i = 0; i < dd->num_sdma; i++)
-               sdma_process_event(&dd->per_sdma[i], event);
-
-       /* sdma_freeze() will wait for all engines to have stopped */
-}
-
-/*
- * SPC freeze handling for SDMA engines.  Called when the driver knows
- * the SPC is fully frozen.
- */
-void sdma_freeze(struct hfi1_devdata *dd)
-{
-       int i;
-       int ret;
-
-       /*
-        * Make sure all engines have moved out of the running state before
-        * continuing.
-        */
-       ret = wait_event_interruptible(dd->sdma_unfreeze_wq,
-                                      atomic_read(&dd->sdma_unfreeze_count) <=
-                                      0);
-       /* interrupted or count is negative, then unloading - just exit */
-       if (ret || atomic_read(&dd->sdma_unfreeze_count) < 0)
-               return;
-
-       /* set up the count for the next wait */
-       atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma);
-
-       /* tell all engines that the SPC is frozen, they can start cleaning */
-       for (i = 0; i < dd->num_sdma; i++)
-               sdma_process_event(&dd->per_sdma[i], sdma_event_e81_hw_frozen);
-
-       /*
-        * Wait for everyone to finish software clean before exiting.  The
-        * software clean will read engine CSRs, so must be completed before
-        * the next step, which will clear the engine CSRs.
-        */
-       (void)wait_event_interruptible(dd->sdma_unfreeze_wq,
-                               atomic_read(&dd->sdma_unfreeze_count) <= 0);
-       /* no need to check results - done no matter what */
-}
-
-/*
- * SPC freeze handling for the SDMA engines.  Called after the SPC is unfrozen.
- *
- * The SPC freeze acts like a SDMA halt and a hardware clean combined.  All
- * that is left is a software clean.  We could do it after the SPC is fully
- * frozen, but then we'd have to add another state to wait for the unfreeze.
- * Instead, just defer the software clean until the unfreeze step.
- */
-void sdma_unfreeze(struct hfi1_devdata *dd)
-{
-       int i;
-
-       /* tell all engines start freeze clean up */
-       for (i = 0; i < dd->num_sdma; i++)
-               sdma_process_event(&dd->per_sdma[i],
-                                  sdma_event_e82_hw_unfreeze);
-}
-
-/**
- * _sdma_engine_progress_schedule() - schedule progress on engine
- * @sde: sdma_engine to schedule progress
- *
- */
-void _sdma_engine_progress_schedule(
-       struct sdma_engine *sde)
-{
-       trace_hfi1_sdma_engine_progress(sde, sde->progress_mask);
-       /* assume we have selected a good cpu */
-       write_csr(sde->dd,
-                 CCE_INT_FORCE + (8 * (IS_SDMA_START / 64)),
-                 sde->progress_mask);
-}
diff --git a/drivers/staging/rdma/hfi1/sdma.h b/drivers/staging/rdma/hfi1/sdma.h
deleted file mode 100644 (file)
index 8f50c99..0000000
+++ /dev/null
@@ -1,1082 +0,0 @@
-#ifndef _HFI1_SDMA_H
-#define _HFI1_SDMA_H
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/types.h>
-#include <linux/list.h>
-#include <asm/byteorder.h>
-#include <linux/workqueue.h>
-#include <linux/rculist.h>
-
-#include "hfi.h"
-#include "verbs.h"
-#include "sdma_txreq.h"
-
-/* Hardware limit */
-#define MAX_DESC 64
-/* Hardware limit for SDMA packet size */
-#define MAX_SDMA_PKT_SIZE ((16 * 1024) - 1)
-
-#define SDMA_TXREQ_S_OK        0
-#define SDMA_TXREQ_S_SENDERROR 1
-#define SDMA_TXREQ_S_ABORTED   2
-#define SDMA_TXREQ_S_SHUTDOWN  3
-
-/* flags bits */
-#define SDMA_TXREQ_F_URGENT       0x0001
-#define SDMA_TXREQ_F_AHG_COPY     0x0002
-#define SDMA_TXREQ_F_USE_AHG      0x0004
-
-#define SDMA_MAP_NONE          0
-#define SDMA_MAP_SINGLE        1
-#define SDMA_MAP_PAGE          2
-
-#define SDMA_AHG_VALUE_MASK          0xffff
-#define SDMA_AHG_VALUE_SHIFT         0
-#define SDMA_AHG_INDEX_MASK          0xf
-#define SDMA_AHG_INDEX_SHIFT         16
-#define SDMA_AHG_FIELD_LEN_MASK      0xf
-#define SDMA_AHG_FIELD_LEN_SHIFT     20
-#define SDMA_AHG_FIELD_START_MASK    0x1f
-#define SDMA_AHG_FIELD_START_SHIFT   24
-#define SDMA_AHG_UPDATE_ENABLE_MASK  0x1
-#define SDMA_AHG_UPDATE_ENABLE_SHIFT 31
-
-/* AHG modes */
-
-/*
- * Be aware the ordering and values
- * for SDMA_AHG_APPLY_UPDATE[123]
- * are assumed in generating a skip
- * count in submit_tx() in sdma.c
- */
-#define SDMA_AHG_NO_AHG              0
-#define SDMA_AHG_COPY                1
-#define SDMA_AHG_APPLY_UPDATE1       2
-#define SDMA_AHG_APPLY_UPDATE2       3
-#define SDMA_AHG_APPLY_UPDATE3       4
-
-/*
- * Bits defined in the send DMA descriptor.
- */
-#define SDMA_DESC0_FIRST_DESC_FLAG      BIT_ULL(63)
-#define SDMA_DESC0_LAST_DESC_FLAG       BIT_ULL(62)
-#define SDMA_DESC0_BYTE_COUNT_SHIFT     48
-#define SDMA_DESC0_BYTE_COUNT_WIDTH     14
-#define SDMA_DESC0_BYTE_COUNT_MASK \
-       ((1ULL << SDMA_DESC0_BYTE_COUNT_WIDTH) - 1)
-#define SDMA_DESC0_BYTE_COUNT_SMASK \
-       (SDMA_DESC0_BYTE_COUNT_MASK << SDMA_DESC0_BYTE_COUNT_SHIFT)
-#define SDMA_DESC0_PHY_ADDR_SHIFT       0
-#define SDMA_DESC0_PHY_ADDR_WIDTH       48
-#define SDMA_DESC0_PHY_ADDR_MASK \
-       ((1ULL << SDMA_DESC0_PHY_ADDR_WIDTH) - 1)
-#define SDMA_DESC0_PHY_ADDR_SMASK \
-       (SDMA_DESC0_PHY_ADDR_MASK << SDMA_DESC0_PHY_ADDR_SHIFT)
-
-#define SDMA_DESC1_HEADER_UPDATE1_SHIFT 32
-#define SDMA_DESC1_HEADER_UPDATE1_WIDTH 32
-#define SDMA_DESC1_HEADER_UPDATE1_MASK \
-       ((1ULL << SDMA_DESC1_HEADER_UPDATE1_WIDTH) - 1)
-#define SDMA_DESC1_HEADER_UPDATE1_SMASK \
-       (SDMA_DESC1_HEADER_UPDATE1_MASK << SDMA_DESC1_HEADER_UPDATE1_SHIFT)
-#define SDMA_DESC1_HEADER_MODE_SHIFT    13
-#define SDMA_DESC1_HEADER_MODE_WIDTH    3
-#define SDMA_DESC1_HEADER_MODE_MASK \
-       ((1ULL << SDMA_DESC1_HEADER_MODE_WIDTH) - 1)
-#define SDMA_DESC1_HEADER_MODE_SMASK \
-       (SDMA_DESC1_HEADER_MODE_MASK << SDMA_DESC1_HEADER_MODE_SHIFT)
-#define SDMA_DESC1_HEADER_INDEX_SHIFT   8
-#define SDMA_DESC1_HEADER_INDEX_WIDTH   5
-#define SDMA_DESC1_HEADER_INDEX_MASK \
-       ((1ULL << SDMA_DESC1_HEADER_INDEX_WIDTH) - 1)
-#define SDMA_DESC1_HEADER_INDEX_SMASK \
-       (SDMA_DESC1_HEADER_INDEX_MASK << SDMA_DESC1_HEADER_INDEX_SHIFT)
-#define SDMA_DESC1_HEADER_DWS_SHIFT     4
-#define SDMA_DESC1_HEADER_DWS_WIDTH     4
-#define SDMA_DESC1_HEADER_DWS_MASK \
-       ((1ULL << SDMA_DESC1_HEADER_DWS_WIDTH) - 1)
-#define SDMA_DESC1_HEADER_DWS_SMASK \
-       (SDMA_DESC1_HEADER_DWS_MASK << SDMA_DESC1_HEADER_DWS_SHIFT)
-#define SDMA_DESC1_GENERATION_SHIFT     2
-#define SDMA_DESC1_GENERATION_WIDTH     2
-#define SDMA_DESC1_GENERATION_MASK \
-       ((1ULL << SDMA_DESC1_GENERATION_WIDTH) - 1)
-#define SDMA_DESC1_GENERATION_SMASK \
-       (SDMA_DESC1_GENERATION_MASK << SDMA_DESC1_GENERATION_SHIFT)
-#define SDMA_DESC1_INT_REQ_FLAG         BIT_ULL(1)
-#define SDMA_DESC1_HEAD_TO_HOST_FLAG    BIT_ULL(0)
-
-enum sdma_states {
-       sdma_state_s00_hw_down,
-       sdma_state_s10_hw_start_up_halt_wait,
-       sdma_state_s15_hw_start_up_clean_wait,
-       sdma_state_s20_idle,
-       sdma_state_s30_sw_clean_up_wait,
-       sdma_state_s40_hw_clean_up_wait,
-       sdma_state_s50_hw_halt_wait,
-       sdma_state_s60_idle_halt_wait,
-       sdma_state_s80_hw_freeze,
-       sdma_state_s82_freeze_sw_clean,
-       sdma_state_s99_running,
-};
-
-enum sdma_events {
-       sdma_event_e00_go_hw_down,
-       sdma_event_e10_go_hw_start,
-       sdma_event_e15_hw_halt_done,
-       sdma_event_e25_hw_clean_up_done,
-       sdma_event_e30_go_running,
-       sdma_event_e40_sw_cleaned,
-       sdma_event_e50_hw_cleaned,
-       sdma_event_e60_hw_halted,
-       sdma_event_e70_go_idle,
-       sdma_event_e80_hw_freeze,
-       sdma_event_e81_hw_frozen,
-       sdma_event_e82_hw_unfreeze,
-       sdma_event_e85_link_down,
-       sdma_event_e90_sw_halted,
-};
-
-struct sdma_set_state_action {
-       unsigned op_enable:1;
-       unsigned op_intenable:1;
-       unsigned op_halt:1;
-       unsigned op_cleanup:1;
-       unsigned go_s99_running_tofalse:1;
-       unsigned go_s99_running_totrue:1;
-};
-
-struct sdma_state {
-       struct kref          kref;
-       struct completion    comp;
-       enum sdma_states current_state;
-       unsigned             current_op;
-       unsigned             go_s99_running;
-       /* debugging/development */
-       enum sdma_states previous_state;
-       unsigned             previous_op;
-       enum sdma_events last_event;
-};
-
-/**
- * DOC: sdma exported routines
- *
- * These sdma routines fit into three categories:
- * - The SDMA API for building and submitting packets
- *   to the ring
- *
- * - Initialization and tear down routines to buildup
- *   and tear down SDMA
- *
- * - ISR entrances to handle interrupts, state changes
- *   and errors
- */
-
-/**
- * DOC: sdma PSM/verbs API
- *
- * The sdma API is designed to be used by both PSM
- * and verbs to supply packets to the SDMA ring.
- *
- * The usage of the API is as follows:
- *
- * Embed a struct iowait in the QP or
- * PQ.  The iowait should be initialized with a
- * call to iowait_init().
- *
- * The user of the API should create an allocation method
- * for their version of the txreq. slabs, pre-allocated lists,
- * and dma pools can be used.  Once the user's overload of
- * the sdma_txreq has been allocated, the sdma_txreq member
- * must be initialized with sdma_txinit() or sdma_txinit_ahg().
- *
- * The txreq must be declared with the sdma_txreq first.
- *
- * The tx request, once initialized,  is manipulated with calls to
- * sdma_txadd_daddr(), sdma_txadd_page(), or sdma_txadd_kvaddr()
- * for each disjoint memory location.  It is the user's responsibility
- * to understand the packet boundaries and page boundaries to do the
- * appropriate number of sdma_txadd_* calls..  The user
- * must be prepared to deal with failures from these routines due to
- * either memory allocation or dma_mapping failures.
- *
- * The mapping specifics for each memory location are recorded
- * in the tx. Memory locations added with sdma_txadd_page()
- * and sdma_txadd_kvaddr() are automatically mapped when added
- * to the tx and nmapped as part of the progress processing in the
- * SDMA interrupt handling.
- *
- * sdma_txadd_daddr() is used to add an dma_addr_t memory to the
- * tx.   An example of a use case would be a pre-allocated
- * set of headers allocated via dma_pool_alloc() or
- * dma_alloc_coherent().  For these memory locations, it
- * is the responsibility of the user to handle that unmapping.
- * (This would usually be at an unload or job termination.)
- *
- * The routine sdma_send_txreq() is used to submit
- * a tx to the ring after the appropriate number of
- * sdma_txadd_* have been done.
- *
- * If it is desired to send a burst of sdma_txreqs, sdma_send_txlist()
- * can be used to submit a list of packets.
- *
- * The user is free to use the link overhead in the struct sdma_txreq as
- * long as the tx isn't in flight.
- *
- * The extreme degenerate case of the number of descriptors
- * exceeding the ring size is automatically handled as
- * memory locations are added.  An overflow of the descriptor
- * array that is part of the sdma_txreq is also automatically
- * handled.
- *
- */
-
-/**
- * DOC: Infrastructure calls
- *
- * sdma_init() is used to initialize data structures and
- * CSRs for the desired number of SDMA engines.
- *
- * sdma_start() is used to kick the SDMA engines initialized
- * with sdma_init().   Interrupts must be enabled at this
- * point since aspects of the state machine are interrupt
- * driven.
- *
- * sdma_engine_error() and sdma_engine_interrupt() are
- * entrances for interrupts.
- *
- * sdma_map_init() is for the management of the mapping
- * table when the number of vls is changed.
- *
- */
-
-/*
- * struct hw_sdma_desc - raw 128 bit SDMA descriptor
- *
- * This is the raw descriptor in the SDMA ring
- */
-struct hw_sdma_desc {
-       /* private:  don't use directly */
-       __le64 qw[2];
-};
-
-/**
- * struct sdma_engine - Data pertaining to each SDMA engine.
- * @dd: a back-pointer to the device data
- * @ppd: per port back-pointer
- * @imask: mask for irq manipulation
- * @idle_mask: mask for determining if an interrupt is due to sdma_idle
- *
- * This structure has the state for each sdma_engine.
- *
- * Accessing to non public fields are not supported
- * since the private members are subject to change.
- */
-struct sdma_engine {
-       /* read mostly */
-       struct hfi1_devdata *dd;
-       struct hfi1_pportdata *ppd;
-       /* private: */
-       void __iomem *tail_csr;
-       u64 imask;                      /* clear interrupt mask */
-       u64 idle_mask;
-       u64 progress_mask;
-       u64 int_mask;
-       /* private: */
-       volatile __le64      *head_dma; /* DMA'ed by chip */
-       /* private: */
-       dma_addr_t            head_phys;
-       /* private: */
-       struct hw_sdma_desc *descq;
-       /* private: */
-       unsigned descq_full_count;
-       struct sdma_txreq **tx_ring;
-       /* private: */
-       dma_addr_t            descq_phys;
-       /* private */
-       u32 sdma_mask;
-       /* private */
-       struct sdma_state state;
-       /* private */
-       int cpu;
-       /* private: */
-       u8 sdma_shift;
-       /* private: */
-       u8 this_idx; /* zero relative engine */
-       /* protect changes to senddmactrl shadow */
-       spinlock_t senddmactrl_lock;
-       /* private: */
-       u64 p_senddmactrl;              /* shadow per-engine SendDmaCtrl */
-
-       /* read/write using tail_lock */
-       spinlock_t            tail_lock ____cacheline_aligned_in_smp;
-#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
-       /* private: */
-       u64                   tail_sn;
-#endif
-       /* private: */
-       u32                   descq_tail;
-       /* private: */
-       unsigned long         ahg_bits;
-       /* private: */
-       u16                   desc_avail;
-       /* private: */
-       u16                   tx_tail;
-       /* private: */
-       u16 descq_cnt;
-
-       /* read/write using head_lock */
-       /* private: */
-       seqlock_t            head_lock ____cacheline_aligned_in_smp;
-#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
-       /* private: */
-       u64                   head_sn;
-#endif
-       /* private: */
-       u32                   descq_head;
-       /* private: */
-       u16                   tx_head;
-       /* private: */
-       u64                   last_status;
-       /* private */
-       u64                     err_cnt;
-       /* private */
-       u64                     sdma_int_cnt;
-       u64                     idle_int_cnt;
-       u64                     progress_int_cnt;
-
-       /* private: */
-       struct list_head      dmawait;
-
-       /* CONFIG SDMA for now, just blindly duplicate */
-       /* private: */
-       struct tasklet_struct sdma_hw_clean_up_task
-               ____cacheline_aligned_in_smp;
-
-       /* private: */
-       struct tasklet_struct sdma_sw_clean_up_task
-               ____cacheline_aligned_in_smp;
-       /* private: */
-       struct work_struct err_halt_worker;
-       /* private */
-       struct timer_list     err_progress_check_timer;
-       u32                   progress_check_head;
-       /* private: */
-       struct work_struct flush_worker;
-       /* protect flush list */
-       spinlock_t flushlist_lock;
-       /* private: */
-       struct list_head flushlist;
-};
-
-int sdma_init(struct hfi1_devdata *dd, u8 port);
-void sdma_start(struct hfi1_devdata *dd);
-void sdma_exit(struct hfi1_devdata *dd);
-void sdma_all_running(struct hfi1_devdata *dd);
-void sdma_all_idle(struct hfi1_devdata *dd);
-void sdma_freeze_notify(struct hfi1_devdata *dd, int go_idle);
-void sdma_freeze(struct hfi1_devdata *dd);
-void sdma_unfreeze(struct hfi1_devdata *dd);
-void sdma_wait(struct hfi1_devdata *dd);
-
-/**
- * sdma_empty() - idle engine test
- * @engine: sdma engine
- *
- * Currently used by verbs as a latency optimization.
- *
- * Return:
- * 1 - empty, 0 - non-empty
- */
-static inline int sdma_empty(struct sdma_engine *sde)
-{
-       return sde->descq_tail == sde->descq_head;
-}
-
-static inline u16 sdma_descq_freecnt(struct sdma_engine *sde)
-{
-       return sde->descq_cnt -
-               (sde->descq_tail -
-                ACCESS_ONCE(sde->descq_head)) - 1;
-}
-
-static inline u16 sdma_descq_inprocess(struct sdma_engine *sde)
-{
-       return sde->descq_cnt - sdma_descq_freecnt(sde);
-}
-
-/*
- * Either head_lock or tail lock required to see
- * a steady state.
- */
-static inline int __sdma_running(struct sdma_engine *engine)
-{
-       return engine->state.current_state == sdma_state_s99_running;
-}
-
-/**
- * sdma_running() - state suitability test
- * @engine: sdma engine
- *
- * sdma_running probes the internal state to determine if it is suitable
- * for submitting packets.
- *
- * Return:
- * 1 - ok to submit, 0 - not ok to submit
- *
- */
-static inline int sdma_running(struct sdma_engine *engine)
-{
-       unsigned long flags;
-       int ret;
-
-       spin_lock_irqsave(&engine->tail_lock, flags);
-       ret = __sdma_running(engine);
-       spin_unlock_irqrestore(&engine->tail_lock, flags);
-       return ret;
-}
-
-void _sdma_txreq_ahgadd(
-       struct sdma_txreq *tx,
-       u8 num_ahg,
-       u8 ahg_entry,
-       u32 *ahg,
-       u8 ahg_hlen);
-
-/**
- * sdma_txinit_ahg() - initialize an sdma_txreq struct with AHG
- * @tx: tx request to initialize
- * @flags: flags to key last descriptor additions
- * @tlen: total packet length (pbc + headers + data)
- * @ahg_entry: ahg entry to use  (0 - 31)
- * @num_ahg: number of ahg descriptors for first descriptor (0 - 9)
- * @ahg: array of AHG descriptors (up to 9 entries)
- * @ahg_hlen: number of bytes from ASIC entry to use
- * @cb: callback
- *
- * The allocation of the sdma_txreq and its enclosing structure is user
- * dependent.  This routine must be called to initialize the user independent
- * fields.
- *
- * The currently supported flags are SDMA_TXREQ_F_URGENT,
- * SDMA_TXREQ_F_AHG_COPY, and SDMA_TXREQ_F_USE_AHG.
- *
- * SDMA_TXREQ_F_URGENT is used for latency sensitive situations where the
- * completion is desired as soon as possible.
- *
- * SDMA_TXREQ_F_AHG_COPY causes the header in the first descriptor to be
- * copied to chip entry. SDMA_TXREQ_F_USE_AHG causes the code to add in
- * the AHG descriptors into the first 1 to 3 descriptors.
- *
- * Completions of submitted requests can be gotten on selected
- * txreqs by giving a completion routine callback to sdma_txinit() or
- * sdma_txinit_ahg().  The environment in which the callback runs
- * can be from an ISR, a tasklet, or a thread, so no sleeping
- * kernel routines can be used.   Aspects of the sdma ring may
- * be locked so care should be taken with locking.
- *
- * The callback pointer can be NULL to avoid any callback for the packet
- * being submitted. The callback will be provided this tx, a status, and a flag.
- *
- * The status will be one of SDMA_TXREQ_S_OK, SDMA_TXREQ_S_SENDERROR,
- * SDMA_TXREQ_S_ABORTED, or SDMA_TXREQ_S_SHUTDOWN.
- *
- * The flag, if an iowait had been used, indicates the iowait sdma_busy
- * count has reached zero (NOTE: @cb as declared takes only tx and status).
- *
- * The user data portion of tlen should be precise.  The sdma_txadd_* entry
- * points will pad with a descriptor referencing 1 - 3 bytes once the number of
- * bytes specified in tlen has been supplied to the sdma_txreq.
- *
- * ahg_hlen is used to determine the number of on-chip entry bytes to
- * use as the header.   This is for cases where the stored header is
- * larger than the header to be used in a packet.  This is typical
- * for verbs where the header of an RDMA_WRITE_FIRST is larger than that of
- * an RDMA_WRITE_MIDDLE.
- *
- */
-static inline int sdma_txinit_ahg(
-       struct sdma_txreq *tx,
-       u16 flags,
-       u16 tlen,
-       u8 ahg_entry,
-       u8 num_ahg,
-       u32 *ahg,
-       u8 ahg_hlen,
-       void (*cb)(struct sdma_txreq *, int))
-{
-       if (tlen == 0)
-               return -ENODATA;
-       if (tlen > MAX_SDMA_PKT_SIZE)
-               return -EMSGSIZE;
-       tx->desc_limit = ARRAY_SIZE(tx->descs);
-       tx->descp = &tx->descs[0];
-       INIT_LIST_HEAD(&tx->list);
-       tx->num_desc = 0;
-       tx->flags = flags;
-       tx->complete = cb;
-       tx->coalesce_buf = NULL;
-       tx->wait = NULL;
-       tx->packet_len = tlen;
-       tx->tlen = tx->packet_len;
-       tx->descs[0].qw[0] = SDMA_DESC0_FIRST_DESC_FLAG;
-       tx->descs[0].qw[1] = 0;
-       if (flags & SDMA_TXREQ_F_AHG_COPY)
-               tx->descs[0].qw[1] |=
-                       (((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK)
-                               << SDMA_DESC1_HEADER_INDEX_SHIFT) |
-                       (((u64)SDMA_AHG_COPY & SDMA_DESC1_HEADER_MODE_MASK)
-                               << SDMA_DESC1_HEADER_MODE_SHIFT);
-       else if (flags & SDMA_TXREQ_F_USE_AHG && num_ahg)
-               _sdma_txreq_ahgadd(tx, num_ahg, ahg_entry, ahg, ahg_hlen);
-       return 0;
-}
-
-/**
- * sdma_txinit() - initialize an sdma_txreq struct (no AHG)
- * @tx: tx request to initialize
- * @flags: flags to key last descriptor additions
- * @tlen: total packet length (pbc + headers + data)
- * @cb: callback pointer
- *
- * The allocation of the sdma_txreq and its enclosing structure is user
- * dependent.  This routine must be called to initialize the user
- * independent fields.
- *
- * The only currently supported flag is SDMA_TXREQ_F_URGENT.
- *
- * SDMA_TXREQ_F_URGENT is used for latency sensitive situations where the
- * completion is desired as soon as possible.
- *
- * Completions of submitted requests can be gotten on selected
- * txreqs by giving a completion routine callback to sdma_txinit() or
- * sdma_txinit_ahg().  The environment in which the callback runs
- * can be from an ISR, a tasklet, or a thread, so no sleeping
- * kernel routines can be used.   Aspects of the sdma ring may
- * be locked so care should be taken with locking.
- *
- * The callback pointer can be NULL to avoid any callback for the packet
- * being submitted.
- *
- * The callback, if non-NULL,  will be provided this tx and a status.  The
- * status will be one of SDMA_TXREQ_S_OK, SDMA_TXREQ_S_SENDERROR,
- * SDMA_TXREQ_S_ABORTED, or SDMA_TXREQ_S_SHUTDOWN.
- *
- */
-static inline int sdma_txinit(
-       struct sdma_txreq *tx,
-       u16 flags,
-       u16 tlen,
-       void (*cb)(struct sdma_txreq *, int))
-{
-       return sdma_txinit_ahg(tx, flags, tlen, 0, 0, NULL, 0, cb);
-}
-
-/* helpers - don't use */
-static inline int sdma_mapping_type(struct sdma_desc *d)
-{
-       return (d->qw[1] & SDMA_DESC1_GENERATION_SMASK)
-               >> SDMA_DESC1_GENERATION_SHIFT;
-}
-
-static inline size_t sdma_mapping_len(struct sdma_desc *d)
-{
-       return (d->qw[0] & SDMA_DESC0_BYTE_COUNT_SMASK)
-               >> SDMA_DESC0_BYTE_COUNT_SHIFT;
-}
-
-static inline dma_addr_t sdma_mapping_addr(struct sdma_desc *d)
-{
-       return (d->qw[0] & SDMA_DESC0_PHY_ADDR_SMASK)
-               >> SDMA_DESC0_PHY_ADDR_SHIFT;
-}
-
-static inline void make_tx_sdma_desc(
-       struct sdma_txreq *tx,
-       int type,
-       dma_addr_t addr,
-       size_t len)
-{
-       struct sdma_desc *desc = &tx->descp[tx->num_desc];
-
-       if (!tx->num_desc) {
-               /* qw[0] zero; qw[1] first, ahg mode already in from init */
-               desc->qw[1] |= ((u64)type & SDMA_DESC1_GENERATION_MASK)
-                               << SDMA_DESC1_GENERATION_SHIFT;
-       } else {
-               desc->qw[0] = 0;
-               desc->qw[1] = ((u64)type & SDMA_DESC1_GENERATION_MASK)
-                               << SDMA_DESC1_GENERATION_SHIFT;
-       }
-       desc->qw[0] |= (((u64)addr & SDMA_DESC0_PHY_ADDR_MASK)
-                               << SDMA_DESC0_PHY_ADDR_SHIFT) |
-                       (((u64)len & SDMA_DESC0_BYTE_COUNT_MASK)
-                               << SDMA_DESC0_BYTE_COUNT_SHIFT);
-}
-
-/* helper to extend txreq */
-int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx,
-                          int type, void *kvaddr, struct page *page,
-                          unsigned long offset, u16 len);
-int _pad_sdma_tx_descs(struct hfi1_devdata *, struct sdma_txreq *);
-void sdma_txclean(struct hfi1_devdata *, struct sdma_txreq *);
-
-/* helpers used by public routines */
-static inline void _sdma_close_tx(struct hfi1_devdata *dd,
-                                 struct sdma_txreq *tx)
-{
-       tx->descp[tx->num_desc].qw[0] |=
-               SDMA_DESC0_LAST_DESC_FLAG;
-       tx->descp[tx->num_desc].qw[1] |=
-               dd->default_desc1;
-       if (tx->flags & SDMA_TXREQ_F_URGENT)
-               tx->descp[tx->num_desc].qw[1] |=
-                       (SDMA_DESC1_HEAD_TO_HOST_FLAG |
-                        SDMA_DESC1_INT_REQ_FLAG);
-}
-
-static inline int _sdma_txadd_daddr(
-       struct hfi1_devdata *dd,
-       int type,
-       struct sdma_txreq *tx,
-       dma_addr_t addr,
-       u16 len)
-{
-       int rval = 0;
-
-       make_tx_sdma_desc(
-               tx,
-               type,
-               addr, len);
-       WARN_ON(len > tx->tlen);
-       tx->tlen -= len;
-       /* special cases for last */
-       if (!tx->tlen) {
-               if (tx->packet_len & (sizeof(u32) - 1)) {
-                       rval = _pad_sdma_tx_descs(dd, tx);
-                       if (rval)
-                               return rval;
-               } else {
-                       _sdma_close_tx(dd, tx);
-               }
-       }
-       tx->num_desc++;
-       return rval;
-}
-
-/**
- * sdma_txadd_page() - add a page to the sdma_txreq
- * @dd: the device to use for mapping
- * @tx: tx request to which the page is added
- * @page: page to map
- * @offset: offset within the page
- * @len: length in bytes
- *
- * This is used to add a page/offset/length descriptor.
- *
- * The mapping/unmapping of the page/offset/len is automatically handled.
- *
- * Return:
- * 0 - success, -ENOSPC - mapping fail, -ENOMEM - couldn't
- * extend/coalesce descriptor array
- */
-static inline int sdma_txadd_page(
-       struct hfi1_devdata *dd,
-       struct sdma_txreq *tx,
-       struct page *page,
-       unsigned long offset,
-       u16 len)
-{
-       dma_addr_t addr;
-       int rval;
-
-       if ((unlikely(tx->num_desc == tx->desc_limit))) {
-               rval = ext_coal_sdma_tx_descs(dd, tx, SDMA_MAP_PAGE,
-                                             NULL, page, offset, len);
-               if (rval <= 0)
-                       return rval;
-       }
-
-       addr = dma_map_page(
-                      &dd->pcidev->dev,
-                      page,
-                      offset,
-                      len,
-                      DMA_TO_DEVICE);
-
-       if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
-               sdma_txclean(dd, tx);
-               return -ENOSPC;
-       }
-
-       return _sdma_txadd_daddr(
-                       dd, SDMA_MAP_PAGE, tx, addr, len);
-}
-
-/**
- * sdma_txadd_daddr() - add a dma address to the sdma_txreq
- * @dd: the device to use for mapping
- * @tx: sdma_txreq to which the dma address is added
- * @addr: dma address mapped by caller
- * @len: length in bytes
- *
- * This is used to add a descriptor for memory that is already dma mapped.
- *
- * In this case, there is no unmapping as part of the progress processing for
- * this memory location.
- *
- * Return:
- * 0 - success, -ENOMEM - couldn't extend descriptor array
- */
-
-static inline int sdma_txadd_daddr(
-       struct hfi1_devdata *dd,
-       struct sdma_txreq *tx,
-       dma_addr_t addr,
-       u16 len)
-{
-       int rval;
-
-       if ((unlikely(tx->num_desc == tx->desc_limit))) {
-               rval = ext_coal_sdma_tx_descs(dd, tx, SDMA_MAP_NONE,
-                                             NULL, NULL, 0, 0);
-               if (rval <= 0)
-                       return rval;
-       }
-
-       return _sdma_txadd_daddr(dd, SDMA_MAP_NONE, tx, addr, len);
-}
-
-/**
- * sdma_txadd_kvaddr() - add a kernel virtual address to sdma_txreq
- * @dd: the device to use for mapping
- * @tx: sdma_txreq to which the kernel virtual address is added
- * @kvaddr: the kernel virtual address
- * @len: length in bytes
- *
- * This is used to add a descriptor referenced by the indicated kvaddr and
- * len.
- *
- * The mapping/unmapping of the kvaddr and len is automatically handled.
- *
- * Return:
- * 0 - success, -ENOSPC - mapping fail, -ENOMEM - couldn't extend/coalesce
- * descriptor array
- */
-static inline int sdma_txadd_kvaddr(
-       struct hfi1_devdata *dd,
-       struct sdma_txreq *tx,
-       void *kvaddr,
-       u16 len)
-{
-       dma_addr_t addr;
-       int rval;
-
-       if ((unlikely(tx->num_desc == tx->desc_limit))) {
-               rval = ext_coal_sdma_tx_descs(dd, tx, SDMA_MAP_SINGLE,
-                                             kvaddr, NULL, 0, len);
-               if (rval <= 0)
-                       return rval;
-       }
-
-       addr = dma_map_single(
-                      &dd->pcidev->dev,
-                      kvaddr,
-                      len,
-                      DMA_TO_DEVICE);
-
-       if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
-               sdma_txclean(dd, tx);
-               return -ENOSPC;
-       }
-
-       return _sdma_txadd_daddr(
-                       dd, SDMA_MAP_SINGLE, tx, addr, len);
-}
-
-struct iowait;
-
-int sdma_send_txreq(struct sdma_engine *sde,
-                   struct iowait *wait,
-                   struct sdma_txreq *tx);
-int sdma_send_txlist(struct sdma_engine *sde,
-                    struct iowait *wait,
-                    struct list_head *tx_list);
-
-int sdma_ahg_alloc(struct sdma_engine *sde);
-void sdma_ahg_free(struct sdma_engine *sde, int ahg_index);
-
-/**
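- * sdma_build_ahg_descriptor() - build ahg descriptor
- * @data: value placed in the descriptor's value field
- * @dwindex: value placed in the descriptor's index field
- * @startbit: value placed in the descriptor's field-start field
- * @bits: value placed in the descriptor's field-length field
- *
- * Build and return a 32 bit descriptor with the update-enable bit set.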
- */
-static inline u32 sdma_build_ahg_descriptor(
-       u16 data,
-       u8 dwindex,
-       u8 startbit,
-       u8 bits)
-{
-       return (u32)(1UL << SDMA_AHG_UPDATE_ENABLE_SHIFT |
-               ((startbit & SDMA_AHG_FIELD_START_MASK) <<
-               SDMA_AHG_FIELD_START_SHIFT) |
-               ((bits & SDMA_AHG_FIELD_LEN_MASK) <<
-               SDMA_AHG_FIELD_LEN_SHIFT) |
-               ((dwindex & SDMA_AHG_INDEX_MASK) <<
-               SDMA_AHG_INDEX_SHIFT) |
-               ((data & SDMA_AHG_VALUE_MASK) <<
-               SDMA_AHG_VALUE_SHIFT));
-}
-
-/**
- * sdma_progress - use seq number to detect head progress
- * @sde: sdma_engine to check
- * @seq: base seq count
- * @tx: txreq for which we need to check descriptor availability
- *
- * This is used in the appropriate spot in the sleep routine
- * to check for potential ring progress.  This routine gets the
- * seqcount before queuing the iowait structure for progress.
- *
- * If the seqcount indicates that progress needs to be checked,
- * re-submission is detected by checking whether the descriptor
- * queue has enough descriptors for the txreq.
- */
-static inline unsigned sdma_progress(struct sdma_engine *sde, unsigned seq,
-                                    struct sdma_txreq *tx)
-{
-       if (read_seqretry(&sde->head_lock, seq)) {
-               sde->desc_avail = sdma_descq_freecnt(sde);
-               if (tx->num_desc > sde->desc_avail)
-                       return 0;
-               return 1;
-       }
-       return 0;
-}
-
-/**
- * sdma_iowait_schedule() - schedule wait structure
- * @sde: sdma_engine to schedule
- * @wait: wait struct to schedule
- *
- * This function schedules the iowait structure embedded in the QP or PQ
- * by calling iowait_schedule() with the port's hfi1_wq and sde->cpu.
- *
- */
-static inline void sdma_iowait_schedule(
-       struct sdma_engine *sde,
-       struct iowait *wait)
-{
-       struct hfi1_pportdata *ppd = sde->dd->pport;
-
-       iowait_schedule(wait, ppd->hfi1_wq, sde->cpu);
-}
-
-/* for use by interrupt handling */
-void sdma_engine_error(struct sdma_engine *sde, u64 status);
-void sdma_engine_interrupt(struct sdma_engine *sde, u64 status);
-
-/*
- *
- * The diagram below details the relationship of the mapping structures
- *
- * Since the mapping now allows for non-uniform engines per vl, the
- * number of engines for a vl is either the vl_engines[vl] or
- * a computation based on num_sdma/num_vls:
- *
- * For example:
- * nactual = vl_engines ? vl_engines[vl] : num_sdma/num_vls
- *
- * n = roundup to next highest power of 2 using nactual
- *
- * In the case where num_sdma/num_vls doesn't divide
- * evenly, the extras are added from the last vl downward.
- *
- * For the case where n > nactual, the engines are assigned
- * in a round robin fashion wrapping back to the first engine
- * for a particular vl.
- *
- *               dd->sdma_map
- *                    |                                   sdma_map_elem[0]
- *                    |                                +--------------------+
- *                    v                                |       mask         |
- *               sdma_vl_map                           |--------------------|
- *      +--------------------------+                   | sde[0] -> eng 1    |
- *      |    list (RCU)            |                   |--------------------|
- *      |--------------------------|                 ->| sde[1] -> eng 2    |
- *      |    mask                  |              --/  |--------------------|
- *      |--------------------------|            -/     |        *           |
- *      |    actual_vls (max 8)    |          -/       |--------------------|
- *      |--------------------------|       --/         | sde[n] -> eng n    |
- *      |    vls (max 8)           |     -/            +--------------------+
- *      |--------------------------|  --/
- *      |    map[0]                |-/
- *      |--------------------------|                   +--------------------+
- *      |    map[1]                |---                |       mask         |
- *      |--------------------------|   \----           |--------------------|
- *      |           *              |        \--        | sde[0] -> eng 1+n  |
- *      |           *              |           \----   |--------------------|
- *      |           *              |                \->| sde[1] -> eng 2+n  |
- *      |--------------------------|                   |--------------------|
- *      |   map[vls - 1]           |-                  |         *          |
- *      +--------------------------+ \-                |--------------------|
- *                                     \-              | sde[m] -> eng m+n  |
- *                                       \             +--------------------+
- *                                        \-
- *                                          \
- *                                           \-        +--------------------+
- *                                             \-      |       mask         |
- *                                               \     |--------------------|
- *                                                \-   | sde[0] -> eng 1+m+n|
- *                                                  \- |--------------------|
- *                                                    >| sde[1] -> eng 2+m+n|
- *                                                     |--------------------|
- *                                                     |         *          |
- *                                                     |--------------------|
- *                                                     | sde[o] -> eng o+m+n|
- *                                                     +--------------------+
- *
- */
-
-/**
- * struct sdma_map_elem - mapping for a vl
- * @mask: selector mask
- * @sde: array of engines for this vl
- *
- * The mask is used to "mod" the selector
- * to produce index into the trailing
- * array of sdes.
- */
-struct sdma_map_elem {
-       u32 mask;
-       struct sdma_engine *sde[0];
-};
-
-/**
- * struct sdma_vl_map - parent mapping of vls to engines
- * @engine_to_vl: map of an engine to a vl
- * @list: rcu head for free callback
- * @mask: vl mask to "mod" the vl to produce an index to map array
- * @actual_vls: number of vls
- * @vls: number of vls rounded to next power of 2
- * @map: array of sdma_map_elem entries
- *
- * This is the parent mapping structure.  The trailing
- * members of the struct point to sdma_map_elem entries, which
- * in turn point to an array of sde's for that vl.
- */
-struct sdma_vl_map {
-       s8 engine_to_vl[TXE_NUM_SDMA_ENGINES];
-       struct rcu_head list;
-       u32 mask;
-       u8 actual_vls;
-       u8 vls;
-       struct sdma_map_elem *map[0];
-};
-
-int sdma_map_init(
-       struct hfi1_devdata *dd,
-       u8 port,
-       u8 num_vls,
-       u8 *vl_engines);
-
-/* slow path */
-void _sdma_engine_progress_schedule(struct sdma_engine *sde);
-
-/**
- * sdma_engine_progress_schedule() - schedule progress on engine
- * @sde: sdma_engine to schedule progress
- *
- * This is the fast path.
- *
- */
-static inline void sdma_engine_progress_schedule(
-       struct sdma_engine *sde)
-{
-       if (!sde || sdma_descq_inprocess(sde) < (sde->descq_cnt / 8))
-               return;
-       _sdma_engine_progress_schedule(sde);
-}
-
-struct sdma_engine *sdma_select_engine_sc(
-       struct hfi1_devdata *dd,
-       u32 selector,
-       u8 sc5);
-
-struct sdma_engine *sdma_select_engine_vl(
-       struct hfi1_devdata *dd,
-       u32 selector,
-       u8 vl);
-
-void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *);
-
-#ifdef CONFIG_SDMA_VERBOSITY
-void sdma_dumpstate(struct sdma_engine *);
-#endif
-static inline char *slashstrip(char *s)
-{
-       char *r = s;
-
-       while (*s)
-               if (*s++ == '/')
-                       r = s;
-       return r;
-}
-
-u16 sdma_get_descq_cnt(void);
-
-extern uint mod_num_sdma;
-
-void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid);
-
-#endif
diff --git a/drivers/staging/rdma/hfi1/sdma_txreq.h b/drivers/staging/rdma/hfi1/sdma_txreq.h
deleted file mode 100644 (file)
index bf7d777..0000000
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright(c) 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifndef HFI1_SDMA_TXREQ_H
-#define HFI1_SDMA_TXREQ_H
-
-/* increased for AHG */
-#define NUM_DESC 6
-
-/*
- * struct sdma_desc - canonical fragment descriptor
- *
- * This is the descriptor carried in the tx request
- * corresponding to each fragment.
- *
- */
-struct sdma_desc {
-       /* private:  don't use directly */
-       u64 qw[2];
-};
-
-/**
- * struct sdma_txreq - the sdma_txreq structure (one per packet)
- * @list: for use by user and by queuing for wait
- *
- * This is the representation of a packet which consists of some
- * number of fragments.   Storage is provided within the structure
- * for all fragments.
- *
- * The storage for the descriptors is automatically extended as needed
- * when the current allocation is exceeded.
- *
- * The user (Verbs or PSM) may overload this structure with fields
- * specific to their use by putting this struct first in their struct.
- * The method of allocation of the overloaded structure is user dependent
- *
- * The list is the only public field in the structure.
- *
- */
-
-#define SDMA_TXREQ_S_OK        0
-#define SDMA_TXREQ_S_SENDERROR 1
-#define SDMA_TXREQ_S_ABORTED   2
-#define SDMA_TXREQ_S_SHUTDOWN  3
-
-/* flags bits */
-#define SDMA_TXREQ_F_URGENT       0x0001
-#define SDMA_TXREQ_F_AHG_COPY     0x0002
-#define SDMA_TXREQ_F_USE_AHG      0x0004
-
-struct sdma_txreq;
-typedef void (*callback_t)(struct sdma_txreq *, int);
-
-struct iowait;
-struct sdma_txreq {
-       struct list_head list;
-       /* private: */
-       struct sdma_desc *descp;
-       /* private: */
-       void *coalesce_buf;
-       /* private: */
-       struct iowait *wait;
-       /* private: */
-       callback_t                  complete;
-#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
-       u64 sn;
-#endif
-       /* private: - used in coalesce/pad processing */
-       u16                         packet_len;
-       /* private: - down-counted to trigger last */
-       u16                         tlen;
-       /* private: */
-       u16                         num_desc;
-       /* private: */
-       u16                         desc_limit;
-       /* private: */
-       u16                         next_descq_idx;
-       /* private: */
-       u16 coalesce_idx;
-       /* private: flags */
-       u16                         flags;
-       /* private: */
-       struct sdma_desc descs[NUM_DESC];
-};
-
-static inline int sdma_txreq_built(struct sdma_txreq *tx)
-{
-       return tx->num_desc;
-}
-
-#endif                          /* HFI1_SDMA_TXREQ_H */
diff --git a/drivers/staging/rdma/hfi1/sysfs.c b/drivers/staging/rdma/hfi1/sysfs.c
deleted file mode 100644 (file)
index 8cd6df8..0000000
+++ /dev/null
@@ -1,785 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#include <linux/ctype.h>
-
-#include "hfi.h"
-#include "mad.h"
-#include "trace.h"
-
-/*
- * Start of per-port congestion control structures and support code
- */
-
-/*
- * Congestion control table size followed by table entries
- */
-static ssize_t read_cc_table_bin(struct file *filp, struct kobject *kobj,
-                                struct bin_attribute *bin_attr,
-                                char *buf, loff_t pos, size_t count)
-{
-       int ret;
-       struct hfi1_pportdata *ppd =
-               container_of(kobj, struct hfi1_pportdata, pport_cc_kobj);
-       struct cc_state *cc_state;
-
-       ret = ppd->total_cct_entry * sizeof(struct ib_cc_table_entry_shadow)
-                + sizeof(__be16);
-
-       if (pos > ret)
-               return -EINVAL;
-
-       if (count > ret - pos)
-               count = ret - pos;
-
-       if (!count)
-               return count;
-
-       rcu_read_lock();
-       cc_state = get_cc_state(ppd);
-       if (!cc_state) {
-               rcu_read_unlock();
-               return -EINVAL;
-       }
-       memcpy(buf, (void *)&cc_state->cct + pos, count);
-       rcu_read_unlock();
-
-       return count;
-}
-
-static void port_release(struct kobject *kobj)
-{
-       /* nothing to do since memory is freed by hfi1_free_devdata() */
-}
-
-static struct bin_attribute cc_table_bin_attr = {
-       .attr = {.name = "cc_table_bin", .mode = 0444},
-       .read = read_cc_table_bin,
-       .size = PAGE_SIZE,
-};
-
-/*
- * Congestion settings: port control, control map and an array of 16
- * entries for the congestion entries - increase, timer, event log
- * trigger threshold and the minimum injection rate delay.
- */
-static ssize_t read_cc_setting_bin(struct file *filp, struct kobject *kobj,
-                                  struct bin_attribute *bin_attr,
-                                  char *buf, loff_t pos, size_t count)
-{
-       int ret;
-       struct hfi1_pportdata *ppd =
-               container_of(kobj, struct hfi1_pportdata, pport_cc_kobj);
-       struct cc_state *cc_state;
-
-       ret = sizeof(struct opa_congestion_setting_attr_shadow);
-
-       if (pos > ret)
-               return -EINVAL;
-       if (count > ret - pos)
-               count = ret - pos;
-
-       if (!count)
-               return count;
-
-       rcu_read_lock();
-       cc_state = get_cc_state(ppd);
-       if (!cc_state) {
-               rcu_read_unlock();
-               return -EINVAL;
-       }
-       memcpy(buf, (void *)&cc_state->cong_setting + pos, count);
-       rcu_read_unlock();
-
-       return count;
-}
-
-static struct bin_attribute cc_setting_bin_attr = {
-       .attr = {.name = "cc_settings_bin", .mode = 0444},
-       .read = read_cc_setting_bin,
-       .size = PAGE_SIZE,
-};
-
-struct hfi1_port_attr {
-       struct attribute attr;
-       ssize_t (*show)(struct hfi1_pportdata *, char *);
-       ssize_t (*store)(struct hfi1_pportdata *, const char *, size_t);
-};
-
-static ssize_t cc_prescan_show(struct hfi1_pportdata *ppd, char *buf)
-{
-       return sprintf(buf, "%s\n", ppd->cc_prescan ? "on" : "off");
-}
-
-static ssize_t cc_prescan_store(struct hfi1_pportdata *ppd, const char *buf,
-                               size_t count)
-{
-       if (!memcmp(buf, "on", 2))
-               ppd->cc_prescan = true;
-       else if (!memcmp(buf, "off", 3))
-               ppd->cc_prescan = false;
-
-       return count;
-}
-
-static struct hfi1_port_attr cc_prescan_attr =
-               __ATTR(cc_prescan, 0600, cc_prescan_show, cc_prescan_store);
-
-static ssize_t cc_attr_show(struct kobject *kobj, struct attribute *attr,
-                           char *buf)
-{
-       struct hfi1_port_attr *port_attr =
-               container_of(attr, struct hfi1_port_attr, attr);
-       struct hfi1_pportdata *ppd =
-               container_of(kobj, struct hfi1_pportdata, pport_cc_kobj);
-
-       return port_attr->show(ppd, buf);
-}
-
-static ssize_t cc_attr_store(struct kobject *kobj, struct attribute *attr,
-                            const char *buf, size_t count)
-{
-       struct hfi1_port_attr *port_attr =
-               container_of(attr, struct hfi1_port_attr, attr);
-       struct hfi1_pportdata *ppd =
-               container_of(kobj, struct hfi1_pportdata, pport_cc_kobj);
-
-       return port_attr->store(ppd, buf, count);
-}
-
-static const struct sysfs_ops port_cc_sysfs_ops = {
-       .show = cc_attr_show,
-       .store = cc_attr_store
-};
-
-static struct attribute *port_cc_default_attributes[] = {
-       &cc_prescan_attr.attr
-};
-
-static struct kobj_type port_cc_ktype = {
-       .release = port_release,
-       .sysfs_ops = &port_cc_sysfs_ops,
-       .default_attrs = port_cc_default_attributes
-};
-
-/* Start sc2vl */
-#define HFI1_SC2VL_ATTR(N)                                 \
-       static struct hfi1_sc2vl_attr hfi1_sc2vl_attr_##N = { \
-               .attr = { .name = __stringify(N), .mode = 0444 }, \
-               .sc = N \
-       }
-
-struct hfi1_sc2vl_attr {
-       struct attribute attr;
-       int sc;
-};
-
-HFI1_SC2VL_ATTR(0);
-HFI1_SC2VL_ATTR(1);
-HFI1_SC2VL_ATTR(2);
-HFI1_SC2VL_ATTR(3);
-HFI1_SC2VL_ATTR(4);
-HFI1_SC2VL_ATTR(5);
-HFI1_SC2VL_ATTR(6);
-HFI1_SC2VL_ATTR(7);
-HFI1_SC2VL_ATTR(8);
-HFI1_SC2VL_ATTR(9);
-HFI1_SC2VL_ATTR(10);
-HFI1_SC2VL_ATTR(11);
-HFI1_SC2VL_ATTR(12);
-HFI1_SC2VL_ATTR(13);
-HFI1_SC2VL_ATTR(14);
-HFI1_SC2VL_ATTR(15);
-HFI1_SC2VL_ATTR(16);
-HFI1_SC2VL_ATTR(17);
-HFI1_SC2VL_ATTR(18);
-HFI1_SC2VL_ATTR(19);
-HFI1_SC2VL_ATTR(20);
-HFI1_SC2VL_ATTR(21);
-HFI1_SC2VL_ATTR(22);
-HFI1_SC2VL_ATTR(23);
-HFI1_SC2VL_ATTR(24);
-HFI1_SC2VL_ATTR(25);
-HFI1_SC2VL_ATTR(26);
-HFI1_SC2VL_ATTR(27);
-HFI1_SC2VL_ATTR(28);
-HFI1_SC2VL_ATTR(29);
-HFI1_SC2VL_ATTR(30);
-HFI1_SC2VL_ATTR(31);
-
-static struct attribute *sc2vl_default_attributes[] = {
-       &hfi1_sc2vl_attr_0.attr,
-       &hfi1_sc2vl_attr_1.attr,
-       &hfi1_sc2vl_attr_2.attr,
-       &hfi1_sc2vl_attr_3.attr,
-       &hfi1_sc2vl_attr_4.attr,
-       &hfi1_sc2vl_attr_5.attr,
-       &hfi1_sc2vl_attr_6.attr,
-       &hfi1_sc2vl_attr_7.attr,
-       &hfi1_sc2vl_attr_8.attr,
-       &hfi1_sc2vl_attr_9.attr,
-       &hfi1_sc2vl_attr_10.attr,
-       &hfi1_sc2vl_attr_11.attr,
-       &hfi1_sc2vl_attr_12.attr,
-       &hfi1_sc2vl_attr_13.attr,
-       &hfi1_sc2vl_attr_14.attr,
-       &hfi1_sc2vl_attr_15.attr,
-       &hfi1_sc2vl_attr_16.attr,
-       &hfi1_sc2vl_attr_17.attr,
-       &hfi1_sc2vl_attr_18.attr,
-       &hfi1_sc2vl_attr_19.attr,
-       &hfi1_sc2vl_attr_20.attr,
-       &hfi1_sc2vl_attr_21.attr,
-       &hfi1_sc2vl_attr_22.attr,
-       &hfi1_sc2vl_attr_23.attr,
-       &hfi1_sc2vl_attr_24.attr,
-       &hfi1_sc2vl_attr_25.attr,
-       &hfi1_sc2vl_attr_26.attr,
-       &hfi1_sc2vl_attr_27.attr,
-       &hfi1_sc2vl_attr_28.attr,
-       &hfi1_sc2vl_attr_29.attr,
-       &hfi1_sc2vl_attr_30.attr,
-       &hfi1_sc2vl_attr_31.attr,
-       NULL
-};
-
-static ssize_t sc2vl_attr_show(struct kobject *kobj, struct attribute *attr,
-                              char *buf)
-{
-       struct hfi1_sc2vl_attr *sattr =
-               container_of(attr, struct hfi1_sc2vl_attr, attr);
-       struct hfi1_pportdata *ppd =
-               container_of(kobj, struct hfi1_pportdata, sc2vl_kobj);
-       struct hfi1_devdata *dd = ppd->dd;
-
-       return sprintf(buf, "%u\n", *((u8 *)dd->sc2vl + sattr->sc));
-}
-
-static const struct sysfs_ops hfi1_sc2vl_ops = {
-       .show = sc2vl_attr_show,
-};
-
-static struct kobj_type hfi1_sc2vl_ktype = {
-       .release = port_release,
-       .sysfs_ops = &hfi1_sc2vl_ops,
-       .default_attrs = sc2vl_default_attributes
-};
-
-/* End sc2vl */
-
-/* Start sl2sc */
-#define HFI1_SL2SC_ATTR(N)                                 \
-       static struct hfi1_sl2sc_attr hfi1_sl2sc_attr_##N = {     \
-               .attr = { .name = __stringify(N), .mode = 0444 }, \
-               .sl = N                                           \
-       }
-
-struct hfi1_sl2sc_attr {
-       struct attribute attr;
-       int sl;
-};
-
-HFI1_SL2SC_ATTR(0);
-HFI1_SL2SC_ATTR(1);
-HFI1_SL2SC_ATTR(2);
-HFI1_SL2SC_ATTR(3);
-HFI1_SL2SC_ATTR(4);
-HFI1_SL2SC_ATTR(5);
-HFI1_SL2SC_ATTR(6);
-HFI1_SL2SC_ATTR(7);
-HFI1_SL2SC_ATTR(8);
-HFI1_SL2SC_ATTR(9);
-HFI1_SL2SC_ATTR(10);
-HFI1_SL2SC_ATTR(11);
-HFI1_SL2SC_ATTR(12);
-HFI1_SL2SC_ATTR(13);
-HFI1_SL2SC_ATTR(14);
-HFI1_SL2SC_ATTR(15);
-HFI1_SL2SC_ATTR(16);
-HFI1_SL2SC_ATTR(17);
-HFI1_SL2SC_ATTR(18);
-HFI1_SL2SC_ATTR(19);
-HFI1_SL2SC_ATTR(20);
-HFI1_SL2SC_ATTR(21);
-HFI1_SL2SC_ATTR(22);
-HFI1_SL2SC_ATTR(23);
-HFI1_SL2SC_ATTR(24);
-HFI1_SL2SC_ATTR(25);
-HFI1_SL2SC_ATTR(26);
-HFI1_SL2SC_ATTR(27);
-HFI1_SL2SC_ATTR(28);
-HFI1_SL2SC_ATTR(29);
-HFI1_SL2SC_ATTR(30);
-HFI1_SL2SC_ATTR(31);
-
-static struct attribute *sl2sc_default_attributes[] = {
-       &hfi1_sl2sc_attr_0.attr,
-       &hfi1_sl2sc_attr_1.attr,
-       &hfi1_sl2sc_attr_2.attr,
-       &hfi1_sl2sc_attr_3.attr,
-       &hfi1_sl2sc_attr_4.attr,
-       &hfi1_sl2sc_attr_5.attr,
-       &hfi1_sl2sc_attr_6.attr,
-       &hfi1_sl2sc_attr_7.attr,
-       &hfi1_sl2sc_attr_8.attr,
-       &hfi1_sl2sc_attr_9.attr,
-       &hfi1_sl2sc_attr_10.attr,
-       &hfi1_sl2sc_attr_11.attr,
-       &hfi1_sl2sc_attr_12.attr,
-       &hfi1_sl2sc_attr_13.attr,
-       &hfi1_sl2sc_attr_14.attr,
-       &hfi1_sl2sc_attr_15.attr,
-       &hfi1_sl2sc_attr_16.attr,
-       &hfi1_sl2sc_attr_17.attr,
-       &hfi1_sl2sc_attr_18.attr,
-       &hfi1_sl2sc_attr_19.attr,
-       &hfi1_sl2sc_attr_20.attr,
-       &hfi1_sl2sc_attr_21.attr,
-       &hfi1_sl2sc_attr_22.attr,
-       &hfi1_sl2sc_attr_23.attr,
-       &hfi1_sl2sc_attr_24.attr,
-       &hfi1_sl2sc_attr_25.attr,
-       &hfi1_sl2sc_attr_26.attr,
-       &hfi1_sl2sc_attr_27.attr,
-       &hfi1_sl2sc_attr_28.attr,
-       &hfi1_sl2sc_attr_29.attr,
-       &hfi1_sl2sc_attr_30.attr,
-       &hfi1_sl2sc_attr_31.attr,
-       NULL
-};
-
-static ssize_t sl2sc_attr_show(struct kobject *kobj, struct attribute *attr,
-                              char *buf)
-{
-       struct hfi1_sl2sc_attr *sattr =
-               container_of(attr, struct hfi1_sl2sc_attr, attr);
-       struct hfi1_pportdata *ppd =
-               container_of(kobj, struct hfi1_pportdata, sl2sc_kobj);
-       struct hfi1_ibport *ibp = &ppd->ibport_data;
-
-       return sprintf(buf, "%u\n", ibp->sl_to_sc[sattr->sl]);
-}
-
-static const struct sysfs_ops hfi1_sl2sc_ops = {
-       .show = sl2sc_attr_show,
-};
-
-static struct kobj_type hfi1_sl2sc_ktype = {
-       .release = port_release,
-       .sysfs_ops = &hfi1_sl2sc_ops,
-       .default_attrs = sl2sc_default_attributes
-};
-
-/* End sl2sc */
-
-/* Start vl2mtu */
-
-#define HFI1_VL2MTU_ATTR(N) \
-       static struct hfi1_vl2mtu_attr hfi1_vl2mtu_attr_##N = { \
-               .attr = { .name = __stringify(N), .mode = 0444 }, \
-               .vl = N                                           \
-       }
-
-struct hfi1_vl2mtu_attr {
-       struct attribute attr;
-       int vl;
-};
-
-HFI1_VL2MTU_ATTR(0);
-HFI1_VL2MTU_ATTR(1);
-HFI1_VL2MTU_ATTR(2);
-HFI1_VL2MTU_ATTR(3);
-HFI1_VL2MTU_ATTR(4);
-HFI1_VL2MTU_ATTR(5);
-HFI1_VL2MTU_ATTR(6);
-HFI1_VL2MTU_ATTR(7);
-HFI1_VL2MTU_ATTR(8);
-HFI1_VL2MTU_ATTR(9);
-HFI1_VL2MTU_ATTR(10);
-HFI1_VL2MTU_ATTR(11);
-HFI1_VL2MTU_ATTR(12);
-HFI1_VL2MTU_ATTR(13);
-HFI1_VL2MTU_ATTR(14);
-HFI1_VL2MTU_ATTR(15);
-
-static struct attribute *vl2mtu_default_attributes[] = {
-       &hfi1_vl2mtu_attr_0.attr,
-       &hfi1_vl2mtu_attr_1.attr,
-       &hfi1_vl2mtu_attr_2.attr,
-       &hfi1_vl2mtu_attr_3.attr,
-       &hfi1_vl2mtu_attr_4.attr,
-       &hfi1_vl2mtu_attr_5.attr,
-       &hfi1_vl2mtu_attr_6.attr,
-       &hfi1_vl2mtu_attr_7.attr,
-       &hfi1_vl2mtu_attr_8.attr,
-       &hfi1_vl2mtu_attr_9.attr,
-       &hfi1_vl2mtu_attr_10.attr,
-       &hfi1_vl2mtu_attr_11.attr,
-       &hfi1_vl2mtu_attr_12.attr,
-       &hfi1_vl2mtu_attr_13.attr,
-       &hfi1_vl2mtu_attr_14.attr,
-       &hfi1_vl2mtu_attr_15.attr,
-       NULL
-};
-
-static ssize_t vl2mtu_attr_show(struct kobject *kobj, struct attribute *attr,
-                               char *buf)
-{
-       struct hfi1_vl2mtu_attr *vlattr =
-               container_of(attr, struct hfi1_vl2mtu_attr, attr);
-       struct hfi1_pportdata *ppd =
-               container_of(kobj, struct hfi1_pportdata, vl2mtu_kobj);
-       struct hfi1_devdata *dd = ppd->dd;
-
-       return sprintf(buf, "%u\n", dd->vld[vlattr->vl].mtu);
-}
-
-static const struct sysfs_ops hfi1_vl2mtu_ops = {
-       .show = vl2mtu_attr_show,
-};
-
-static struct kobj_type hfi1_vl2mtu_ktype = {
-       .release = port_release,
-       .sysfs_ops = &hfi1_vl2mtu_ops,
-       .default_attrs = vl2mtu_default_attributes
-};
-
-/* end of per-port file structures and support code */
-
-/*
- * Start of per-unit (or driver, in some cases, but replicated
- * per unit) functions (these get a device *)
- */
-static ssize_t show_rev(struct device *device, struct device_attribute *attr,
-                       char *buf)
-{
-       struct hfi1_ibdev *dev =
-               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
-
-       return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev);
-}
-
-static ssize_t show_hfi(struct device *device, struct device_attribute *attr,
-                       char *buf)
-{
-       struct hfi1_ibdev *dev =
-               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
-       struct hfi1_devdata *dd = dd_from_dev(dev);
-       int ret;
-
-       if (!dd->boardname)
-               ret = -EINVAL;
-       else
-               ret = scnprintf(buf, PAGE_SIZE, "%s\n", dd->boardname);
-       return ret;
-}
-
-static ssize_t show_boardversion(struct device *device,
-                                struct device_attribute *attr, char *buf)
-{
-       struct hfi1_ibdev *dev =
-               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
-       struct hfi1_devdata *dd = dd_from_dev(dev);
-
-       /* The string printed here is already newline-terminated. */
-       return scnprintf(buf, PAGE_SIZE, "%s", dd->boardversion);
-}
-
-static ssize_t show_nctxts(struct device *device,
-                          struct device_attribute *attr, char *buf)
-{
-       struct hfi1_ibdev *dev =
-               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
-       struct hfi1_devdata *dd = dd_from_dev(dev);
-
-       /*
-        * Return the smaller of send and receive contexts.
-        * Normally, user level applications would require both a send
-        * and a receive context, so returning the smaller of the two counts
-        * give a more accurate picture of total contexts available.
-        */
-       return scnprintf(buf, PAGE_SIZE, "%u\n",
-                        min(dd->num_rcv_contexts - dd->first_user_ctxt,
-                            (u32)dd->sc_sizes[SC_USER].count));
-}
-
-static ssize_t show_nfreectxts(struct device *device,
-                              struct device_attribute *attr, char *buf)
-{
-       struct hfi1_ibdev *dev =
-               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
-       struct hfi1_devdata *dd = dd_from_dev(dev);
-
-       /* Return the number of free user ports (contexts) available. */
-       return scnprintf(buf, PAGE_SIZE, "%u\n", dd->freectxts);
-}
-
-static ssize_t show_serial(struct device *device,
-                          struct device_attribute *attr, char *buf)
-{
-       struct hfi1_ibdev *dev =
-               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
-       struct hfi1_devdata *dd = dd_from_dev(dev);
-
-       return scnprintf(buf, PAGE_SIZE, "%s", dd->serial);
-}
-
-static ssize_t store_chip_reset(struct device *device,
-                               struct device_attribute *attr, const char *buf,
-                               size_t count)
-{
-       struct hfi1_ibdev *dev =
-               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
-       struct hfi1_devdata *dd = dd_from_dev(dev);
-       int ret;
-
-       if (count < 5 || memcmp(buf, "reset", 5) || !dd->diag_client) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       ret = hfi1_reset_device(dd->unit);
-bail:
-       return ret < 0 ? ret : count;
-}
-
-/*
- * Convert the reported temperature from an integer (reported in
- * units of 0.25C) to a floating point number.
- */
-#define temp2str(temp, buf, size, idx)                                 \
-       scnprintf((buf) + (idx), (size) - (idx), "%u.%02u ",            \
-                             ((temp) >> 2), ((temp) & 0x3) * 25)
-
-/*
- * Dump tempsense values, in decimal, to ease shell-scripts.
- */
-static ssize_t show_tempsense(struct device *device,
-                             struct device_attribute *attr, char *buf)
-{
-       struct hfi1_ibdev *dev =
-               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
-       struct hfi1_devdata *dd = dd_from_dev(dev);
-       struct hfi1_temp temp;
-       int ret;
-
-       ret = hfi1_tempsense_rd(dd, &temp);
-       if (!ret) {
-               int idx = 0;
-
-               idx += temp2str(temp.curr, buf, PAGE_SIZE, idx);
-               idx += temp2str(temp.lo_lim, buf, PAGE_SIZE, idx);
-               idx += temp2str(temp.hi_lim, buf, PAGE_SIZE, idx);
-               idx += temp2str(temp.crit_lim, buf, PAGE_SIZE, idx);
-               idx += scnprintf(buf + idx, PAGE_SIZE - idx,
-                               "%u %u %u\n", temp.triggers & 0x1,
-                               temp.triggers & 0x2, temp.triggers & 0x4);
-               ret = idx;
-       }
-       return ret;
-}
-
-/*
- * end of per-unit (or driver, in some cases, but replicated
- * per unit) functions
- */
-
-/* start of per-unit file structures and support code */
-static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(board_id, S_IRUGO, show_hfi, NULL);
-static DEVICE_ATTR(nctxts, S_IRUGO, show_nctxts, NULL);
-static DEVICE_ATTR(nfreectxts, S_IRUGO, show_nfreectxts, NULL);
-static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL);
-static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL);
-static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL);
-static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset);
-
-static struct device_attribute *hfi1_attributes[] = {
-       &dev_attr_hw_rev,
-       &dev_attr_board_id,
-       &dev_attr_nctxts,
-       &dev_attr_nfreectxts,
-       &dev_attr_serial,
-       &dev_attr_boardversion,
-       &dev_attr_tempsense,
-       &dev_attr_chip_reset,
-};
-
-int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
-                          struct kobject *kobj)
-{
-       struct hfi1_pportdata *ppd;
-       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-       int ret;
-
-       if (!port_num || port_num > dd->num_pports) {
-               dd_dev_err(dd,
-                          "Skipping infiniband class with invalid port %u\n",
-                          port_num);
-               return -ENODEV;
-       }
-       ppd = &dd->pport[port_num - 1];
-
-       ret = kobject_init_and_add(&ppd->sc2vl_kobj, &hfi1_sc2vl_ktype, kobj,
-                                  "sc2vl");
-       if (ret) {
-               dd_dev_err(dd,
-                          "Skipping sc2vl sysfs info, (err %d) port %u\n",
-                          ret, port_num);
-               goto bail;
-       }
-       kobject_uevent(&ppd->sc2vl_kobj, KOBJ_ADD);
-
-       ret = kobject_init_and_add(&ppd->sl2sc_kobj, &hfi1_sl2sc_ktype, kobj,
-                                  "sl2sc");
-       if (ret) {
-               dd_dev_err(dd,
-                          "Skipping sl2sc sysfs info, (err %d) port %u\n",
-                          ret, port_num);
-               goto bail_sc2vl;
-       }
-       kobject_uevent(&ppd->sl2sc_kobj, KOBJ_ADD);
-
-       ret = kobject_init_and_add(&ppd->vl2mtu_kobj, &hfi1_vl2mtu_ktype, kobj,
-                                  "vl2mtu");
-       if (ret) {
-               dd_dev_err(dd,
-                          "Skipping vl2mtu sysfs info, (err %d) port %u\n",
-                          ret, port_num);
-               goto bail_sl2sc;
-       }
-       kobject_uevent(&ppd->vl2mtu_kobj, KOBJ_ADD);
-
-       ret = kobject_init_and_add(&ppd->pport_cc_kobj, &port_cc_ktype,
-                                  kobj, "CCMgtA");
-       if (ret) {
-               dd_dev_err(dd,
-                          "Skipping Congestion Control sysfs info, (err %d) port %u\n",
-                          ret, port_num);
-               goto bail_vl2mtu;
-       }
-
-       kobject_uevent(&ppd->pport_cc_kobj, KOBJ_ADD);
-
-       ret = sysfs_create_bin_file(&ppd->pport_cc_kobj, &cc_setting_bin_attr);
-       if (ret) {
-               dd_dev_err(dd,
-                          "Skipping Congestion Control setting sysfs info, (err %d) port %u\n",
-                          ret, port_num);
-               goto bail_cc;
-       }
-
-       ret = sysfs_create_bin_file(&ppd->pport_cc_kobj, &cc_table_bin_attr);
-       if (ret) {
-               dd_dev_err(dd,
-                          "Skipping Congestion Control table sysfs info, (err %d) port %u\n",
-                          ret, port_num);
-               goto bail_cc_entry_bin;
-       }
-
-       dd_dev_info(dd,
-                   "IB%u: Congestion Control Agent enabled for port %d\n",
-                   dd->unit, port_num);
-
-       return 0;
-
-bail_cc_entry_bin:
-       sysfs_remove_bin_file(&ppd->pport_cc_kobj,
-                             &cc_setting_bin_attr);
-bail_cc:
-       kobject_put(&ppd->pport_cc_kobj);
-bail_vl2mtu:
-       kobject_put(&ppd->vl2mtu_kobj);
-bail_sl2sc:
-       kobject_put(&ppd->sl2sc_kobj);
-bail_sc2vl:
-       kobject_put(&ppd->sc2vl_kobj);
-bail:
-       return ret;
-}
-
-/*
- * Register and create our files in /sys/class/infiniband.
- */
-int hfi1_verbs_register_sysfs(struct hfi1_devdata *dd)
-{
-       struct ib_device *dev = &dd->verbs_dev.rdi.ibdev;
-       int i, ret;
-
-       for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i) {
-               ret = device_create_file(&dev->dev, hfi1_attributes[i]);
-               if (ret)
-                       goto bail;
-       }
-
-       return 0;
-bail:
-       for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i)
-               device_remove_file(&dev->dev, hfi1_attributes[i]);
-       return ret;
-}
-
-/*
- * Unregister and remove our files in /sys/class/infiniband.
- */
-void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *dd)
-{
-       struct hfi1_pportdata *ppd;
-       int i;
-
-       for (i = 0; i < dd->num_pports; i++) {
-               ppd = &dd->pport[i];
-
-               sysfs_remove_bin_file(&ppd->pport_cc_kobj,
-                                     &cc_setting_bin_attr);
-               sysfs_remove_bin_file(&ppd->pport_cc_kobj,
-                                     &cc_table_bin_attr);
-               kobject_put(&ppd->pport_cc_kobj);
-               kobject_put(&ppd->vl2mtu_kobj);
-               kobject_put(&ppd->sl2sc_kobj);
-               kobject_put(&ppd->sc2vl_kobj);
-       }
-}
diff --git a/drivers/staging/rdma/hfi1/trace.c b/drivers/staging/rdma/hfi1/trace.c
deleted file mode 100644 (file)
index 8b62fef..0000000
+++ /dev/null
@@ -1,235 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#define CREATE_TRACE_POINTS
-#include "trace.h"
-
-u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr)
-{
-       struct hfi1_other_headers *ohdr;
-       u8 opcode;
-       u8 lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
-
-       if (lnh == HFI1_LRH_BTH)
-               ohdr = &hdr->u.oth;
-       else
-               ohdr = &hdr->u.l.oth;
-       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
-       return hdr_len_by_opcode[opcode] == 0 ?
-              0 : hdr_len_by_opcode[opcode] - (12 + 8);
-}
-
-#define IMM_PRN  "imm %d"
-#define RETH_PRN "reth vaddr 0x%.16llx rkey 0x%.8x dlen 0x%.8x"
-#define AETH_PRN "aeth syn 0x%.2x %s msn 0x%.8x"
-#define DETH_PRN "deth qkey 0x%.8x sqpn 0x%.6x"
-#define ATOMICACKETH_PRN "origdata %lld"
-#define ATOMICETH_PRN "vaddr 0x%llx rkey 0x%.8x sdata %lld cdata %lld"
-
-#define OP(transport, op) IB_OPCODE_## transport ## _ ## op
-
-static u64 ib_u64_get(__be32 *p)
-{
-       return ((u64)be32_to_cpu(p[0]) << 32) | be32_to_cpu(p[1]);
-}
-
-static const char *parse_syndrome(u8 syndrome)
-{
-       switch (syndrome >> 5) {
-       case 0:
-               return "ACK";
-       case 1:
-               return "RNRNAK";
-       case 3:
-               return "NAK";
-       }
-       return "";
-}
-
-const char *parse_everbs_hdrs(
-       struct trace_seq *p,
-       u8 opcode,
-       void *ehdrs)
-{
-       union ib_ehdrs *eh = ehdrs;
-       const char *ret = trace_seq_buffer_ptr(p);
-
-       switch (opcode) {
-       /* imm */
-       case OP(RC, SEND_LAST_WITH_IMMEDIATE):
-       case OP(UC, SEND_LAST_WITH_IMMEDIATE):
-       case OP(RC, SEND_ONLY_WITH_IMMEDIATE):
-       case OP(UC, SEND_ONLY_WITH_IMMEDIATE):
-       case OP(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE):
-       case OP(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE):
-               trace_seq_printf(p, IMM_PRN,
-                                be32_to_cpu(eh->imm_data));
-               break;
-       /* reth + imm */
-       case OP(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE):
-       case OP(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE):
-               trace_seq_printf(p, RETH_PRN " " IMM_PRN,
-                                (unsigned long long)ib_u64_get(
-                                (__be32 *)&eh->rc.reth.vaddr),
-                                be32_to_cpu(eh->rc.reth.rkey),
-                                be32_to_cpu(eh->rc.reth.length),
-                                be32_to_cpu(eh->rc.imm_data));
-               break;
-       /* reth */
-       case OP(RC, RDMA_READ_REQUEST):
-       case OP(RC, RDMA_WRITE_FIRST):
-       case OP(UC, RDMA_WRITE_FIRST):
-       case OP(RC, RDMA_WRITE_ONLY):
-       case OP(UC, RDMA_WRITE_ONLY):
-               trace_seq_printf(p, RETH_PRN,
-                                (unsigned long long)ib_u64_get(
-                                (__be32 *)&eh->rc.reth.vaddr),
-                                be32_to_cpu(eh->rc.reth.rkey),
-                                be32_to_cpu(eh->rc.reth.length));
-               break;
-       case OP(RC, RDMA_READ_RESPONSE_FIRST):
-       case OP(RC, RDMA_READ_RESPONSE_LAST):
-       case OP(RC, RDMA_READ_RESPONSE_ONLY):
-       case OP(RC, ACKNOWLEDGE):
-               trace_seq_printf(p, AETH_PRN, be32_to_cpu(eh->aeth) >> 24,
-                                parse_syndrome(be32_to_cpu(eh->aeth) >> 24),
-                                be32_to_cpu(eh->aeth) & HFI1_MSN_MASK);
-               break;
-       /* aeth + atomicacketh */
-       case OP(RC, ATOMIC_ACKNOWLEDGE):
-               trace_seq_printf(p, AETH_PRN " " ATOMICACKETH_PRN,
-                                be32_to_cpu(eh->at.aeth) >> 24,
-                                parse_syndrome(be32_to_cpu(eh->at.aeth) >> 24),
-                                be32_to_cpu(eh->at.aeth) & HFI1_MSN_MASK,
-                                (unsigned long long)
-                                ib_u64_get(eh->at.atomic_ack_eth));
-               break;
-       /* atomiceth */
-       case OP(RC, COMPARE_SWAP):
-       case OP(RC, FETCH_ADD):
-               trace_seq_printf(p, ATOMICETH_PRN,
-                                (unsigned long long)ib_u64_get(
-                                eh->atomic_eth.vaddr),
-                                eh->atomic_eth.rkey,
-                                (unsigned long long)ib_u64_get(
-                                (__be32 *)&eh->atomic_eth.swap_data),
-                                (unsigned long long)ib_u64_get(
-                                (__be32 *)&eh->atomic_eth.compare_data));
-               break;
-       /* deth */
-       case OP(UD, SEND_ONLY):
-       case OP(UD, SEND_ONLY_WITH_IMMEDIATE):
-               trace_seq_printf(p, DETH_PRN,
-                                be32_to_cpu(eh->ud.deth[0]),
-                                be32_to_cpu(eh->ud.deth[1]) & RVT_QPN_MASK);
-               break;
-       }
-       trace_seq_putc(p, 0);
-       return ret;
-}
-
-const char *parse_sdma_flags(
-       struct trace_seq *p,
-       u64 desc0, u64 desc1)
-{
-       const char *ret = trace_seq_buffer_ptr(p);
-       char flags[5] = { 'x', 'x', 'x', 'x', 0 };
-
-       flags[0] = (desc1 & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
-       flags[1] = (desc1 & SDMA_DESC1_HEAD_TO_HOST_FLAG) ?  'H' : '-';
-       flags[2] = (desc0 & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
-       flags[3] = (desc0 & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
-       trace_seq_printf(p, "%s", flags);
-       if (desc0 & SDMA_DESC0_FIRST_DESC_FLAG)
-               trace_seq_printf(p, " amode:%u aidx:%u alen:%u",
-                                (u8)((desc1 >> SDMA_DESC1_HEADER_MODE_SHIFT) &
-                                     SDMA_DESC1_HEADER_MODE_MASK),
-                                (u8)((desc1 >> SDMA_DESC1_HEADER_INDEX_SHIFT) &
-                                     SDMA_DESC1_HEADER_INDEX_MASK),
-                                (u8)((desc1 >> SDMA_DESC1_HEADER_DWS_SHIFT) &
-                                     SDMA_DESC1_HEADER_DWS_MASK));
-       return ret;
-}
-
-const char *print_u32_array(
-       struct trace_seq *p,
-       u32 *arr, int len)
-{
-       int i;
-       const char *ret = trace_seq_buffer_ptr(p);
-
-       for (i = 0; i < len ; i++)
-               trace_seq_printf(p, "%s%#x", i == 0 ? "" : " ", arr[i]);
-       trace_seq_putc(p, 0);
-       return ret;
-}
-
-const char *print_u64_array(
-       struct trace_seq *p,
-       u64 *arr, int len)
-{
-       int i;
-       const char *ret = trace_seq_buffer_ptr(p);
-
-       for (i = 0; i < len; i++)
-               trace_seq_printf(p, "%s0x%016llx", i == 0 ? "" : " ", arr[i]);
-       trace_seq_putc(p, 0);
-       return ret;
-}
-
-__hfi1_trace_fn(PKT);
-__hfi1_trace_fn(PROC);
-__hfi1_trace_fn(SDMA);
-__hfi1_trace_fn(LINKVERB);
-__hfi1_trace_fn(DEBUG);
-__hfi1_trace_fn(SNOOP);
-__hfi1_trace_fn(CNTR);
-__hfi1_trace_fn(PIO);
-__hfi1_trace_fn(DC8051);
-__hfi1_trace_fn(FIRMWARE);
-__hfi1_trace_fn(RCVCTRL);
-__hfi1_trace_fn(TID);
-__hfi1_trace_fn(MMU);
diff --git a/drivers/staging/rdma/hfi1/trace.h b/drivers/staging/rdma/hfi1/trace.h
deleted file mode 100644 (file)
index 963dc94..0000000
+++ /dev/null
@@ -1,1369 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#undef TRACE_SYSTEM_VAR
-#define TRACE_SYSTEM_VAR hfi1
-
-#if !defined(__HFI1_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
-#define __HFI1_TRACE_H
-
-#include <linux/tracepoint.h>
-#include <linux/trace_seq.h>
-
-#include "hfi.h"
-#include "mad.h"
-#include "sdma.h"
-
-#define DD_DEV_ENTRY(dd)       __string(dev, dev_name(&(dd)->pcidev->dev))
-#define DD_DEV_ASSIGN(dd)      __assign_str(dev, dev_name(&(dd)->pcidev->dev))
-
-#define packettype_name(etype) { RHF_RCV_TYPE_##etype, #etype }
-#define show_packettype(etype)                  \
-__print_symbolic(etype,                         \
-       packettype_name(EXPECTED),              \
-       packettype_name(EAGER),                 \
-       packettype_name(IB),                    \
-       packettype_name(ERROR),                 \
-       packettype_name(BYPASS))
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_rx
-
-TRACE_EVENT(hfi1_rcvhdr,
-           TP_PROTO(struct hfi1_devdata *dd,
-                    u64 eflags,
-                    u32 ctxt,
-                    u32 etype,
-                    u32 hlen,
-                    u32 tlen,
-                    u32 updegr,
-                    u32 etail
-                    ),
-           TP_ARGS(dd, ctxt, eflags, etype, hlen, tlen, updegr, etail),
-           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
-                            __field(u64, eflags)
-                            __field(u32, ctxt)
-                            __field(u32, etype)
-                            __field(u32, hlen)
-                            __field(u32, tlen)
-                            __field(u32, updegr)
-                            __field(u32, etail)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(dd);
-                          __entry->eflags = eflags;
-                          __entry->ctxt = ctxt;
-                          __entry->etype = etype;
-                          __entry->hlen = hlen;
-                          __entry->tlen = tlen;
-                          __entry->updegr = updegr;
-                          __entry->etail = etail;
-                          ),
-           TP_printk(
-                     "[%s] ctxt %d eflags 0x%llx etype %d,%s hlen %d tlen %d updegr %d etail %d",
-                     __get_str(dev),
-                     __entry->ctxt,
-                     __entry->eflags,
-                     __entry->etype, show_packettype(__entry->etype),
-                     __entry->hlen,
-                     __entry->tlen,
-                     __entry->updegr,
-                     __entry->etail
-                     )
-);
-
-TRACE_EVENT(hfi1_receive_interrupt,
-           TP_PROTO(struct hfi1_devdata *dd, u32 ctxt),
-           TP_ARGS(dd, ctxt),
-           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
-                            __field(u32, ctxt)
-                            __field(u8, slow_path)
-                            __field(u8, dma_rtail)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(dd);
-                          __entry->ctxt = ctxt;
-                          if (dd->rcd[ctxt]->do_interrupt ==
-                              &handle_receive_interrupt) {
-                               __entry->slow_path = 1;
-                               __entry->dma_rtail = 0xFF;
-                          } else if (dd->rcd[ctxt]->do_interrupt ==
-                                     &handle_receive_interrupt_dma_rtail){
-                               __entry->dma_rtail = 1;
-                               __entry->slow_path = 0;
-                          } else if (dd->rcd[ctxt]->do_interrupt ==
-                                     &handle_receive_interrupt_nodma_rtail) {
-                               __entry->dma_rtail = 0;
-                               __entry->slow_path = 0;
-                          }
-                          ),
-           TP_printk("[%s] ctxt %d SlowPath: %d DmaRtail: %d",
-                     __get_str(dev),
-                     __entry->ctxt,
-                     __entry->slow_path,
-                     __entry->dma_rtail
-                     )
-);
-
-TRACE_EVENT(hfi1_exp_tid_reg,
-           TP_PROTO(unsigned ctxt, u16 subctxt, u32 rarr,
-                    u32 npages, unsigned long va, unsigned long pa,
-                    dma_addr_t dma),
-           TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
-           TP_STRUCT__entry(
-                   __field(unsigned, ctxt)
-                   __field(u16, subctxt)
-                   __field(u32, rarr)
-                   __field(u32, npages)
-                   __field(unsigned long, va)
-                   __field(unsigned long, pa)
-                   __field(dma_addr_t, dma)
-                   ),
-           TP_fast_assign(
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __entry->rarr = rarr;
-                   __entry->npages = npages;
-                   __entry->va = va;
-                   __entry->pa = pa;
-                   __entry->dma = dma;
-                   ),
-           TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __entry->rarr,
-                     __entry->npages,
-                     __entry->pa,
-                     __entry->va,
-                     __entry->dma
-                   )
-       );
-
-TRACE_EVENT(hfi1_exp_tid_unreg,
-           TP_PROTO(unsigned ctxt, u16 subctxt, u32 rarr, u32 npages,
-                    unsigned long va, unsigned long pa, dma_addr_t dma),
-           TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
-           TP_STRUCT__entry(
-                   __field(unsigned, ctxt)
-                   __field(u16, subctxt)
-                   __field(u32, rarr)
-                   __field(u32, npages)
-                   __field(unsigned long, va)
-                   __field(unsigned long, pa)
-                   __field(dma_addr_t, dma)
-                   ),
-           TP_fast_assign(
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __entry->rarr = rarr;
-                   __entry->npages = npages;
-                   __entry->va = va;
-                   __entry->pa = pa;
-                   __entry->dma = dma;
-                   ),
-           TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __entry->rarr,
-                     __entry->npages,
-                     __entry->pa,
-                     __entry->va,
-                     __entry->dma
-                   )
-       );
-
-TRACE_EVENT(hfi1_exp_tid_inval,
-           TP_PROTO(unsigned ctxt, u16 subctxt, unsigned long va, u32 rarr,
-                    u32 npages, dma_addr_t dma),
-           TP_ARGS(ctxt, subctxt, va, rarr, npages, dma),
-           TP_STRUCT__entry(
-                   __field(unsigned, ctxt)
-                   __field(u16, subctxt)
-                   __field(unsigned long, va)
-                   __field(u32, rarr)
-                   __field(u32, npages)
-                   __field(dma_addr_t, dma)
-                   ),
-           TP_fast_assign(
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __entry->va = va;
-                   __entry->rarr = rarr;
-                   __entry->npages = npages;
-                   __entry->dma = dma;
-                   ),
-           TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx",
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __entry->rarr,
-                     __entry->npages,
-                     __entry->va,
-                     __entry->dma
-                   )
-       );
-
-TRACE_EVENT(hfi1_mmu_invalidate,
-           TP_PROTO(unsigned ctxt, u16 subctxt, const char *type,
-                    unsigned long start, unsigned long end),
-           TP_ARGS(ctxt, subctxt, type, start, end),
-           TP_STRUCT__entry(
-                   __field(unsigned, ctxt)
-                   __field(u16, subctxt)
-                   __string(type, type)
-                   __field(unsigned long, start)
-                   __field(unsigned long, end)
-                   ),
-           TP_fast_assign(
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __assign_str(type, type);
-                   __entry->start = start;
-                   __entry->end = end;
-                   ),
-           TP_printk("[%3u:%02u] MMU Invalidate (%s) 0x%lx - 0x%lx",
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __get_str(type),
-                     __entry->start,
-                     __entry->end
-                   )
-       );
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_tx
-
-TRACE_EVENT(hfi1_piofree,
-           TP_PROTO(struct send_context *sc, int extra),
-           TP_ARGS(sc, extra),
-           TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd)
-                            __field(u32, sw_index)
-                            __field(u32, hw_context)
-                            __field(int, extra)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(sc->dd);
-                          __entry->sw_index = sc->sw_index;
-                          __entry->hw_context = sc->hw_context;
-                          __entry->extra = extra;
-                          ),
-           TP_printk("[%s] ctxt %u(%u) extra %d",
-                     __get_str(dev),
-                     __entry->sw_index,
-                     __entry->hw_context,
-                     __entry->extra
-                     )
-);
-
-TRACE_EVENT(hfi1_wantpiointr,
-           TP_PROTO(struct send_context *sc, u32 needint, u64 credit_ctrl),
-           TP_ARGS(sc, needint, credit_ctrl),
-           TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd)
-                            __field(u32, sw_index)
-                            __field(u32, hw_context)
-                            __field(u32, needint)
-                            __field(u64, credit_ctrl)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(sc->dd);
-                          __entry->sw_index = sc->sw_index;
-                          __entry->hw_context = sc->hw_context;
-                          __entry->needint = needint;
-                          __entry->credit_ctrl = credit_ctrl;
-                          ),
-           TP_printk("[%s] ctxt %u(%u) on %d credit_ctrl 0x%llx",
-                     __get_str(dev),
-                     __entry->sw_index,
-                     __entry->hw_context,
-                     __entry->needint,
-                     (unsigned long long)__entry->credit_ctrl
-                      )
-);
-
-DECLARE_EVENT_CLASS(hfi1_qpsleepwakeup_template,
-                   TP_PROTO(struct rvt_qp *qp, u32 flags),
-                   TP_ARGS(qp, flags),
-                   TP_STRUCT__entry(
-                           DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
-                           __field(u32, qpn)
-                           __field(u32, flags)
-                           __field(u32, s_flags)
-                           ),
-                   TP_fast_assign(
-                           DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
-                           __entry->flags = flags;
-                           __entry->qpn = qp->ibqp.qp_num;
-                           __entry->s_flags = qp->s_flags;
-                           ),
-                   TP_printk(
-                           "[%s] qpn 0x%x flags 0x%x s_flags 0x%x",
-                           __get_str(dev),
-                           __entry->qpn,
-                           __entry->flags,
-                           __entry->s_flags
-                           )
-);
-
-DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpwakeup,
-            TP_PROTO(struct rvt_qp *qp, u32 flags),
-            TP_ARGS(qp, flags));
-
-DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpsleep,
-            TP_PROTO(struct rvt_qp *qp, u32 flags),
-            TP_ARGS(qp, flags));
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_ibhdrs
-
-u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr);
-const char *parse_everbs_hdrs(struct trace_seq *p, u8 opcode, void *ehdrs);
-
-#define __parse_ib_ehdrs(op, ehdrs) parse_everbs_hdrs(p, op, ehdrs)
-
-const char *parse_sdma_flags(struct trace_seq *p, u64 desc0, u64 desc1);
-
-#define __parse_sdma_flags(desc0, desc1) parse_sdma_flags(p, desc0, desc1)
-
-#define lrh_name(lrh) { HFI1_##lrh, #lrh }
-#define show_lnh(lrh)                    \
-__print_symbolic(lrh,                    \
-       lrh_name(LRH_BTH),               \
-       lrh_name(LRH_GRH))
-
-#define ib_opcode_name(opcode) { IB_OPCODE_##opcode, #opcode  }
-#define show_ib_opcode(opcode)                             \
-__print_symbolic(opcode,                                   \
-       ib_opcode_name(RC_SEND_FIRST),                     \
-       ib_opcode_name(RC_SEND_MIDDLE),                    \
-       ib_opcode_name(RC_SEND_LAST),                      \
-       ib_opcode_name(RC_SEND_LAST_WITH_IMMEDIATE),       \
-       ib_opcode_name(RC_SEND_ONLY),                      \
-       ib_opcode_name(RC_SEND_ONLY_WITH_IMMEDIATE),       \
-       ib_opcode_name(RC_RDMA_WRITE_FIRST),               \
-       ib_opcode_name(RC_RDMA_WRITE_MIDDLE),              \
-       ib_opcode_name(RC_RDMA_WRITE_LAST),                \
-       ib_opcode_name(RC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
-       ib_opcode_name(RC_RDMA_WRITE_ONLY),                \
-       ib_opcode_name(RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
-       ib_opcode_name(RC_RDMA_READ_REQUEST),              \
-       ib_opcode_name(RC_RDMA_READ_RESPONSE_FIRST),       \
-       ib_opcode_name(RC_RDMA_READ_RESPONSE_MIDDLE),      \
-       ib_opcode_name(RC_RDMA_READ_RESPONSE_LAST),        \
-       ib_opcode_name(RC_RDMA_READ_RESPONSE_ONLY),        \
-       ib_opcode_name(RC_ACKNOWLEDGE),                    \
-       ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE),             \
-       ib_opcode_name(RC_COMPARE_SWAP),                   \
-       ib_opcode_name(RC_FETCH_ADD),                      \
-       ib_opcode_name(UC_SEND_FIRST),                     \
-       ib_opcode_name(UC_SEND_MIDDLE),                    \
-       ib_opcode_name(UC_SEND_LAST),                      \
-       ib_opcode_name(UC_SEND_LAST_WITH_IMMEDIATE),       \
-       ib_opcode_name(UC_SEND_ONLY),                      \
-       ib_opcode_name(UC_SEND_ONLY_WITH_IMMEDIATE),       \
-       ib_opcode_name(UC_RDMA_WRITE_FIRST),               \
-       ib_opcode_name(UC_RDMA_WRITE_MIDDLE),              \
-       ib_opcode_name(UC_RDMA_WRITE_LAST),                \
-       ib_opcode_name(UC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
-       ib_opcode_name(UC_RDMA_WRITE_ONLY),                \
-       ib_opcode_name(UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
-       ib_opcode_name(UD_SEND_ONLY),                      \
-       ib_opcode_name(UD_SEND_ONLY_WITH_IMMEDIATE),       \
-       ib_opcode_name(CNP))
-
-#define LRH_PRN "vl %d lver %d sl %d lnh %d,%s dlid %.4x len %d slid %.4x"
-#define BTH_PRN \
-       "op 0x%.2x,%s se %d m %d pad %d tver %d pkey 0x%.4x " \
-       "f %d b %d qpn 0x%.6x a %d psn 0x%.8x"
-#define EHDR_PRN "%s"
-
-DECLARE_EVENT_CLASS(hfi1_ibhdr_template,
-                   TP_PROTO(struct hfi1_devdata *dd,
-                            struct hfi1_ib_header *hdr),
-                   TP_ARGS(dd, hdr),
-                   TP_STRUCT__entry(
-                           DD_DEV_ENTRY(dd)
-                           /* LRH */
-                           __field(u8, vl)
-                           __field(u8, lver)
-                           __field(u8, sl)
-                           __field(u8, lnh)
-                           __field(u16, dlid)
-                           __field(u16, len)
-                           __field(u16, slid)
-                           /* BTH */
-                           __field(u8, opcode)
-                           __field(u8, se)
-                           __field(u8, m)
-                           __field(u8, pad)
-                           __field(u8, tver)
-                           __field(u16, pkey)
-                           __field(u8, f)
-                           __field(u8, b)
-                           __field(u32, qpn)
-                           __field(u8, a)
-                           __field(u32, psn)
-                           /* extended headers */
-                           __dynamic_array(u8, ehdrs, ibhdr_exhdr_len(hdr))
-                           ),
-                   TP_fast_assign(
-                          struct hfi1_other_headers *ohdr;
-
-                          DD_DEV_ASSIGN(dd);
-                          /* LRH */
-                          __entry->vl =
-                          (u8)(be16_to_cpu(hdr->lrh[0]) >> 12);
-                          __entry->lver =
-                          (u8)(be16_to_cpu(hdr->lrh[0]) >> 8) & 0xf;
-                          __entry->sl =
-                          (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf;
-                          __entry->lnh =
-                          (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
-                          __entry->dlid =
-                          be16_to_cpu(hdr->lrh[1]);
-                          /* allow for larger len */
-                          __entry->len =
-                          be16_to_cpu(hdr->lrh[2]);
-                          __entry->slid =
-                          be16_to_cpu(hdr->lrh[3]);
-                          /* BTH */
-                          if (__entry->lnh == HFI1_LRH_BTH)
-                               ohdr = &hdr->u.oth;
-                          else
-                               ohdr = &hdr->u.l.oth;
-                         __entry->opcode =
-                         (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
-                         __entry->se =
-                         (be32_to_cpu(ohdr->bth[0]) >> 23) & 1;
-                         __entry->m =
-                         (be32_to_cpu(ohdr->bth[0]) >> 22) & 1;
-                         __entry->pad =
-                         (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-                         __entry->tver =
-                         (be32_to_cpu(ohdr->bth[0]) >> 16) & 0xf;
-                         __entry->pkey =
-                         be32_to_cpu(ohdr->bth[0]) & 0xffff;
-                         __entry->f =
-                         (be32_to_cpu(ohdr->bth[1]) >> HFI1_FECN_SHIFT) &
-                         HFI1_FECN_MASK;
-                         __entry->b =
-                         (be32_to_cpu(ohdr->bth[1]) >> HFI1_BECN_SHIFT) &
-                         HFI1_BECN_MASK;
-                         __entry->qpn =
-                         be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
-                         __entry->a =
-                         (be32_to_cpu(ohdr->bth[2]) >> 31) & 1;
-                         /* allow for larger PSN */
-                         __entry->psn =
-                         be32_to_cpu(ohdr->bth[2]) & 0x7fffffff;
-                         /* extended headers */
-                         memcpy(__get_dynamic_array(ehdrs), &ohdr->u,
-                                ibhdr_exhdr_len(hdr));
-                        ),
-                   TP_printk("[%s] " LRH_PRN " " BTH_PRN " " EHDR_PRN,
-                             __get_str(dev),
-                             /* LRH */
-                             __entry->vl,
-                             __entry->lver,
-                             __entry->sl,
-                             __entry->lnh, show_lnh(__entry->lnh),
-                             __entry->dlid,
-                             __entry->len,
-                             __entry->slid,
-                             /* BTH */
-                             __entry->opcode, show_ib_opcode(__entry->opcode),
-                             __entry->se,
-                             __entry->m,
-                             __entry->pad,
-                             __entry->tver,
-                             __entry->pkey,
-                             __entry->f,
-                             __entry->b,
-                             __entry->qpn,
-                             __entry->a,
-                             __entry->psn,
-                             /* extended headers */
-                             __parse_ib_ehdrs(
-                                       __entry->opcode,
-                                       (void *)__get_dynamic_array(ehdrs))
-                            )
-);
-
-DEFINE_EVENT(hfi1_ibhdr_template, input_ibhdr,
-            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
-            TP_ARGS(dd, hdr));
-
-DEFINE_EVENT(hfi1_ibhdr_template, pio_output_ibhdr,
-            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
-            TP_ARGS(dd, hdr));
-
-DEFINE_EVENT(hfi1_ibhdr_template, ack_output_ibhdr,
-            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
-            TP_ARGS(dd, hdr));
-
-DEFINE_EVENT(hfi1_ibhdr_template, sdma_output_ibhdr,
-            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
-            TP_ARGS(dd, hdr));
-
-#define SNOOP_PRN \
-       "slid %.4x dlid %.4x qpn 0x%.6x opcode 0x%.2x,%s " \
-       "svc lvl %d pkey 0x%.4x [header = %d bytes] [data = %d bytes]"
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_snoop
-
-TRACE_EVENT(snoop_capture,
-           TP_PROTO(struct hfi1_devdata *dd,
-                    int hdr_len,
-                    struct hfi1_ib_header *hdr,
-                    int data_len,
-                    void *data),
-           TP_ARGS(dd, hdr_len, hdr, data_len, data),
-           TP_STRUCT__entry(
-               DD_DEV_ENTRY(dd)
-               __field(u16, slid)
-               __field(u16, dlid)
-               __field(u32, qpn)
-               __field(u8, opcode)
-               __field(u8, sl)
-               __field(u16, pkey)
-               __field(u32, hdr_len)
-               __field(u32, data_len)
-               __field(u8, lnh)
-               __dynamic_array(u8, raw_hdr, hdr_len)
-               __dynamic_array(u8, raw_pkt, data_len)
-               ),
-           TP_fast_assign(
-               struct hfi1_other_headers *ohdr;
-
-               __entry->lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
-               if (__entry->lnh == HFI1_LRH_BTH)
-                       ohdr = &hdr->u.oth;
-               else
-                       ohdr = &hdr->u.l.oth;
-               DD_DEV_ASSIGN(dd);
-               __entry->slid = be16_to_cpu(hdr->lrh[3]);
-               __entry->dlid = be16_to_cpu(hdr->lrh[1]);
-               __entry->qpn = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
-               __entry->opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
-               __entry->sl = (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf;
-               __entry->pkey = be32_to_cpu(ohdr->bth[0]) & 0xffff;
-               __entry->hdr_len = hdr_len;
-               __entry->data_len = data_len;
-               memcpy(__get_dynamic_array(raw_hdr), hdr, hdr_len);
-               memcpy(__get_dynamic_array(raw_pkt), data, data_len);
-               ),
-           TP_printk(
-               "[%s] " SNOOP_PRN,
-               __get_str(dev),
-               __entry->slid,
-               __entry->dlid,
-               __entry->qpn,
-               __entry->opcode,
-               show_ib_opcode(__entry->opcode),
-               __entry->sl,
-               __entry->pkey,
-               __entry->hdr_len,
-               __entry->data_len
-               )
-);
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_ctxts
-
-#define UCTXT_FMT \
-       "cred:%u, credaddr:0x%llx, piobase:0x%llx, rcvhdr_cnt:%u, "     \
-       "rcvbase:0x%llx, rcvegrc:%u, rcvegrb:0x%llx"
-TRACE_EVENT(hfi1_uctxtdata,
-           TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt),
-           TP_ARGS(dd, uctxt),
-           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
-                            __field(unsigned, ctxt)
-                            __field(u32, credits)
-                            __field(u64, hw_free)
-                            __field(u64, piobase)
-                            __field(u16, rcvhdrq_cnt)
-                            __field(u64, rcvhdrq_phys)
-                            __field(u32, eager_cnt)
-                            __field(u64, rcvegr_phys)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(dd);
-                          __entry->ctxt = uctxt->ctxt;
-                          __entry->credits = uctxt->sc->credits;
-                          __entry->hw_free = (u64)uctxt->sc->hw_free;
-                          __entry->piobase = (u64)uctxt->sc->base_addr;
-                          __entry->rcvhdrq_cnt = uctxt->rcvhdrq_cnt;
-                          __entry->rcvhdrq_phys = uctxt->rcvhdrq_phys;
-                          __entry->eager_cnt = uctxt->egrbufs.alloced;
-                          __entry->rcvegr_phys =
-                          uctxt->egrbufs.rcvtids[0].phys;
-                          ),
-           TP_printk("[%s] ctxt %u " UCTXT_FMT,
-                     __get_str(dev),
-                     __entry->ctxt,
-                     __entry->credits,
-                     __entry->hw_free,
-                     __entry->piobase,
-                     __entry->rcvhdrq_cnt,
-                     __entry->rcvhdrq_phys,
-                     __entry->eager_cnt,
-                     __entry->rcvegr_phys
-                     )
-);
-
-#define CINFO_FMT \
-       "egrtids:%u, egr_size:%u, hdrq_cnt:%u, hdrq_size:%u, sdma_ring_size:%u"
-TRACE_EVENT(hfi1_ctxt_info,
-           TP_PROTO(struct hfi1_devdata *dd, unsigned ctxt, unsigned subctxt,
-                    struct hfi1_ctxt_info cinfo),
-           TP_ARGS(dd, ctxt, subctxt, cinfo),
-           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
-                            __field(unsigned, ctxt)
-                            __field(unsigned, subctxt)
-                            __field(u16, egrtids)
-                            __field(u16, rcvhdrq_cnt)
-                            __field(u16, rcvhdrq_size)
-                            __field(u16, sdma_ring_size)
-                            __field(u32, rcvegr_size)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(dd);
-                           __entry->ctxt = ctxt;
-                           __entry->subctxt = subctxt;
-                           __entry->egrtids = cinfo.egrtids;
-                           __entry->rcvhdrq_cnt = cinfo.rcvhdrq_cnt;
-                           __entry->rcvhdrq_size = cinfo.rcvhdrq_entsize;
-                           __entry->sdma_ring_size = cinfo.sdma_ring_size;
-                           __entry->rcvegr_size = cinfo.rcvegr_size;
-                           ),
-           TP_printk("[%s] ctxt %u:%u " CINFO_FMT,
-                     __get_str(dev),
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __entry->egrtids,
-                     __entry->rcvegr_size,
-                     __entry->rcvhdrq_cnt,
-                     __entry->rcvhdrq_size,
-                     __entry->sdma_ring_size
-                     )
-);
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_sma
-
-#define BCT_FORMAT \
-       "shared_limit %x vls 0-7 [%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x] 15 [%x,%x]"
-
-#define BCT(field) \
-       be16_to_cpu( \
-               ((struct buffer_control *)__get_dynamic_array(bct))->field \
-       )
-
-DECLARE_EVENT_CLASS(hfi1_bct_template, /* shared layout for bct_set and bct_get */
-                   TP_PROTO(struct hfi1_devdata *dd,
-                            struct buffer_control *bc),
-                   TP_ARGS(dd, bc),
-                   TP_STRUCT__entry(DD_DEV_ENTRY(dd)
-                                    __dynamic_array(u8, bct, sizeof(*bc)) /* room for one whole *bc */
-                                    ),
-                   TP_fast_assign(DD_DEV_ASSIGN(dd);
-                                  memcpy(__get_dynamic_array(bct), bc, /* stored as-is; BCT() converts */
-                                         sizeof(*bc));                 /* each member at print time */
-                                  ),
-                   TP_printk(BCT_FORMAT, /* arguments below are consumed in BCT_FORMAT order */
-                             BCT(overall_shared_limit),
-
-                             BCT(vl[0].dedicated),
-                             BCT(vl[0].shared),
-
-                             BCT(vl[1].dedicated),
-                             BCT(vl[1].shared),
-
-                             BCT(vl[2].dedicated),
-                             BCT(vl[2].shared),
-
-                             BCT(vl[3].dedicated),
-                             BCT(vl[3].shared),
-
-                             BCT(vl[4].dedicated),
-                             BCT(vl[4].shared),
-
-                             BCT(vl[5].dedicated),
-                             BCT(vl[5].shared),
-
-                             BCT(vl[6].dedicated),
-                             BCT(vl[6].shared),
-
-                             BCT(vl[7].dedicated),
-                             BCT(vl[7].shared),
-
-                             BCT(vl[15].dedicated), /* VLs 8-14 are not printed */
-                             BCT(vl[15].shared)
-                             )
-);
-
-DEFINE_EVENT(hfi1_bct_template, bct_set, /* same record layout, event name bct_set */
-            TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
-            TP_ARGS(dd, bc));
-
-DEFINE_EVENT(hfi1_bct_template, bct_get, /* same record layout, event name bct_get */
-            TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
-            TP_ARGS(dd, bc));
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_sdma
-
-TRACE_EVENT(hfi1_sdma_descriptor, /* both raw descriptor qwords, decoded only when printed */
-           TP_PROTO(struct sdma_engine *sde,
-                    u64 desc0,
-                    u64 desc1,
-                    u16 e,
-                    void *descp),
-       TP_ARGS(sde, desc0, desc1, e, descp),
-       TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
-                        __field(void *, descp)
-                        __field(u64, desc0)
-                        __field(u64, desc1)
-                        __field(u16, e)
-                        __field(u8, idx) /* filled from sde->this_idx */
-                        ),
-       TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
-                      __entry->desc0 = desc0;
-                      __entry->desc1 = desc1;
-                      __entry->idx = sde->this_idx;
-                      __entry->descp = descp;
-                      __entry->e = e;
-                      ),
-       TP_printk(
-                 "[%s] SDE(%u) flags:%s addr:0x%016llx gen:%u len:%u d0:%016llx d1:%016llx to %p,%u",
-                 __get_str(dev),
-                 __entry->idx,
-                 __parse_sdma_flags(__entry->desc0, __entry->desc1),
-                 (__entry->desc0 >> SDMA_DESC0_PHY_ADDR_SHIFT) & /* addr: field of desc0 */
-                 SDMA_DESC0_PHY_ADDR_MASK,
-                 (u8)((__entry->desc1 >> SDMA_DESC1_GENERATION_SHIFT) & /* gen: field of desc1 */
-                      SDMA_DESC1_GENERATION_MASK),
-                 (u16)((__entry->desc0 >> SDMA_DESC0_BYTE_COUNT_SHIFT) & /* len: field of desc0 */
-                       SDMA_DESC0_BYTE_COUNT_MASK),
-                 __entry->desc0,
-                 __entry->desc1,
-                 __entry->descp, /* descp and e fill the trailing "to %p,%u" */
-                 __entry->e
-                 )
-);
-
-TRACE_EVENT(hfi1_sdma_engine_select, /* records which SDE index was picked for a (sel, vl) pair */
-           TP_PROTO(struct hfi1_devdata *dd, u32 sel, u8 vl, u8 idx),
-           TP_ARGS(dd, sel, vl, idx),
-           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
-                            __field(u32, sel)
-                            __field(u8, vl)
-                            __field(u8, idx)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(dd);
-                          __entry->sel = sel;
-                          __entry->vl = vl;
-                          __entry->idx = idx;
-                          ),
-           TP_printk("[%s] selecting SDE %u sel 0x%x vl %u",
-                     __get_str(dev),
-                     __entry->idx, /* printed first, as the SDE number */
-                     __entry->sel,
-                     __entry->vl
-                     )
-);
-
-DECLARE_EVENT_CLASS(hfi1_sdma_engine_class, /* engine index plus a 64-bit status word */
-                   TP_PROTO(struct sdma_engine *sde, u64 status),
-                   TP_ARGS(sde, status),
-                   TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
-                                    __field(u64, status)
-                                    __field(u8, idx)
-                                    ),
-                   TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
-                                  __entry->status = status;
-                                  __entry->idx = sde->this_idx;
-                                  ),
-                   TP_printk("[%s] SDE(%u) status %llx",
-                             __get_str(dev),
-                             __entry->idx,
-                             (unsigned long long)__entry->status /* cast to match %llx */
-                             )
-);
-
-DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_interrupt, /* same layout, separate event name */
-            TP_PROTO(struct sdma_engine *sde, u64 status),
-            TP_ARGS(sde, status)
-);
-
-DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_progress, /* same layout, separate event name */
-            TP_PROTO(struct sdma_engine *sde, u64 status),
-            TP_ARGS(sde, status)
-);
-
-DECLARE_EVENT_CLASS(hfi1_sdma_ahg_ad, /* engine index plus an AHG index, for allocate/deallocate */
-                   TP_PROTO(struct sdma_engine *sde, int aidx),
-                   TP_ARGS(sde, aidx),
-                   TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
-                                    __field(int, aidx)
-                                    __field(u8, idx)
-                                    ),
-                   TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
-                                  __entry->idx = sde->this_idx;
-                                  __entry->aidx = aidx;
-                                  ),
-                   TP_printk("[%s] SDE(%u) aidx %d", /* aidx is signed and printed with %d */
-                             __get_str(dev),
-                             __entry->idx,
-                             __entry->aidx
-                             )
-);
-
-DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_allocate, /* same layout, event name hfi1_ahg_allocate */
-            TP_PROTO(struct sdma_engine *sde, int aidx),
-            TP_ARGS(sde, aidx));
-
-DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_deallocate, /* same layout, event name hfi1_ahg_deallocate */
-            TP_PROTO(struct sdma_engine *sde, int aidx),
-            TP_ARGS(sde, aidx));
-
-#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER /* this variant additionally records txp->sn */
-TRACE_EVENT(hfi1_sdma_progress,
-           TP_PROTO(struct sdma_engine *sde,
-                    u16 hwhead,
-                    u16 swhead,
-                    struct sdma_txreq *txp
-                    ),
-           TP_ARGS(sde, hwhead, swhead, txp),
-           TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
-                            __field(u64, sn)
-                            __field(u16, hwhead)
-                            __field(u16, swhead)
-                            __field(u16, txnext)
-                            __field(u16, tx_tail)
-                            __field(u16, tx_head)
-                            __field(u8, idx)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
-                          __entry->hwhead = hwhead;
-                          __entry->swhead = swhead;
-                          __entry->tx_tail = sde->tx_tail;
-                          __entry->tx_head = sde->tx_head;
-                          __entry->txnext = txp ? txp->next_descq_idx : ~0; /* ~0 marks a NULL txp */
-                          __entry->idx = sde->this_idx;
-                          __entry->sn = txp ? txp->sn : ~0; /* ~0 marks a NULL txp */
-                          ),
-           TP_printk(
-                     "[%s] SDE(%u) sn %llu hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u",
-                     __get_str(dev),
-                     __entry->idx,
-                     __entry->sn,
-                     __entry->hwhead,
-                     __entry->swhead,
-                     __entry->txnext,
-                     __entry->tx_head,
-                     __entry->tx_tail
-                     )
-);
-#else /* same event without the sn field */
-TRACE_EVENT(hfi1_sdma_progress,
-           TP_PROTO(struct sdma_engine *sde,
-                    u16 hwhead, u16 swhead,
-                    struct sdma_txreq *txp
-           ),
-       TP_ARGS(sde, hwhead, swhead, txp),
-       TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
-                        __field(u16, hwhead)
-                        __field(u16, swhead)
-                        __field(u16, txnext)
-                        __field(u16, tx_tail)
-                        __field(u16, tx_head)
-                        __field(u8, idx)
-                        ),
-       TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
-                      __entry->hwhead = hwhead;
-                      __entry->swhead = swhead;
-                      __entry->tx_tail = sde->tx_tail;
-                      __entry->tx_head = sde->tx_head;
-                      __entry->txnext = txp ? txp->next_descq_idx : ~0; /* ~0 marks a NULL txp */
-                      __entry->idx = sde->this_idx;
-                      ),
-       TP_printk(
-                 "[%s] SDE(%u) hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u",
-                 __get_str(dev),
-                 __entry->idx,
-                 __entry->hwhead,
-                 __entry->swhead,
-                 __entry->txnext,
-                 __entry->tx_head,
-                 __entry->tx_tail
-                 )
-);
-#endif
-
-DECLARE_EVENT_CLASS(hfi1_sdma_sn, /* engine index plus a 64-bit sequence number */
-                   TP_PROTO(struct sdma_engine *sde, u64 sn),
-                   TP_ARGS(sde, sn),
-                   TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
-                                    __field(u64, sn)
-                                    __field(u8, idx)
-                                    ),
-                   TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
-                                  __entry->sn = sn;
-                                  __entry->idx = sde->this_idx;
-                                  ),
-                   TP_printk("[%s] SDE(%u) sn %llu",
-                             __get_str(dev),
-                             __entry->idx,
-                             __entry->sn
-                             )
-);
-
-DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_out_sn, /* same layout, event name hfi1_sdma_out_sn */
-            TP_PROTO(
-               struct sdma_engine *sde,
-               u64 sn
-            ),
-            TP_ARGS(sde, sn)
-);
-
-DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_in_sn, /* same layout, event name hfi1_sdma_in_sn */
-            TP_PROTO(struct sdma_engine *sde, u64 sn),
-            TP_ARGS(sde, sn)
-);
-
-#define USDMA_HDR_FORMAT /* 4 prefix items, then 2 PBC, 2 LRH, 3 BTH, 9 KDETH words and tidval */ \
-       "[%s:%u:%u:%u] PBC=(0x%x 0x%x) LRH=(0x%x 0x%x) BTH=(0x%x 0x%x 0x%x) KDETH=(0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x) TIDVal=0x%x"
-
-TRACE_EVENT(hfi1_sdma_user_header, /* dumps a user SDMA packet header as 32-bit words */
-           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req,
-                    struct hfi1_pkt_header *hdr, u32 tidval),
-           TP_ARGS(dd, ctxt, subctxt, req, hdr, tidval),
-           TP_STRUCT__entry(
-                   DD_DEV_ENTRY(dd)
-                   __field(u16, ctxt)
-                   __field(u8, subctxt)
-                   __field(u16, req)
-                   __field(__le32, pbc0)
-                   __field(__le32, pbc1)
-                   __field(__be32, lrh0) /* NOTE(review): lrh and bth fields are declared __be32 but */
-                   __field(__be32, lrh1) /* are assigned be32_to_cpu() results below, i.e. they hold */
-                   __field(__be32, bth0) /* CPU-order values; confirm the intended field type */
-                   __field(__be32, bth1)
-                   __field(__be32, bth2)
-                   __field(__le32, kdeth0)
-                   __field(__le32, kdeth1)
-                   __field(__le32, kdeth2)
-                   __field(__le32, kdeth3)
-                   __field(__le32, kdeth4)
-                   __field(__le32, kdeth5)
-                   __field(__le32, kdeth6)
-                   __field(__le32, kdeth7)
-                   __field(__le32, kdeth8)
-                   __field(u32, tidval)
-                   ),
-           TP_fast_assign(
-                   __le32 *pbc = (__le32 *)hdr->pbc; /* each header part is viewed as 32-bit words */
-                   __be32 *lrh = (__be32 *)hdr->lrh;
-                   __be32 *bth = (__be32 *)hdr->bth;
-                   __le32 *kdeth = (__le32 *)&hdr->kdeth;
-
-                   DD_DEV_ASSIGN(dd);
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __entry->req = req;
-                   __entry->pbc0 = pbc[0]; /* PBC and KDETH words are stored as read, no byte swap */
-                   __entry->pbc1 = pbc[1];
-                   __entry->lrh0 = be32_to_cpu(lrh[0]);
-                   __entry->lrh1 = be32_to_cpu(lrh[1]);
-                   __entry->bth0 = be32_to_cpu(bth[0]);
-                   __entry->bth1 = be32_to_cpu(bth[1]);
-                   __entry->bth2 = be32_to_cpu(bth[2]);
-                   __entry->kdeth0 = kdeth[0];
-                   __entry->kdeth1 = kdeth[1];
-                   __entry->kdeth2 = kdeth[2];
-                   __entry->kdeth3 = kdeth[3];
-                   __entry->kdeth4 = kdeth[4];
-                   __entry->kdeth5 = kdeth[5];
-                   __entry->kdeth6 = kdeth[6];
-                   __entry->kdeth7 = kdeth[7];
-                   __entry->kdeth8 = kdeth[8]; /* nine words are read starting at &hdr->kdeth */
-                   __entry->tidval = tidval;
-                   ),
-           TP_printk(USDMA_HDR_FORMAT,
-                     __get_str(dev),
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __entry->req,
-                     __entry->pbc1, /* pbc1 is printed before pbc0 */
-                     __entry->pbc0,
-                     __entry->lrh0,
-                     __entry->lrh1,
-                     __entry->bth0,
-                     __entry->bth1,
-                     __entry->bth2,
-                     __entry->kdeth0, /* NOTE(review): the __le32 words are printed without le32_to_cpu() */
-                     __entry->kdeth1,
-                     __entry->kdeth2,
-                     __entry->kdeth3,
-                     __entry->kdeth4,
-                     __entry->kdeth5,
-                     __entry->kdeth6,
-                     __entry->kdeth7,
-                     __entry->kdeth8,
-                     __entry->tidval
-                   )
-       );
-
-#define SDMA_UREQ_FMT /* device, ctxt, subctxt, then the five decoded request fields */ \
-       "[%s:%u:%u] ver/op=0x%x, iovcnt=%u, npkts=%u, frag=%u, idx=%u"
-TRACE_EVENT(hfi1_sdma_user_reqinfo, /* decodes four 16-bit words pointed to by i */
-           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 *i),
-           TP_ARGS(dd, ctxt, subctxt, i),
-           TP_STRUCT__entry(
-                   DD_DEV_ENTRY(dd); /* NOTE(review): the other events in this file omit this ';' */
-                   __field(u16, ctxt)
-                   __field(u8, subctxt)
-                   __field(u8, ver_opcode)
-                   __field(u8, iovcnt)
-                   __field(u16, npkts)
-                   __field(u16, fragsize)
-                   __field(u16, comp_idx)
-                   ),
-           TP_fast_assign(
-                   DD_DEV_ASSIGN(dd);
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __entry->ver_opcode = i[0] & 0xff; /* low byte of word 0 */
-                   __entry->iovcnt = (i[0] >> 8) & 0xff; /* high byte of word 0 */
-                   __entry->npkts = i[1];
-                   __entry->fragsize = i[2];
-                   __entry->comp_idx = i[3];
-                   ),
-           TP_printk(SDMA_UREQ_FMT,
-                     __get_str(dev),
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __entry->ver_opcode,
-                     __entry->iovcnt,
-                     __entry->npkts,
-                     __entry->fragsize,
-                     __entry->comp_idx
-                   )
-       );
-
-#define usdma_complete_name(st) { st, #st } /* value paired with its own spelling */
-#define show_usdma_complete_state(st)                  \
-       __print_symbolic(st,                            \
-                        usdma_complete_name(FREE),     \
-                        usdma_complete_name(QUEUED),   \
-                        usdma_complete_name(COMPLETE), \
-                        usdma_complete_name(ERROR))
-
-TRACE_EVENT(hfi1_sdma_user_completion, /* completion state (shown by name) and code for one idx */
-           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 idx,
-                    u8 state, int code),
-           TP_ARGS(dd, ctxt, subctxt, idx, state, code),
-           TP_STRUCT__entry(
-                   DD_DEV_ENTRY(dd)
-                   __field(u16, ctxt)
-                   __field(u8, subctxt)
-                   __field(u16, idx)
-                   __field(u8, state)
-                   __field(int, code)
-                   ),
-           TP_fast_assign(
-                   DD_DEV_ASSIGN(dd);
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __entry->idx = idx;
-                   __entry->state = state;
-                   __entry->code = code;
-                   ),
-           TP_printk("[%s:%u:%u:%u] SDMA completion state %s (%d)",
-                     __get_str(dev), __entry->ctxt, __entry->subctxt,
-                     __entry->idx, show_usdma_complete_state(__entry->state), /* one of the four names above */
-                     __entry->code)
-       );
-
-const char *print_u32_array(struct trace_seq *, u32 *, int); /* declared here, defined elsewhere */
-#define __print_u32_hex(arr, len) print_u32_array(p, arr, len) /* needs a 'p' in scope where it expands */
-
-TRACE_EVENT(hfi1_sdma_user_header_ahg, /* records up to 10 AHG words for one user request */
-           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req,
-                    u8 sde, u8 ahgidx, u32 *ahg, int len, u32 tidval),
-           TP_ARGS(dd, ctxt, subctxt, req, sde, ahgidx, ahg, len, tidval),
-           TP_STRUCT__entry(
-                   DD_DEV_ENTRY(dd)
-                   __field(u16, ctxt)
-                   __field(u8, subctxt)
-                   __field(u16, req)
-                   __field(u8, sde)
-                   __field(u8, idx)
-                   __field(int, len)
-                   __field(u32, tidval)
-                   __array(u32, ahg, 10)
-                   ),
-           TP_fast_assign(
-                   DD_DEV_ASSIGN(dd);
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __entry->req = req;
-                   __entry->sde = sde;
-                   __entry->idx = ahgidx;
-                   __entry->len = len;
-                   __entry->tidval = tidval;
-                   memcpy(__entry->ahg, ahg, len * sizeof(u32)); /* NOTE(review): len is not bounded to the 10-entry ahg[] here */
-                   ),
-           TP_printk("[%s:%u:%u:%u] (SDE%u/AHG%u) ahg[0-%d]=(%s) TIDVal=0x%x",
-                     __get_str(dev),
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __entry->req,
-                     __entry->sde,
-                     __entry->idx,
-                     __entry->len - 1, /* highest index shown in "ahg[0-%d]" */
-                     __print_u32_hex(__entry->ahg, __entry->len),
-                     __entry->tidval
-                   )
-       );
-
-TRACE_EVENT(hfi1_sdma_state, /* copies the current and new state names into the record */
-           TP_PROTO(struct sdma_engine *sde,
-                    const char *cstate,
-                    const char *nstate
-                    ),
-           TP_ARGS(sde, cstate, nstate),
-           TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
-                            __string(curstate, cstate)
-                            __string(newstate, nstate)
-                            ),
-       TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
-                      __assign_str(curstate, cstate);
-                      __assign_str(newstate, nstate);
-                      ),
-       TP_printk("[%s] current state %s new state %s", /* the engine index is not part of this event */
-                 __get_str(dev),
-                 __get_str(curstate),
-                 __get_str(newstate)
-                 )
-);
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_rc
-
-DECLARE_EVENT_CLASS(hfi1_rc_template, /* PSN counters of a QP plus the psn passed by the caller */
-                   TP_PROTO(struct rvt_qp *qp, u32 psn),
-                   TP_ARGS(qp, psn),
-                   TP_STRUCT__entry(
-                       DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
-                       __field(u32, qpn)
-                       __field(u32, s_flags)
-                       __field(u32, psn) /* the psn argument; every other field is read from qp */
-                       __field(u32, s_psn)
-                       __field(u32, s_next_psn)
-                       __field(u32, s_sending_psn)
-                       __field(u32, s_sending_hpsn)
-                       __field(u32, r_psn)
-                       ),
-                   TP_fast_assign(
-                       DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) /* no ';' here, unlike most events in this file */
-                       __entry->qpn = qp->ibqp.qp_num;
-                       __entry->s_flags = qp->s_flags;
-                       __entry->psn = psn;
-                       __entry->s_psn = qp->s_psn;
-                       __entry->s_next_psn = qp->s_next_psn;
-                       __entry->s_sending_psn = qp->s_sending_psn;
-                       __entry->s_sending_hpsn = qp->s_sending_hpsn;
-                       __entry->r_psn = qp->r_psn;
-                       ),
-                   TP_printk(
-                       "[%s] qpn 0x%x s_flags 0x%x psn 0x%x s_psn 0x%x s_next_psn 0x%x s_sending_psn 0x%x sending_hpsn 0x%x r_psn 0x%x",
-                       __get_str(dev),
-                       __entry->qpn,
-                       __entry->s_flags,
-                       __entry->psn,
-                       __entry->s_psn,
-                       __entry->s_next_psn,
-                       __entry->s_sending_psn,
-                       __entry->s_sending_hpsn, /* labelled "sending_hpsn" in the output */
-                       __entry->r_psn
-                       )
-);
-
-DEFINE_EVENT(hfi1_rc_template, hfi1_rc_sendcomplete, /* same layout, separate event name */
-            TP_PROTO(struct rvt_qp *qp, u32 psn),
-            TP_ARGS(qp, psn)
-);
-
-DEFINE_EVENT(hfi1_rc_template, hfi1_rc_ack, /* same layout, separate event name */
-            TP_PROTO(struct rvt_qp *qp, u32 psn),
-            TP_ARGS(qp, psn)
-);
-
-DEFINE_EVENT(hfi1_rc_template, hfi1_rc_timeout, /* same layout, separate event name */
-            TP_PROTO(struct rvt_qp *qp, u32 psn),
-            TP_ARGS(qp, psn)
-);
-
-DEFINE_EVENT(hfi1_rc_template, hfi1_rc_rcv_error, /* same layout, separate event name */
-            TP_PROTO(struct rvt_qp *qp, u32 psn),
-            TP_ARGS(qp, psn)
-);
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_misc
-
-TRACE_EVENT(hfi1_interrupt, /* interrupt source number plus a name produced by the table entry */
-           TP_PROTO(struct hfi1_devdata *dd, const struct is_table *is_entry,
-                    int src),
-           TP_ARGS(dd, is_entry, src),
-           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
-                            __array(char, buf, 64)
-                            __field(int, src)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(dd) /* no ';' here, unlike most events in this file */
-                          is_entry->is_name(__entry->buf, 64, /* callback fills buf; 64 matches the array size */
-                                            src - is_entry->start); /* source number relative to the entry's start */
-                          __entry->src = src;
-                          ),
-           TP_printk("[%s] source: %s [%d]", __get_str(dev), __entry->buf,
-                     __entry->src)
-);
-
-/*
- * Note:
- * This produces a REALLY ugly trace in the console output when the string is
- * too long.
- */
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_trace
-
-#define MAX_MSG_LEN 512 /* size of the per-event msg array, in chars */
-
-DECLARE_EVENT_CLASS(hfi1_trace_template, /* free-form message tagged with a function name */
-                   TP_PROTO(const char *function, struct va_format *vaf),
-                   TP_ARGS(function, vaf),
-                   TP_STRUCT__entry(__string(function, function)
-                                    __dynamic_array(char, msg, MAX_MSG_LEN)
-                                    ),
-                   TP_fast_assign(__assign_str(function, function);
-                                  WARN_ON_ONCE(vsnprintf /* formats vaf straight into the record */
-                                               (__get_dynamic_array(msg),
-                                                MAX_MSG_LEN, vaf->fmt,
-                                                *vaf->va) >=
-                                               MAX_MSG_LEN); /* warns once if the text did not fit */
-                                  ),
-                   TP_printk("(%s) %s",
-                             __get_str(function),
-                             __get_str(msg))
-);
-
-/*
- * Declares __hfi1_trace_<lvl>() and defines the hfi1_<lvl> event; the va_*
- * handling needs an actual function (see __hfi1_trace_fn), not a macro.
- */
-#define __hfi1_trace_def(lvl) \
-void __hfi1_trace_##lvl(const char *funct, char *fmt, ...);            \
-                                                                       \
-DEFINE_EVENT(hfi1_trace_template, hfi1_ ##lvl,                         \
-       TP_PROTO(const char *function, struct va_format *vaf),          \
-       TP_ARGS(function, vaf))
-
-#define __hfi1_trace_fn(lvl) /* body of __hfi1_trace_<lvl>(): wraps the varargs in a va_format */ \
-void __hfi1_trace_##lvl(const char *func, char *fmt, ...)              \
-{                                                                      \
-       struct va_format vaf = {                                        \
-               .fmt = fmt,                                             \
-       };                                                              \
-       va_list args;                                                   \
-                                                                       \
-       va_start(args, fmt);                                            \
-       vaf.va = &args;                                                 \
-       trace_hfi1_ ##lvl(func, &vaf);                                  \
-       va_end(args);                                                   \
-       return;                                                         \
-}
-
-/*
- * To create a new trace level, add a __hfi1_trace_def(LVL) line below and a
- * matching __hfi1_trace_fn(LVL) in trace.c. This will create all the hooks
- * for calling hfi1_cdbg(LVL, fmt, ...); as well as take care of all
- * the debugfs stuff.
- */
-__hfi1_trace_def(PKT);
-__hfi1_trace_def(PROC);
-__hfi1_trace_def(SDMA);
-__hfi1_trace_def(LINKVERB);
-__hfi1_trace_def(DEBUG);
-__hfi1_trace_def(SNOOP);
-__hfi1_trace_def(CNTR);
-__hfi1_trace_def(PIO);
-__hfi1_trace_def(DC8051);
-__hfi1_trace_def(FIRMWARE);
-__hfi1_trace_def(RCVCTRL);
-__hfi1_trace_def(TID);
-__hfi1_trace_def(MMU);
-
-#define hfi1_cdbg(which, fmt, ...) /* passes the caller's __func__ as the first argument */ \
-       __hfi1_trace_##which(__func__, fmt, ##__VA_ARGS__)
-
-#define hfi1_dbg(fmt, ...) /* hfi1_cdbg() fixed to the DEBUG level */ \
-       hfi1_cdbg(DEBUG, fmt, ##__VA_ARGS__)
-
-/*
- * Define HFI1_EARLY_DBG at compile time or here to enable early trace
- * messages. Do not commit a change that enables it.
- */
-
-#ifdef HFI1_EARLY_DBG
-#define hfi1_dbg_early(fmt, ...) \
-       trace_printk(fmt, ##__VA_ARGS__)
-#else
-#define hfi1_dbg_early(fmt, ...) /* expands to nothing when HFI1_EARLY_DBG is undefined */
-#endif
-
-#endif /* __HFI1_TRACE_H */
-
-#undef TRACE_INCLUDE_PATH
-#undef TRACE_INCLUDE_FILE
-#define TRACE_INCLUDE_PATH .
-#define TRACE_INCLUDE_FILE trace
-#include <trace/define_trace.h>
diff --git a/drivers/staging/rdma/hfi1/twsi.c b/drivers/staging/rdma/hfi1/twsi.c
deleted file mode 100644 (file)
index e82e52a..0000000
+++ /dev/null
@@ -1,489 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/delay.h>
-#include <linux/pci.h>
-#include <linux/vmalloc.h>
-
-#include "hfi.h"
-#include "twsi.h"
-
-/*
- * "Two Wire Serial Interface" support.
- *
- * Originally written for a not-quite-i2c serial eeprom, which is
- * still used on some supported boards. Later boards have added a
- * variety of other uses, most board-specific, so the bit-boffing
- * part has been split off to this file, while the other parts
- * have been moved to chip-specific files.
- *
- * We have also dropped all pretense of fully generic (e.g. pretend
- * we don't know whether '1' is the higher voltage) interface, as
- * the restrictions of the generic i2c interface (e.g. no access from
- * driver itself) make it unsuitable for this use.
- */
-
-#define READ_CMD 1
-#define WRITE_CMD 0
-
-/**
- * i2c_wait_for_writes - wait for a write
- * @dd: the hfi1_ib device
- *
- * We use this instead of udelay directly, so we can make sure
- * that previous register writes have been flushed all the way
- * to the chip.  Since we are delaying anyway, the cost doesn't
- * hurt, and makes the bit twiddling more regular
- */
-static void i2c_wait_for_writes(struct hfi1_devdata *dd, u32 target) /* target: forwarded to hfi1_gpio_mod() */
-{
-       /*
-        * implicit read of EXTStatus is as good as explicit
-        * read of scratch, if all we want to do is flush
-        * writes.
-        */
-       hfi1_gpio_mod(dd, target, 0, 0, 0); /* last three arguments all zero; return value unused */
-       rmb(); /* inlined, so prevent compiler reordering */
-}
-
-/*
- * QSFP modules are allowed to hold SCL low for 500uSec. Allow twice that
- * for "almost compliant" modules
- */
-#define SCL_WAIT_USEC 1000
-
-/* BUF_WAIT is time bus must be free between STOP or ACK and to next START.
- * Should be 20, but some chips need more.
- */
-#define TWSI_BUF_WAIT_USEC 60
-
-static void scl_out(struct hfi1_devdata *dd, u32 target, u8 bit) /* bit: requested SCL level */
-{
-       u32 mask;
-
-       udelay(1);
-
-       mask = QSFP_HFI0_I2CCLK;
-
-       /* SCL is meant to be bare-drain, so never set "OUT", just DIR */
-       hfi1_gpio_mod(dd, target, 0, bit ? 0 : mask, mask);
-
-       /*
-        * Allow for slow slaves by simple
-        * delay for falling edge, sampling on rise.
-        */
-       if (!bit) {
-               udelay(2);
-       } else {
-               int rise_usec;
-
-               for (rise_usec = SCL_WAIT_USEC; rise_usec > 0; rise_usec -= 2) { /* budget drops by 2 per poll */
-                       if (mask & hfi1_gpio_mod(dd, target, 0, 0, 0)) /* SCL bit reads back set */
-                               break;
-                       udelay(2);
-               }
-               if (rise_usec <= 0) /* budget used up without seeing the bit set */
-                       dd_dev_err(dd, "SCL interface stuck low > %d uSec\n",
-                                  SCL_WAIT_USEC);
-       }
-       i2c_wait_for_writes(dd, target);
-}
-
-static u8 scl_in(struct hfi1_devdata *dd, u32 target, int wait) /* wait: non-zero adds a write flush */
-{
-       u32 read_val, mask;
-
-       mask = QSFP_HFI0_I2CCLK;
-       /* SCL is meant to be bare-drain, so never set "OUT", just DIR */
-       hfi1_gpio_mod(dd, target, 0, 0, mask);
-       read_val = hfi1_gpio_mod(dd, target, 0, 0, 0); /* second call only fetches the value */
-       if (wait)
-               i2c_wait_for_writes(dd, target);
-       return (read_val & mask) >> GPIO_SCL_NUM; /* masked SCL bit shifted down by GPIO_SCL_NUM */
-}
-
-static void sda_out(struct hfi1_devdata *dd, u32 target, u8 bit) /* bit: requested SDA level */
-{
-       u32 mask;
-
-       mask = QSFP_HFI0_I2CDAT;
-
-       /* SDA is meant to be bare-drain, so never set "OUT", just DIR */
-       hfi1_gpio_mod(dd, target, 0, bit ? 0 : mask, mask);
-
-       i2c_wait_for_writes(dd, target);
-       udelay(2); /* fixed delay after every change; unlike scl_out() nothing is read back */
-}
-
-static u8 sda_in(struct hfi1_devdata *dd, u32 target, int wait) /* wait: non-zero adds a write flush */
-{
-       u32 read_val, mask;
-
-       mask = QSFP_HFI0_I2CDAT;
-       /* SDA is meant to be bare-drain, so never set "OUT", just DIR */
-       hfi1_gpio_mod(dd, target, 0, 0, mask);
-       read_val = hfi1_gpio_mod(dd, target, 0, 0, 0); /* second call only fetches the value */
-       if (wait)
-               i2c_wait_for_writes(dd, target);
-       return (read_val & mask) >> GPIO_SDA_NUM; /* masked SDA bit shifted down by GPIO_SDA_NUM */
-}
-
-/**
- * i2c_ackrcv - see if ack following write is true
- * @dd: the hfi1_ib device
- */
-static int i2c_ackrcv(struct hfi1_devdata *dd, u32 target) /* target: forwarded to sda_in()/scl_out() */
-{
-       u8 ack_received;
-
-       /* AT ENTRY SCL = LOW */
-       /* change direction, ignore data */
-       ack_received = sda_in(dd, target, 1);
-       scl_out(dd, target, 1);
-       ack_received = sda_in(dd, target, 1) == 0; /* SDA read as 0 while SCL is high counts as an ACK */
-       scl_out(dd, target, 0);
-       return ack_received; /* 1 when the ACK was seen, 0 otherwise */
-}
-
-static void stop_cmd(struct hfi1_devdata *dd, u32 target);
-
-/**
- * rd_byte - read a byte, sending STOP on last, else ACK
- * @dd: the hfi1_ib device
- *
- * Returns byte shifted out of device
- */
-static int rd_byte(struct hfi1_devdata *dd, u32 target, int last) /* last: non-zero for the final byte */
-{
-       int bit_cntr, data;
-
-       data = 0;
-
-       for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) { /* eight clock pulses, one bit per pulse */
-               data <<= 1; /* earlier bits move up, so the first bit sampled ends in bit 7 */
-               scl_out(dd, target, 1);
-               data |= sda_in(dd, target, 0); /* sampled while SCL is high */
-               scl_out(dd, target, 0);
-       }
-       if (last) { /* final byte: SCL high, then stop_cmd(); SDA is not driven low */
-               scl_out(dd, target, 1);
-               stop_cmd(dd, target);
-       } else { /* otherwise ACK: SDA low across one clock pulse, then released */
-               sda_out(dd, target, 0);
-               scl_out(dd, target, 1);
-               scl_out(dd, target, 0);
-               sda_out(dd, target, 1);
-       }
-       return data;
-}
-
-/**
- * wr_byte - write a byte, one bit at a time
- * @dd: the hfi1_ib device
- * @data: the byte to write
- *
- * Returns 0 if we got the following ack, otherwise 1
- */
-static int wr_byte(struct hfi1_devdata *dd, u32 target, u8 data)
-{
-       int bit_cntr;
-       u8 bit;
-
-       for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) {
-               bit = (data >> bit_cntr) & 1;
-               sda_out(dd, target, bit);
-               scl_out(dd, target, 1);
-               scl_out(dd, target, 0);
-       }
-       return (!i2c_ackrcv(dd, target)) ? 1 : 0;
-}
-
-/*
- * issue TWSI start sequence:
- * (both clock/data high, clock high, data low while clock is high)
- */
-static void start_seq(struct hfi1_devdata *dd, u32 target)
-{
-       sda_out(dd, target, 1);
-       scl_out(dd, target, 1);
-       sda_out(dd, target, 0);
-       udelay(1);
-       scl_out(dd, target, 0);
-}
-
-/**
- * stop_seq - transmit the stop sequence
- * @dd: the hfi1_ib device
- *
- * (both clock/data low, clock high, data high while clock is high)
- */
-static void stop_seq(struct hfi1_devdata *dd, u32 target)
-{
-       scl_out(dd, target, 0);
-       sda_out(dd, target, 0);
-       scl_out(dd, target, 1);
-       sda_out(dd, target, 1);
-}
-
-/**
- * stop_cmd - transmit the stop condition
- * @dd: the hfi1_ib device
- *
- * (both clock/data low, clock high, data high while clock is high)
- */
-static void stop_cmd(struct hfi1_devdata *dd, u32 target)
-{
-       stop_seq(dd, target);
-       udelay(TWSI_BUF_WAIT_USEC);
-}
-
-/**
- * hfi1_twsi_reset - reset I2C communication
- * @dd: the hfi1_ib device
- * returns 0 if ok, -EIO on error
- */
-int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target)
-{
-       int clock_cycles_left = 9;
-       u32 mask;
-
-       /* Both SCL and SDA should be high. If not, there
-        * is something wrong.
-        */
-       mask = QSFP_HFI0_I2CCLK | QSFP_HFI0_I2CDAT;
-
-       /*
-        * Force pins to desired innocuous state.
-        * This is the default power-on state with out=0 and dir=0,
-        * So tri-stated and should be floating high (barring HW problems)
-        */
-       hfi1_gpio_mod(dd, target, 0, 0, mask);
-
-       /* Check if SCL is low, if it is low then we have a slave device
-        * misbehaving and there is not much we can do.
-        */
-       if (!scl_in(dd, target, 0))
-               return -EIO;
-
-       /* Check if SDA is low, if it is low then we have to clock SDA
-        * up to 9 times for the device to release the bus
-        */
-       while (clock_cycles_left--) {
-               if (sda_in(dd, target, 0))
-                       return 0;
-               scl_out(dd, target, 0);
-               scl_out(dd, target, 1);
-       }
-
-       return -EIO;
-}
-
-#define HFI1_TWSI_START 0x100
-#define HFI1_TWSI_STOP 0x200
-
-/* Write byte to TWSI, optionally prefixed with START or suffixed with
- * STOP.
- * returns 0 if OK (ACK received), else != 0
- */
-static int twsi_wr(struct hfi1_devdata *dd, u32 target, int data, int flags)
-{
-       int ret = 1;
-
-       if (flags & HFI1_TWSI_START)
-               start_seq(dd, target);
-
-       /* Leaves SCL low (from i2c_ackrcv()) */
-       ret = wr_byte(dd, target, data);
-
-       if (flags & HFI1_TWSI_STOP)
-               stop_cmd(dd, target);
-       return ret;
-}
-
-/* Added functionality for IBA7220-based cards */
-#define HFI1_TEMP_DEV 0x98
-
-/*
- * hfi1_twsi_blk_rd
- * General interface for data transfer from twsi devices.
- * One vestige of its former role is that it recognizes a device
- * HFI1_TWSI_NO_DEV and does the correct operation for the legacy part,
- * which responded to all TWSI device codes, interpreting them as
- * address within device. On all other devices found on board handled by
- * this driver, the device is followed by a N-byte "address" which selects
- * the "register" or "offset" within the device from which data should
- * be read.
- */
-int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr,
-                    void *buffer, int len)
-{
-       u8 *bp = buffer;
-       int ret = 1;
-       int i;
-       int offset_size;
-
-       /* obtain the offset size, strip it from the device address */
-       offset_size = (dev >> 8) & 0xff;
-       dev &= 0xff;
-
-       /* allow at most a 2 byte offset */
-       if (offset_size > 2)
-               goto bail;
-
-       if (dev == HFI1_TWSI_NO_DEV) {
-               /* legacy not-really-I2C */
-               addr = (addr << 1) | READ_CMD;
-               ret = twsi_wr(dd, target, addr, HFI1_TWSI_START);
-       } else {
-               /* Actual I2C */
-               if (offset_size) {
-                       ret = twsi_wr(dd, target,
-                                     dev | WRITE_CMD, HFI1_TWSI_START);
-                       if (ret) {
-                               stop_cmd(dd, target);
-                               goto bail;
-                       }
-
-                       for (i = 0; i < offset_size; i++) {
-                               ret = twsi_wr(dd, target,
-                                             (addr >> (i * 8)) & 0xff, 0);
-                               udelay(TWSI_BUF_WAIT_USEC);
-                               if (ret) {
-                                       dd_dev_err(dd, "Failed to write byte %d of offset 0x%04X\n",
-                                                  i, addr);
-                                       goto bail;
-                               }
-                       }
-               }
-               ret = twsi_wr(dd, target, dev | READ_CMD, HFI1_TWSI_START);
-       }
-       if (ret) {
-               stop_cmd(dd, target);
-               goto bail;
-       }
-
-       /*
-        * block devices keeps clocking data out as long as we ack,
-        * automatically incrementing the address. Some have "pages"
-        * whose boundaries will not be crossed, but the handling
-        * of these is left to the caller, who is in a better
-        * position to know.
-        */
-       while (len-- > 0) {
-               /*
-                * Get and store data, sending ACK if length remaining,
-                * else STOP
-                */
-               *bp++ = rd_byte(dd, target, !len);
-       }
-
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-/*
- * hfi1_twsi_blk_wr
- * General interface for data transfer to twsi devices.
- * One vestige of its former role is that it recognizes a device
- * HFI1_TWSI_NO_DEV and does the correct operation for the legacy part,
- * which responded to all TWSI device codes, interpreting them as
- * address within device. On all other devices found on board handled by
- * this driver, the device is followed by a N-byte "address" which selects
- * the "register" or "offset" within the device to which data should
- * be written.
- */
-int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr,
-                    const void *buffer, int len)
-{
-       const u8 *bp = buffer;
-       int ret = 1;
-       int i;
-       int offset_size;
-
-       /* obtain the offset size, strip it from the device address */
-       offset_size = (dev >> 8) & 0xff;
-       dev &= 0xff;
-
-       /* allow at most a 2 byte offset */
-       if (offset_size > 2)
-               goto bail;
-
-       if (dev == HFI1_TWSI_NO_DEV) {
-               if (twsi_wr(dd, target, (addr << 1) | WRITE_CMD,
-                           HFI1_TWSI_START)) {
-                       goto failed_write;
-               }
-       } else {
-               /* Real I2C */
-               if (twsi_wr(dd, target, dev | WRITE_CMD, HFI1_TWSI_START))
-                       goto failed_write;
-       }
-
-       for (i = 0; i < offset_size; i++) {
-               ret = twsi_wr(dd, target, (addr >> (i * 8)) & 0xff, 0);
-               udelay(TWSI_BUF_WAIT_USEC);
-               if (ret) {
-                       dd_dev_err(dd, "Failed to write byte %d of offset 0x%04X\n",
-                                  i, addr);
-                       goto bail;
-               }
-       }
-
-       for (i = 0; i < len; i++)
-               if (twsi_wr(dd, target, *bp++, 0))
-                       goto failed_write;
-
-       ret = 0;
-
-failed_write:
-       stop_cmd(dd, target);
-
-bail:
-       return ret;
-}
diff --git a/drivers/staging/rdma/hfi1/twsi.h b/drivers/staging/rdma/hfi1/twsi.h
deleted file mode 100644 (file)
index 5b8a5b5..0000000
+++ /dev/null
@@ -1,65 +0,0 @@
-#ifndef _TWSI_H
-#define _TWSI_H
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#define HFI1_TWSI_NO_DEV 0xFF
-
-struct hfi1_devdata;
-
-/* Bit position of SDA/SCL pins in ASIC_QSFP* registers  */
-#define  GPIO_SDA_NUM 1
-#define  GPIO_SCL_NUM 0
-
-/* these functions must be called with qsfp_lock held */
-int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target);
-int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr,
-                    void *buffer, int len);
-int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr,
-                    const void *buffer, int len);
-
-#endif /* _TWSI_H */
diff --git a/drivers/staging/rdma/hfi1/uc.c b/drivers/staging/rdma/hfi1/uc.c
deleted file mode 100644 (file)
index df773d4..0000000
+++ /dev/null
@@ -1,604 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "hfi.h"
-#include "verbs_txreq.h"
-#include "qp.h"
-
-/* cut down ridiculously long IB macro names */
-#define OP(x) IB_OPCODE_UC_##x
-
-/* only opcode mask for adaptive pio */
-const u32 uc_only_opcode =
-       BIT(OP(SEND_ONLY) & 0x1f) |
-       BIT(OP(SEND_ONLY_WITH_IMMEDIATE & 0x1f)) |
-       BIT(OP(RDMA_WRITE_ONLY & 0x1f)) |
-       BIT(OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE & 0x1f));
-
-/**
- * hfi1_make_uc_req - construct a request packet (SEND, RDMA write)
- * @qp: a pointer to the QP
- *
- * Assume s_lock is held.
- *
- * Return 1 if constructed; otherwise, return 0.
- */
-int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-       struct hfi1_other_headers *ohdr;
-       struct rvt_swqe *wqe;
-       u32 hwords = 5;
-       u32 bth0 = 0;
-       u32 len;
-       u32 pmtu = qp->pmtu;
-       int middle = 0;
-
-       ps->s_txreq = get_txreq(ps->dev, qp);
-       if (IS_ERR(ps->s_txreq))
-               goto bail_no_tx;
-
-       if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
-               if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
-                       goto bail;
-               /* We are in the error state, flush the work request. */
-               smp_read_barrier_depends(); /* see post_one_send() */
-               if (qp->s_last == ACCESS_ONCE(qp->s_head))
-                       goto bail;
-               /* If DMAs are in progress, we can't flush immediately. */
-               if (iowait_sdma_pending(&priv->s_iowait)) {
-                       qp->s_flags |= RVT_S_WAIT_DMA;
-                       goto bail;
-               }
-               clear_ahg(qp);
-               wqe = rvt_get_swqe_ptr(qp, qp->s_last);
-               hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
-               goto done_free_tx;
-       }
-
-       ohdr = &ps->s_txreq->phdr.hdr.u.oth;
-       if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
-               ohdr = &ps->s_txreq->phdr.hdr.u.l.oth;
-
-       /* Get the next send request. */
-       wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
-       qp->s_wqe = NULL;
-       switch (qp->s_state) {
-       default:
-               if (!(ib_rvt_state_ops[qp->state] &
-                   RVT_PROCESS_NEXT_SEND_OK))
-                       goto bail;
-               /* Check if send work queue is empty. */
-               smp_read_barrier_depends(); /* see post_one_send() */
-               if (qp->s_cur == ACCESS_ONCE(qp->s_head)) {
-                       clear_ahg(qp);
-                       goto bail;
-               }
-               /*
-                * Start a new request.
-                */
-               qp->s_psn = wqe->psn;
-               qp->s_sge.sge = wqe->sg_list[0];
-               qp->s_sge.sg_list = wqe->sg_list + 1;
-               qp->s_sge.num_sge = wqe->wr.num_sge;
-               qp->s_sge.total_len = wqe->length;
-               len = wqe->length;
-               qp->s_len = len;
-               switch (wqe->wr.opcode) {
-               case IB_WR_SEND:
-               case IB_WR_SEND_WITH_IMM:
-                       if (len > pmtu) {
-                               qp->s_state = OP(SEND_FIRST);
-                               len = pmtu;
-                               break;
-                       }
-                       if (wqe->wr.opcode == IB_WR_SEND) {
-                               qp->s_state = OP(SEND_ONLY);
-                       } else {
-                               qp->s_state =
-                                       OP(SEND_ONLY_WITH_IMMEDIATE);
-                               /* Immediate data comes after the BTH */
-                               ohdr->u.imm_data = wqe->wr.ex.imm_data;
-                               hwords += 1;
-                       }
-                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                               bth0 |= IB_BTH_SOLICITED;
-                       qp->s_wqe = wqe;
-                       if (++qp->s_cur >= qp->s_size)
-                               qp->s_cur = 0;
-                       break;
-
-               case IB_WR_RDMA_WRITE:
-               case IB_WR_RDMA_WRITE_WITH_IMM:
-                       ohdr->u.rc.reth.vaddr =
-                               cpu_to_be64(wqe->rdma_wr.remote_addr);
-                       ohdr->u.rc.reth.rkey =
-                               cpu_to_be32(wqe->rdma_wr.rkey);
-                       ohdr->u.rc.reth.length = cpu_to_be32(len);
-                       hwords += sizeof(struct ib_reth) / 4;
-                       if (len > pmtu) {
-                               qp->s_state = OP(RDMA_WRITE_FIRST);
-                               len = pmtu;
-                               break;
-                       }
-                       if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
-                               qp->s_state = OP(RDMA_WRITE_ONLY);
-                       } else {
-                               qp->s_state =
-                                       OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
-                               /* Immediate data comes after the RETH */
-                               ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
-                               hwords += 1;
-                               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                                       bth0 |= IB_BTH_SOLICITED;
-                       }
-                       qp->s_wqe = wqe;
-                       if (++qp->s_cur >= qp->s_size)
-                               qp->s_cur = 0;
-                       break;
-
-               default:
-                       goto bail;
-               }
-               break;
-
-       case OP(SEND_FIRST):
-               qp->s_state = OP(SEND_MIDDLE);
-               /* FALLTHROUGH */
-       case OP(SEND_MIDDLE):
-               len = qp->s_len;
-               if (len > pmtu) {
-                       len = pmtu;
-                       middle = HFI1_CAP_IS_KSET(SDMA_AHG);
-                       break;
-               }
-               if (wqe->wr.opcode == IB_WR_SEND) {
-                       qp->s_state = OP(SEND_LAST);
-               } else {
-                       qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
-                       /* Immediate data comes after the BTH */
-                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
-                       hwords += 1;
-               }
-               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                       bth0 |= IB_BTH_SOLICITED;
-               qp->s_wqe = wqe;
-               if (++qp->s_cur >= qp->s_size)
-                       qp->s_cur = 0;
-               break;
-
-       case OP(RDMA_WRITE_FIRST):
-               qp->s_state = OP(RDMA_WRITE_MIDDLE);
-               /* FALLTHROUGH */
-       case OP(RDMA_WRITE_MIDDLE):
-               len = qp->s_len;
-               if (len > pmtu) {
-                       len = pmtu;
-                       middle = HFI1_CAP_IS_KSET(SDMA_AHG);
-                       break;
-               }
-               if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
-                       qp->s_state = OP(RDMA_WRITE_LAST);
-               } else {
-                       qp->s_state =
-                               OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
-                       /* Immediate data comes after the BTH */
-                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
-                       hwords += 1;
-                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                               bth0 |= IB_BTH_SOLICITED;
-               }
-               qp->s_wqe = wqe;
-               if (++qp->s_cur >= qp->s_size)
-                       qp->s_cur = 0;
-               break;
-       }
-       qp->s_len -= len;
-       qp->s_hdrwords = hwords;
-       ps->s_txreq->sde = priv->s_sde;
-       qp->s_cur_sge = &qp->s_sge;
-       qp->s_cur_size = len;
-       hfi1_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24),
-                            mask_psn(qp->s_psn++), middle, ps);
-       /* pbc */
-       ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
-       return 1;
-
-done_free_tx:
-       hfi1_put_txreq(ps->s_txreq);
-       ps->s_txreq = NULL;
-       return 1;
-
-bail:
-       hfi1_put_txreq(ps->s_txreq);
-
-bail_no_tx:
-       ps->s_txreq = NULL;
-       qp->s_flags &= ~RVT_S_BUSY;
-       qp->s_hdrwords = 0;
-       return 0;
-}
-
-/**
- * hfi1_uc_rcv - handle an incoming UC packet
- * @ibp: the port the packet came in on
- * @hdr: the header of the packet
- * @rcv_flags: flags relevant to rcv processing
- * @data: the packet data
- * @tlen: the length of the packet
- * @qp: the QP for this packet.
- *
- * This is called from qp_rcv() to process an incoming UC packet
- * for the given QP.
- * Called at interrupt level.
- */
-void hfi1_uc_rcv(struct hfi1_packet *packet)
-{
-       struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
-       struct hfi1_ib_header *hdr = packet->hdr;
-       u32 rcv_flags = packet->rcv_flags;
-       void *data = packet->ebuf;
-       u32 tlen = packet->tlen;
-       struct rvt_qp *qp = packet->qp;
-       struct hfi1_other_headers *ohdr = packet->ohdr;
-       u32 bth0, opcode;
-       u32 hdrsize = packet->hlen;
-       u32 psn;
-       u32 pad;
-       struct ib_wc wc;
-       u32 pmtu = qp->pmtu;
-       struct ib_reth *reth;
-       int has_grh = rcv_flags & HFI1_HAS_GRH;
-       int ret;
-       u32 bth1;
-
-       bth0 = be32_to_cpu(ohdr->bth[0]);
-       if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, bth0))
-               return;
-
-       bth1 = be32_to_cpu(ohdr->bth[1]);
-       if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) {
-               if (bth1 & HFI1_BECN_SMASK) {
-                       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-                       u32 rqpn, lqpn;
-                       u16 rlid = be16_to_cpu(hdr->lrh[3]);
-                       u8 sl, sc5;
-
-                       lqpn = bth1 & RVT_QPN_MASK;
-                       rqpn = qp->remote_qpn;
-
-                       sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
-                       sl = ibp->sc_to_sl[sc5];
-
-                       process_becn(ppd, sl, rlid, lqpn, rqpn,
-                                    IB_CC_SVCTYPE_UC);
-               }
-
-               if (bth1 & HFI1_FECN_SMASK) {
-                       struct ib_grh *grh = NULL;
-                       u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]);
-                       u16 slid = be16_to_cpu(hdr->lrh[3]);
-                       u16 dlid = be16_to_cpu(hdr->lrh[1]);
-                       u32 src_qp = qp->remote_qpn;
-                       u8 sc5;
-
-                       sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
-                       if (has_grh)
-                               grh = &hdr->u.l.grh;
-
-                       return_cnp(ibp, qp, src_qp, pkey, dlid, slid, sc5,
-                                  grh);
-               }
-       }
-
-       psn = be32_to_cpu(ohdr->bth[2]);
-       opcode = (bth0 >> 24) & 0xff;
-
-       /* Compare the PSN verses the expected PSN. */
-       if (unlikely(cmp_psn(psn, qp->r_psn) != 0)) {
-               /*
-                * Handle a sequence error.
-                * Silently drop any current message.
-                */
-               qp->r_psn = psn;
-inv:
-               if (qp->r_state == OP(SEND_FIRST) ||
-                   qp->r_state == OP(SEND_MIDDLE)) {
-                       set_bit(RVT_R_REWIND_SGE, &qp->r_aflags);
-                       qp->r_sge.num_sge = 0;
-               } else {
-                       rvt_put_ss(&qp->r_sge);
-               }
-               qp->r_state = OP(SEND_LAST);
-               switch (opcode) {
-               case OP(SEND_FIRST):
-               case OP(SEND_ONLY):
-               case OP(SEND_ONLY_WITH_IMMEDIATE):
-                       goto send_first;
-
-               case OP(RDMA_WRITE_FIRST):
-               case OP(RDMA_WRITE_ONLY):
-               case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
-                       goto rdma_first;
-
-               default:
-                       goto drop;
-               }
-       }
-
-       /* Check for opcode sequence errors. */
-       switch (qp->r_state) {
-       case OP(SEND_FIRST):
-       case OP(SEND_MIDDLE):
-               if (opcode == OP(SEND_MIDDLE) ||
-                   opcode == OP(SEND_LAST) ||
-                   opcode == OP(SEND_LAST_WITH_IMMEDIATE))
-                       break;
-               goto inv;
-
-       case OP(RDMA_WRITE_FIRST):
-       case OP(RDMA_WRITE_MIDDLE):
-               if (opcode == OP(RDMA_WRITE_MIDDLE) ||
-                   opcode == OP(RDMA_WRITE_LAST) ||
-                   opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
-                       break;
-               goto inv;
-
-       default:
-               if (opcode == OP(SEND_FIRST) ||
-                   opcode == OP(SEND_ONLY) ||
-                   opcode == OP(SEND_ONLY_WITH_IMMEDIATE) ||
-                   opcode == OP(RDMA_WRITE_FIRST) ||
-                   opcode == OP(RDMA_WRITE_ONLY) ||
-                   opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
-                       break;
-               goto inv;
-       }
-
-       if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
-               qp_comm_est(qp);
-
-       /* OK, process the packet. */
-       switch (opcode) {
-       case OP(SEND_FIRST):
-       case OP(SEND_ONLY):
-       case OP(SEND_ONLY_WITH_IMMEDIATE):
-send_first:
-               if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) {
-                       qp->r_sge = qp->s_rdma_read_sge;
-               } else {
-                       ret = hfi1_rvt_get_rwqe(qp, 0);
-                       if (ret < 0)
-                               goto op_err;
-                       if (!ret)
-                               goto drop;
-                       /*
-                        * qp->s_rdma_read_sge will be the owner
-                        * of the mr references.
-                        */
-                       qp->s_rdma_read_sge = qp->r_sge;
-               }
-               qp->r_rcv_len = 0;
-               if (opcode == OP(SEND_ONLY))
-                       goto no_immediate_data;
-               else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE))
-                       goto send_last_imm;
-               /* FALLTHROUGH */
-       case OP(SEND_MIDDLE):
-               /* Check for invalid length PMTU or posted rwqe len. */
-               if (unlikely(tlen != (hdrsize + pmtu + 4)))
-                       goto rewind;
-               qp->r_rcv_len += pmtu;
-               if (unlikely(qp->r_rcv_len > qp->r_len))
-                       goto rewind;
-               hfi1_copy_sge(&qp->r_sge, data, pmtu, 0, 0);
-               break;
-
-       case OP(SEND_LAST_WITH_IMMEDIATE):
-send_last_imm:
-               wc.ex.imm_data = ohdr->u.imm_data;
-               wc.wc_flags = IB_WC_WITH_IMM;
-               goto send_last;
-       case OP(SEND_LAST):
-no_immediate_data:
-               wc.ex.imm_data = 0;
-               wc.wc_flags = 0;
-send_last:
-               /* Get the number of bytes the message was padded by. */
-               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-               /* Check for invalid length. */
-               /* LAST len should be >= 1 */
-               if (unlikely(tlen < (hdrsize + pad + 4)))
-                       goto rewind;
-               /* Don't count the CRC. */
-               tlen -= (hdrsize + pad + 4);
-               wc.byte_len = tlen + qp->r_rcv_len;
-               if (unlikely(wc.byte_len > qp->r_len))
-                       goto rewind;
-               wc.opcode = IB_WC_RECV;
-               hfi1_copy_sge(&qp->r_sge, data, tlen, 0, 0);
-               rvt_put_ss(&qp->s_rdma_read_sge);
-last_imm:
-               wc.wr_id = qp->r_wr_id;
-               wc.status = IB_WC_SUCCESS;
-               wc.qp = &qp->ibqp;
-               wc.src_qp = qp->remote_qpn;
-               wc.slid = qp->remote_ah_attr.dlid;
-               /*
-                * It seems that IB mandates the presence of an SL in a
-                * work completion only for the UD transport (see section
-                * 11.4.2 of IBTA Vol. 1).
-                *
-                * However, the way the SL is chosen below is consistent
-                * with the way that IB/qib works and is trying avoid
-                * introducing incompatibilities.
-                *
-                * See also OPA Vol. 1, section 9.7.6, and table 9-17.
-                */
-               wc.sl = qp->remote_ah_attr.sl;
-               /* zero fields that are N/A */
-               wc.vendor_err = 0;
-               wc.pkey_index = 0;
-               wc.dlid_path_bits = 0;
-               wc.port_num = 0;
-               /* Signal completion event if the solicited bit is set. */
-               rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-                            (ohdr->bth[0] &
-                             cpu_to_be32(IB_BTH_SOLICITED)) != 0);
-               break;
-
-       case OP(RDMA_WRITE_FIRST):
-       case OP(RDMA_WRITE_ONLY):
-       case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */
-rdma_first:
-               if (unlikely(!(qp->qp_access_flags &
-                              IB_ACCESS_REMOTE_WRITE))) {
-                       goto drop;
-               }
-               reth = &ohdr->u.rc.reth;
-               qp->r_len = be32_to_cpu(reth->length);
-               qp->r_rcv_len = 0;
-               qp->r_sge.sg_list = NULL;
-               if (qp->r_len != 0) {
-                       u32 rkey = be32_to_cpu(reth->rkey);
-                       u64 vaddr = be64_to_cpu(reth->vaddr);
-                       int ok;
-
-                       /* Check rkey */
-                       ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len,
-                                        vaddr, rkey, IB_ACCESS_REMOTE_WRITE);
-                       if (unlikely(!ok))
-                               goto drop;
-                       qp->r_sge.num_sge = 1;
-               } else {
-                       qp->r_sge.num_sge = 0;
-                       qp->r_sge.sge.mr = NULL;
-                       qp->r_sge.sge.vaddr = NULL;
-                       qp->r_sge.sge.length = 0;
-                       qp->r_sge.sge.sge_length = 0;
-               }
-               if (opcode == OP(RDMA_WRITE_ONLY)) {
-                       goto rdma_last;
-               } else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) {
-                       wc.ex.imm_data = ohdr->u.rc.imm_data;
-                       goto rdma_last_imm;
-               }
-               /* FALLTHROUGH */
-       case OP(RDMA_WRITE_MIDDLE):
-               /* Check for invalid length PMTU or posted rwqe len. */
-               if (unlikely(tlen != (hdrsize + pmtu + 4)))
-                       goto drop;
-               qp->r_rcv_len += pmtu;
-               if (unlikely(qp->r_rcv_len > qp->r_len))
-                       goto drop;
-               hfi1_copy_sge(&qp->r_sge, data, pmtu, 1, 0);
-               break;
-
-       case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
-               wc.ex.imm_data = ohdr->u.imm_data;
-rdma_last_imm:
-               wc.wc_flags = IB_WC_WITH_IMM;
-
-               /* Get the number of bytes the message was padded by. */
-               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-               /* Check for invalid length. */
-               /* LAST len should be >= 1 */
-               if (unlikely(tlen < (hdrsize + pad + 4)))
-                       goto drop;
-               /* Don't count the CRC. */
-               tlen -= (hdrsize + pad + 4);
-               if (unlikely(tlen + qp->r_rcv_len != qp->r_len))
-                       goto drop;
-               if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) {
-                       rvt_put_ss(&qp->s_rdma_read_sge);
-               } else {
-                       ret = hfi1_rvt_get_rwqe(qp, 1);
-                       if (ret < 0)
-                               goto op_err;
-                       if (!ret)
-                               goto drop;
-               }
-               wc.byte_len = qp->r_len;
-               wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
-               hfi1_copy_sge(&qp->r_sge, data, tlen, 1, 0);
-               rvt_put_ss(&qp->r_sge);
-               goto last_imm;
-
-       case OP(RDMA_WRITE_LAST):
-rdma_last:
-               /* Get the number of bytes the message was padded by. */
-               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-               /* Check for invalid length. */
-               /* LAST len should be >= 1 */
-               if (unlikely(tlen < (hdrsize + pad + 4)))
-                       goto drop;
-               /* Don't count the CRC. */
-               tlen -= (hdrsize + pad + 4);
-               if (unlikely(tlen + qp->r_rcv_len != qp->r_len))
-                       goto drop;
-               hfi1_copy_sge(&qp->r_sge, data, tlen, 1, 0);
-               rvt_put_ss(&qp->r_sge);
-               break;
-
-       default:
-               /* Drop packet for unknown opcodes. */
-               goto drop;
-       }
-       qp->r_psn++;
-       qp->r_state = opcode;
-       return;
-
-rewind:
-       set_bit(RVT_R_REWIND_SGE, &qp->r_aflags);
-       qp->r_sge.num_sge = 0;
-drop:
-       ibp->rvp.n_pkt_drops++;
-       return;
-
-op_err:
-       hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
-}
diff --git a/drivers/staging/rdma/hfi1/ud.c b/drivers/staging/rdma/hfi1/ud.c
deleted file mode 100644 (file)
index 1e503ad..0000000
+++ /dev/null
@@ -1,911 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/net.h>
-#include <rdma/ib_smi.h>
-
-#include "hfi.h"
-#include "mad.h"
-#include "verbs_txreq.h"
-#include "qp.h"
-
-/**
- * ud_loopback - handle send on loopback QPs
- * @sqp: the sending QP
- * @swqe: the send work request
- *
- * This is called from hfi1_make_ud_req() to forward a WQE addressed
- * to the same HFI.
- * Note that the receive interrupt handler may be calling hfi1_ud_rcv()
- * while this is being called.
- */
-static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
-{
-       struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num);
-       struct hfi1_pportdata *ppd;
-       struct rvt_qp *qp;
-       struct ib_ah_attr *ah_attr;
-       unsigned long flags;
-       struct rvt_sge_state ssge;
-       struct rvt_sge *sge;
-       struct ib_wc wc;
-       u32 length;
-       enum ib_qp_type sqptype, dqptype;
-
-       rcu_read_lock();
-
-       qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp,
-                           swqe->ud_wr.remote_qpn);
-       if (!qp) {
-               ibp->rvp.n_pkt_drops++;
-               rcu_read_unlock();
-               return;
-       }
-
-       sqptype = sqp->ibqp.qp_type == IB_QPT_GSI ?
-                       IB_QPT_UD : sqp->ibqp.qp_type;
-       dqptype = qp->ibqp.qp_type == IB_QPT_GSI ?
-                       IB_QPT_UD : qp->ibqp.qp_type;
-
-       if (dqptype != sqptype ||
-           !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
-               ibp->rvp.n_pkt_drops++;
-               goto drop;
-       }
-
-       ah_attr = &ibah_to_rvtah(swqe->ud_wr.ah)->attr;
-       ppd = ppd_from_ibp(ibp);
-
-       if (qp->ibqp.qp_num > 1) {
-               u16 pkey;
-               u16 slid;
-               u8 sc5 = ibp->sl_to_sc[ah_attr->sl];
-
-               pkey = hfi1_get_pkey(ibp, sqp->s_pkey_index);
-               slid = ppd->lid | (ah_attr->src_path_bits &
-                                  ((1 << ppd->lmc) - 1));
-               if (unlikely(ingress_pkey_check(ppd, pkey, sc5,
-                                               qp->s_pkey_index, slid))) {
-                       hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY, pkey,
-                                      ah_attr->sl,
-                                      sqp->ibqp.qp_num, qp->ibqp.qp_num,
-                                      slid, ah_attr->dlid);
-                       goto drop;
-               }
-       }
-
-       /*
-        * Check that the qkey matches (except for QP0, see 9.6.1.4.1).
-        * Qkeys with the high order bit set mean use the
-        * qkey from the QP context instead of the WR (see 10.2.5).
-        */
-       if (qp->ibqp.qp_num) {
-               u32 qkey;
-
-               qkey = (int)swqe->ud_wr.remote_qkey < 0 ?
-                       sqp->qkey : swqe->ud_wr.remote_qkey;
-               if (unlikely(qkey != qp->qkey)) {
-                       u16 lid;
-
-                       lid = ppd->lid | (ah_attr->src_path_bits &
-                                         ((1 << ppd->lmc) - 1));
-                       hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_Q_KEY, qkey,
-                                      ah_attr->sl,
-                                      sqp->ibqp.qp_num, qp->ibqp.qp_num,
-                                      lid,
-                                      ah_attr->dlid);
-                       goto drop;
-               }
-       }
-
-       /*
-        * A GRH is expected to precede the data even if not
-        * present on the wire.
-        */
-       length = swqe->length;
-       memset(&wc, 0, sizeof(wc));
-       wc.byte_len = length + sizeof(struct ib_grh);
-
-       if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
-               wc.wc_flags = IB_WC_WITH_IMM;
-               wc.ex.imm_data = swqe->wr.ex.imm_data;
-       }
-
-       spin_lock_irqsave(&qp->r_lock, flags);
-
-       /*
-        * Get the next work request entry to find where to put the data.
-        */
-       if (qp->r_flags & RVT_R_REUSE_SGE) {
-               qp->r_flags &= ~RVT_R_REUSE_SGE;
-       } else {
-               int ret;
-
-               ret = hfi1_rvt_get_rwqe(qp, 0);
-               if (ret < 0) {
-                       hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
-                       goto bail_unlock;
-               }
-               if (!ret) {
-                       if (qp->ibqp.qp_num == 0)
-                               ibp->rvp.n_vl15_dropped++;
-                       goto bail_unlock;
-               }
-       }
-       /* Silently drop packets which are too big. */
-       if (unlikely(wc.byte_len > qp->r_len)) {
-               qp->r_flags |= RVT_R_REUSE_SGE;
-               ibp->rvp.n_pkt_drops++;
-               goto bail_unlock;
-       }
-
-       if (ah_attr->ah_flags & IB_AH_GRH) {
-               hfi1_copy_sge(&qp->r_sge, &ah_attr->grh,
-                             sizeof(struct ib_grh), 1, 0);
-               wc.wc_flags |= IB_WC_GRH;
-       } else {
-               hfi1_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1);
-       }
-       ssge.sg_list = swqe->sg_list + 1;
-       ssge.sge = *swqe->sg_list;
-       ssge.num_sge = swqe->wr.num_sge;
-       sge = &ssge.sge;
-       while (length) {
-               u32 len = sge->length;
-
-               if (len > length)
-                       len = length;
-               if (len > sge->sge_length)
-                       len = sge->sge_length;
-               WARN_ON_ONCE(len == 0);
-               hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, 1, 0);
-               sge->vaddr += len;
-               sge->length -= len;
-               sge->sge_length -= len;
-               if (sge->sge_length == 0) {
-                       if (--ssge.num_sge)
-                               *sge = *ssge.sg_list++;
-               } else if (sge->length == 0 && sge->mr->lkey) {
-                       if (++sge->n >= RVT_SEGSZ) {
-                               if (++sge->m >= sge->mr->mapsz)
-                                       break;
-                               sge->n = 0;
-                       }
-                       sge->vaddr =
-                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
-                       sge->length =
-                               sge->mr->map[sge->m]->segs[sge->n].length;
-               }
-               length -= len;
-       }
-       rvt_put_ss(&qp->r_sge);
-       if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
-               goto bail_unlock;
-       wc.wr_id = qp->r_wr_id;
-       wc.status = IB_WC_SUCCESS;
-       wc.opcode = IB_WC_RECV;
-       wc.qp = &qp->ibqp;
-       wc.src_qp = sqp->ibqp.qp_num;
-       if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) {
-               if (sqp->ibqp.qp_type == IB_QPT_GSI ||
-                   sqp->ibqp.qp_type == IB_QPT_SMI)
-                       wc.pkey_index = swqe->ud_wr.pkey_index;
-               else
-                       wc.pkey_index = sqp->s_pkey_index;
-       } else {
-               wc.pkey_index = 0;
-       }
-       wc.slid = ppd->lid | (ah_attr->src_path_bits & ((1 << ppd->lmc) - 1));
-       /* Check for loopback when the port lid is not set */
-       if (wc.slid == 0 && sqp->ibqp.qp_type == IB_QPT_GSI)
-               wc.slid = be16_to_cpu(IB_LID_PERMISSIVE);
-       wc.sl = ah_attr->sl;
-       wc.dlid_path_bits = ah_attr->dlid & ((1 << ppd->lmc) - 1);
-       wc.port_num = qp->port_num;
-       /* Signal completion event if the solicited bit is set. */
-       rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-                    swqe->wr.send_flags & IB_SEND_SOLICITED);
-       ibp->rvp.n_loop_pkts++;
-bail_unlock:
-       spin_unlock_irqrestore(&qp->r_lock, flags);
-drop:
-       rcu_read_unlock();
-}
-
-/**
- * hfi1_make_ud_req - construct a UD request packet
- * @qp: the QP
- *
- * Assume s_lock is held.
- *
- * Return 1 if constructed; otherwise, return 0.
- */
-int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-       struct hfi1_other_headers *ohdr;
-       struct ib_ah_attr *ah_attr;
-       struct hfi1_pportdata *ppd;
-       struct hfi1_ibport *ibp;
-       struct rvt_swqe *wqe;
-       u32 nwords;
-       u32 extra_bytes;
-       u32 bth0;
-       u16 lrh0;
-       u16 lid;
-       int next_cur;
-       u8 sc5;
-
-       ps->s_txreq = get_txreq(ps->dev, qp);
-       if (IS_ERR(ps->s_txreq))
-               goto bail_no_tx;
-
-       if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK)) {
-               if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
-                       goto bail;
-               /* We are in the error state, flush the work request. */
-               smp_read_barrier_depends(); /* see post_one_send */
-               if (qp->s_last == ACCESS_ONCE(qp->s_head))
-                       goto bail;
-               /* If DMAs are in progress, we can't flush immediately. */
-               if (iowait_sdma_pending(&priv->s_iowait)) {
-                       qp->s_flags |= RVT_S_WAIT_DMA;
-                       goto bail;
-               }
-               wqe = rvt_get_swqe_ptr(qp, qp->s_last);
-               hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
-               goto done_free_tx;
-       }
-
-       /* see post_one_send() */
-       smp_read_barrier_depends();
-       if (qp->s_cur == ACCESS_ONCE(qp->s_head))
-               goto bail;
-
-       wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
-       next_cur = qp->s_cur + 1;
-       if (next_cur >= qp->s_size)
-               next_cur = 0;
-
-       /* Construct the header. */
-       ibp = to_iport(qp->ibqp.device, qp->port_num);
-       ppd = ppd_from_ibp(ibp);
-       ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr;
-       if (ah_attr->dlid < be16_to_cpu(IB_MULTICAST_LID_BASE) ||
-           ah_attr->dlid == be16_to_cpu(IB_LID_PERMISSIVE)) {
-               lid = ah_attr->dlid & ~((1 << ppd->lmc) - 1);
-               if (unlikely(!loopback &&
-                            (lid == ppd->lid ||
-                             (lid == be16_to_cpu(IB_LID_PERMISSIVE) &&
-                             qp->ibqp.qp_type == IB_QPT_GSI)))) {
-                       unsigned long tflags = ps->flags;
-                       /*
-                        * If DMAs are in progress, we can't generate
-                        * a completion for the loopback packet since
-                        * it would be out of order.
-                        * Instead of waiting, we could queue a
-                        * zero length descriptor so we get a callback.
-                        */
-                       if (iowait_sdma_pending(&priv->s_iowait)) {
-                               qp->s_flags |= RVT_S_WAIT_DMA;
-                               goto bail;
-                       }
-                       qp->s_cur = next_cur;
-                       spin_unlock_irqrestore(&qp->s_lock, tflags);
-                       ud_loopback(qp, wqe);
-                       spin_lock_irqsave(&qp->s_lock, tflags);
-                       ps->flags = tflags;
-                       hfi1_send_complete(qp, wqe, IB_WC_SUCCESS);
-                       goto done_free_tx;
-               }
-       }
-
-       qp->s_cur = next_cur;
-       extra_bytes = -wqe->length & 3;
-       nwords = (wqe->length + extra_bytes) >> 2;
-
-       /* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */
-       qp->s_hdrwords = 7;
-       qp->s_cur_size = wqe->length;
-       qp->s_cur_sge = &qp->s_sge;
-       qp->s_srate = ah_attr->static_rate;
-       qp->srate_mbps = ib_rate_to_mbps(qp->s_srate);
-       qp->s_wqe = wqe;
-       qp->s_sge.sge = wqe->sg_list[0];
-       qp->s_sge.sg_list = wqe->sg_list + 1;
-       qp->s_sge.num_sge = wqe->wr.num_sge;
-       qp->s_sge.total_len = wqe->length;
-
-       if (ah_attr->ah_flags & IB_AH_GRH) {
-               /* Header size in 32-bit words. */
-               qp->s_hdrwords += hfi1_make_grh(ibp,
-                                               &ps->s_txreq->phdr.hdr.u.l.grh,
-                                               &ah_attr->grh,
-                                               qp->s_hdrwords, nwords);
-               lrh0 = HFI1_LRH_GRH;
-               ohdr = &ps->s_txreq->phdr.hdr.u.l.oth;
-               /*
-                * Don't worry about sending to locally attached multicast
-                * QPs.  It is unspecified by the spec. what happens.
-                */
-       } else {
-               /* Header size in 32-bit words. */
-               lrh0 = HFI1_LRH_BTH;
-               ohdr = &ps->s_txreq->phdr.hdr.u.oth;
-       }
-       if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
-               qp->s_hdrwords++;
-               ohdr->u.ud.imm_data = wqe->wr.ex.imm_data;
-               bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24;
-       } else {
-               bth0 = IB_OPCODE_UD_SEND_ONLY << 24;
-       }
-       sc5 = ibp->sl_to_sc[ah_attr->sl];
-       lrh0 |= (ah_attr->sl & 0xf) << 4;
-       if (qp->ibqp.qp_type == IB_QPT_SMI) {
-               lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */
-               priv->s_sc = 0xf;
-       } else {
-               lrh0 |= (sc5 & 0xf) << 12;
-               priv->s_sc = sc5;
-       }
-       priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc);
-       ps->s_txreq->sde = priv->s_sde;
-       priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc);
-       ps->s_txreq->psc = priv->s_sendcontext;
-       ps->s_txreq->phdr.hdr.lrh[0] = cpu_to_be16(lrh0);
-       ps->s_txreq->phdr.hdr.lrh[1] = cpu_to_be16(ah_attr->dlid);
-       ps->s_txreq->phdr.hdr.lrh[2] =
-               cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
-       if (ah_attr->dlid == be16_to_cpu(IB_LID_PERMISSIVE)) {
-               ps->s_txreq->phdr.hdr.lrh[3] = IB_LID_PERMISSIVE;
-       } else {
-               lid = ppd->lid;
-               if (lid) {
-                       lid |= ah_attr->src_path_bits & ((1 << ppd->lmc) - 1);
-                       ps->s_txreq->phdr.hdr.lrh[3] = cpu_to_be16(lid);
-               } else {
-                       ps->s_txreq->phdr.hdr.lrh[3] = IB_LID_PERMISSIVE;
-               }
-       }
-       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-               bth0 |= IB_BTH_SOLICITED;
-       bth0 |= extra_bytes << 20;
-       if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI)
-               bth0 |= hfi1_get_pkey(ibp, wqe->ud_wr.pkey_index);
-       else
-               bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index);
-       ohdr->bth[0] = cpu_to_be32(bth0);
-       ohdr->bth[1] = cpu_to_be32(wqe->ud_wr.remote_qpn);
-       ohdr->bth[2] = cpu_to_be32(mask_psn(wqe->psn));
-       /*
-        * Qkeys with the high order bit set mean use the
-        * qkey from the QP context instead of the WR (see 10.2.5).
-        */
-       ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.remote_qkey < 0 ?
-                                        qp->qkey : wqe->ud_wr.remote_qkey);
-       ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
-       /* disarm any ahg */
-       priv->s_hdr->ahgcount = 0;
-       priv->s_hdr->ahgidx = 0;
-       priv->s_hdr->tx_flags = 0;
-       priv->s_hdr->sde = NULL;
-       /* pbc */
-       ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
-
-       return 1;
-
-done_free_tx:
-       hfi1_put_txreq(ps->s_txreq);
-       ps->s_txreq = NULL;
-       return 1;
-
-bail:
-       hfi1_put_txreq(ps->s_txreq);
-
-bail_no_tx:
-       ps->s_txreq = NULL;
-       qp->s_flags &= ~RVT_S_BUSY;
-       qp->s_hdrwords = 0;
-       return 0;
-}
-
-/*
- * Hardware can't check this so we do it here.
- *
- * This is a slightly different algorithm than the standard pkey check.  It
- * special cases the management keys and allows for 0x7fff and 0xffff to be in
- * the table at the same time.
- *
- * @returns the index found or -1 if not found
- */
-int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey)
-{
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       unsigned i;
-
-       if (pkey == FULL_MGMT_P_KEY || pkey == LIM_MGMT_P_KEY) {
-               unsigned lim_idx = -1;
-
-               for (i = 0; i < ARRAY_SIZE(ppd->pkeys); ++i) {
-                       /* here we look for an exact match */
-                       if (ppd->pkeys[i] == pkey)
-                               return i;
-                       if (ppd->pkeys[i] == LIM_MGMT_P_KEY)
-                               lim_idx = i;
-               }
-
-               /* did not find 0xffff return 0x7fff idx if found */
-               if (pkey == FULL_MGMT_P_KEY)
-                       return lim_idx;
-
-               /* no match...  */
-               return -1;
-       }
-
-       pkey &= 0x7fff; /* remove limited/full membership bit */
-
-       for (i = 0; i < ARRAY_SIZE(ppd->pkeys); ++i)
-               if ((ppd->pkeys[i] & 0x7fff) == pkey)
-                       return i;
-
-       /*
-        * Should not get here, this means hardware failed to validate pkeys.
-        */
-       return -1;
-}
-
-void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn,
-               u32 pkey, u32 slid, u32 dlid, u8 sc5,
-               const struct ib_grh *old_grh)
-{
-       u64 pbc, pbc_flags = 0;
-       u32 bth0, plen, vl, hwords = 5;
-       u16 lrh0;
-       u8 sl = ibp->sc_to_sl[sc5];
-       struct hfi1_ib_header hdr;
-       struct hfi1_other_headers *ohdr;
-       struct pio_buf *pbuf;
-       struct send_context *ctxt = qp_to_send_context(qp, sc5);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-
-       if (old_grh) {
-               struct ib_grh *grh = &hdr.u.l.grh;
-
-               grh->version_tclass_flow = old_grh->version_tclass_flow;
-               grh->paylen = cpu_to_be16((hwords - 2 + SIZE_OF_CRC) << 2);
-               grh->hop_limit = 0xff;
-               grh->sgid = old_grh->dgid;
-               grh->dgid = old_grh->sgid;
-               ohdr = &hdr.u.l.oth;
-               lrh0 = HFI1_LRH_GRH;
-               hwords += sizeof(struct ib_grh) / sizeof(u32);
-       } else {
-               ohdr = &hdr.u.oth;
-               lrh0 = HFI1_LRH_BTH;
-       }
-
-       lrh0 |= (sc5 & 0xf) << 12 | sl << 4;
-
-       bth0 = pkey | (IB_OPCODE_CNP << 24);
-       ohdr->bth[0] = cpu_to_be32(bth0);
-
-       ohdr->bth[1] = cpu_to_be32(remote_qpn | (1 << HFI1_BECN_SHIFT));
-       ohdr->bth[2] = 0; /* PSN 0 */
-
-       hdr.lrh[0] = cpu_to_be16(lrh0);
-       hdr.lrh[1] = cpu_to_be16(dlid);
-       hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
-       hdr.lrh[3] = cpu_to_be16(slid);
-
-       plen = 2 /* PBC */ + hwords;
-       pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
-       vl = sc_to_vlt(ppd->dd, sc5);
-       pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
-       if (ctxt) {
-               pbuf = sc_buffer_alloc(ctxt, plen, NULL, NULL);
-               if (pbuf)
-                       ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc,
-                                                &hdr, hwords);
-       }
-}
-
-/*
- * opa_smp_check() - Do the regular pkey checking, and the additional
- * checks for SMPs specified in OPAv1 rev 0.90, section 9.10.26
- * ("SMA Packet Checks").
- *
- * Note that:
- *   - Checks are done using the pkey directly from the packet's BTH,
- *     and specifically _not_ the pkey that we attach to the completion,
- *     which may be different.
- *   - These checks are specifically for "non-local" SMPs (i.e., SMPs
- *     which originated on another node). SMPs which are sent from, and
- *     destined to this node are checked in opa_local_smp_check().
- *
- * At the point where opa_smp_check() is called, we know:
- *   - destination QP is QP0
- *
- * opa_smp_check() returns 0 if all checks succeed, 1 otherwise.
- */
-static int opa_smp_check(struct hfi1_ibport *ibp, u16 pkey, u8 sc5,
-                        struct rvt_qp *qp, u16 slid, struct opa_smp *smp)
-{
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-
-       /*
-        * I don't think it's possible for us to get here with sc != 0xf,
-        * but check it to be certain.
-        */
-       if (sc5 != 0xf)
-               return 1;
-
-       if (rcv_pkey_check(ppd, pkey, sc5, slid))
-               return 1;
-
-       /*
-        * At this point we know (and so don't need to check again) that
-        * the pkey is either LIM_MGMT_P_KEY, or FULL_MGMT_P_KEY
-        * (see ingress_pkey_check).
-        */
-       if (smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE &&
-           smp->mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED) {
-               ingress_pkey_table_fail(ppd, pkey, slid);
-               return 1;
-       }
-
-       /*
-        * SMPs fall into one of four (disjoint) categories:
-        * SMA request, SMA response, trap, or trap repress.
-        * Our response depends, in part, on which type of
-        * SMP we're processing.
-        *
-        * If this is not an SMA request, or trap repress:
-        *   - accept MAD if the port is running an SM
-        *   - pkey == FULL_MGMT_P_KEY =>
-        *       reply with unsupported method (i.e., just mark
-        *       the smp's status field here, and let it be
-        *       processed normally)
-        *   - pkey != LIM_MGMT_P_KEY =>
-        *       increment port recv constraint errors, drop MAD
-        * If this is an SMA request or trap repress:
-        *   - pkey != FULL_MGMT_P_KEY =>
-        *       increment port recv constraint errors, drop MAD
-        */
-       switch (smp->method) {
-       case IB_MGMT_METHOD_GET:
-       case IB_MGMT_METHOD_SET:
-       case IB_MGMT_METHOD_REPORT:
-       case IB_MGMT_METHOD_TRAP_REPRESS:
-               if (pkey != FULL_MGMT_P_KEY) {
-                       ingress_pkey_table_fail(ppd, pkey, slid);
-                       return 1;
-               }
-               break;
-       case IB_MGMT_METHOD_SEND:
-       case IB_MGMT_METHOD_TRAP:
-       case IB_MGMT_METHOD_GET_RESP:
-       case IB_MGMT_METHOD_REPORT_RESP:
-               if (ibp->rvp.port_cap_flags & IB_PORT_SM)
-                       return 0;
-               if (pkey == FULL_MGMT_P_KEY) {
-                       smp->status |= IB_SMP_UNSUP_METHOD;
-                       return 0;
-               }
-               if (pkey != LIM_MGMT_P_KEY) {
-                       ingress_pkey_table_fail(ppd, pkey, slid);
-                       return 1;
-               }
-               break;
-       default:
-               break;
-       }
-       return 0;
-}
-
-/**
- * hfi1_ud_rcv - receive an incoming UD packet
- * @ibp: the port the packet came in on
- * @hdr: the packet header
- * @rcv_flags: flags relevant to rcv processing
- * @data: the packet data
- * @tlen: the packet length
- * @qp: the QP the packet came on
- *
- * This is called from qp_rcv() to process an incoming UD packet
- * for the given QP.
- * Called at interrupt level.
- */
-void hfi1_ud_rcv(struct hfi1_packet *packet)
-{
-       struct hfi1_other_headers *ohdr = packet->ohdr;
-       int opcode;
-       u32 hdrsize = packet->hlen;
-       u32 pad;
-       struct ib_wc wc;
-       u32 qkey;
-       u32 src_qp;
-       u16 dlid, pkey;
-       int mgmt_pkey_idx = -1;
-       struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
-       struct hfi1_ib_header *hdr = packet->hdr;
-       u32 rcv_flags = packet->rcv_flags;
-       void *data = packet->ebuf;
-       u32 tlen = packet->tlen;
-       struct rvt_qp *qp = packet->qp;
-       bool has_grh = rcv_flags & HFI1_HAS_GRH;
-       bool sc4_bit = has_sc4_bit(packet);
-       u8 sc;
-       u32 bth1;
-       int is_mcast;
-       struct ib_grh *grh = NULL;
-
-       qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
-       src_qp = be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK;
-       dlid = be16_to_cpu(hdr->lrh[1]);
-       is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) &&
-                       (dlid != be16_to_cpu(IB_LID_PERMISSIVE));
-       bth1 = be32_to_cpu(ohdr->bth[1]);
-       if (unlikely(bth1 & HFI1_BECN_SMASK)) {
-               /*
-                * In pre-B0 h/w the CNP_OPCODE is handled via an
-                * error path.
-                */
-               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-               u32 lqpn =  be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
-               u8 sl, sc5;
-
-               sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
-               sc5 |= sc4_bit;
-               sl = ibp->sc_to_sl[sc5];
-
-               process_becn(ppd, sl, 0, lqpn, 0, IB_CC_SVCTYPE_UD);
-       }
-
-       /*
-        * The opcode is in the low byte when its in network order
-        * (top byte when in host order).
-        */
-       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
-       opcode &= 0xff;
-
-       pkey = (u16)be32_to_cpu(ohdr->bth[0]);
-
-       if (!is_mcast && (opcode != IB_OPCODE_CNP) && bth1 & HFI1_FECN_SMASK) {
-               u16 slid = be16_to_cpu(hdr->lrh[3]);
-               u8 sc5;
-
-               sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
-               sc5 |= sc4_bit;
-
-               return_cnp(ibp, qp, src_qp, pkey, dlid, slid, sc5, grh);
-       }
-       /*
-        * Get the number of bytes the message was padded by
-        * and drop incomplete packets.
-        */
-       pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-       if (unlikely(tlen < (hdrsize + pad + 4)))
-               goto drop;
-
-       tlen -= hdrsize + pad + 4;
-
-       /*
-        * Check that the permissive LID is only used on QP0
-        * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1).
-        */
-       if (qp->ibqp.qp_num) {
-               if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE ||
-                            hdr->lrh[3] == IB_LID_PERMISSIVE))
-                       goto drop;
-               if (qp->ibqp.qp_num > 1) {
-                       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-                       u16 slid;
-                       u8 sc5;
-
-                       sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
-                       sc5 |= sc4_bit;
-
-                       slid = be16_to_cpu(hdr->lrh[3]);
-                       if (unlikely(rcv_pkey_check(ppd, pkey, sc5, slid))) {
-                               /*
-                                * Traps will not be sent for packets dropped
-                                * by the HW. This is fine, as sending trap
-                                * for invalid pkeys is optional according to
-                                * IB spec (release 1.3, section 10.9.4)
-                                */
-                               hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY,
-                                              pkey,
-                                              (be16_to_cpu(hdr->lrh[0]) >> 4) &
-                                               0xF,
-                                              src_qp, qp->ibqp.qp_num,
-                                              be16_to_cpu(hdr->lrh[3]),
-                                              be16_to_cpu(hdr->lrh[1]));
-                               return;
-                       }
-               } else {
-                       /* GSI packet */
-                       mgmt_pkey_idx = hfi1_lookup_pkey_idx(ibp, pkey);
-                       if (mgmt_pkey_idx < 0)
-                               goto drop;
-               }
-               if (unlikely(qkey != qp->qkey)) {
-                       hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_Q_KEY, qkey,
-                                      (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
-                                      src_qp, qp->ibqp.qp_num,
-                                      be16_to_cpu(hdr->lrh[3]),
-                                      be16_to_cpu(hdr->lrh[1]));
-                       return;
-               }
-               /* Drop invalid MAD packets (see 13.5.3.1). */
-               if (unlikely(qp->ibqp.qp_num == 1 &&
-                            (tlen > 2048 ||
-                             (be16_to_cpu(hdr->lrh[0]) >> 12) == 15)))
-                       goto drop;
-       } else {
-               /* Received on QP0, and so by definition, this is an SMP */
-               struct opa_smp *smp = (struct opa_smp *)data;
-               u16 slid = be16_to_cpu(hdr->lrh[3]);
-               u8 sc5;
-
-               sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
-               sc5 |= sc4_bit;
-
-               if (opa_smp_check(ibp, pkey, sc5, qp, slid, smp))
-                       goto drop;
-
-               if (tlen > 2048)
-                       goto drop;
-               if ((hdr->lrh[1] == IB_LID_PERMISSIVE ||
-                    hdr->lrh[3] == IB_LID_PERMISSIVE) &&
-                   smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
-                       goto drop;
-
-               /* look up SMI pkey */
-               mgmt_pkey_idx = hfi1_lookup_pkey_idx(ibp, pkey);
-               if (mgmt_pkey_idx < 0)
-                       goto drop;
-       }
-
-       if (qp->ibqp.qp_num > 1 &&
-           opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
-               wc.ex.imm_data = ohdr->u.ud.imm_data;
-               wc.wc_flags = IB_WC_WITH_IMM;
-               tlen -= sizeof(u32);
-       } else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
-               wc.ex.imm_data = 0;
-               wc.wc_flags = 0;
-       } else {
-               goto drop;
-       }
-
-       /*
-        * A GRH is expected to precede the data even if not
-        * present on the wire.
-        */
-       wc.byte_len = tlen + sizeof(struct ib_grh);
-
-       /*
-        * Get the next work request entry to find where to put the data.
-        */
-       if (qp->r_flags & RVT_R_REUSE_SGE) {
-               qp->r_flags &= ~RVT_R_REUSE_SGE;
-       } else {
-               int ret;
-
-               ret = hfi1_rvt_get_rwqe(qp, 0);
-               if (ret < 0) {
-                       hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
-                       return;
-               }
-               if (!ret) {
-                       if (qp->ibqp.qp_num == 0)
-                               ibp->rvp.n_vl15_dropped++;
-                       return;
-               }
-       }
-       /* Silently drop packets which are too big. */
-       if (unlikely(wc.byte_len > qp->r_len)) {
-               qp->r_flags |= RVT_R_REUSE_SGE;
-               goto drop;
-       }
-       if (has_grh) {
-               hfi1_copy_sge(&qp->r_sge, &hdr->u.l.grh,
-                             sizeof(struct ib_grh), 1, 0);
-               wc.wc_flags |= IB_WC_GRH;
-       } else {
-               hfi1_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1);
-       }
-       hfi1_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh),
-                     1, 0);
-       rvt_put_ss(&qp->r_sge);
-       if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
-               return;
-       wc.wr_id = qp->r_wr_id;
-       wc.status = IB_WC_SUCCESS;
-       wc.opcode = IB_WC_RECV;
-       wc.vendor_err = 0;
-       wc.qp = &qp->ibqp;
-       wc.src_qp = src_qp;
-
-       if (qp->ibqp.qp_type == IB_QPT_GSI ||
-           qp->ibqp.qp_type == IB_QPT_SMI) {
-               if (mgmt_pkey_idx < 0) {
-                       if (net_ratelimit()) {
-                               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-                               struct hfi1_devdata *dd = ppd->dd;
-
-                               dd_dev_err(dd, "QP type %d mgmt_pkey_idx < 0 and packet not dropped???\n",
-                                          qp->ibqp.qp_type);
-                               mgmt_pkey_idx = 0;
-                       }
-               }
-               wc.pkey_index = (unsigned)mgmt_pkey_idx;
-       } else {
-               wc.pkey_index = 0;
-       }
-
-       wc.slid = be16_to_cpu(hdr->lrh[3]);
-       sc = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
-       sc |= sc4_bit;
-       wc.sl = ibp->sc_to_sl[sc];
-
-       /*
-        * Save the LMC lower bits if the destination LID is a unicast LID.
-        */
-       wc.dlid_path_bits = dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE) ? 0 :
-               dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1);
-       wc.port_num = qp->port_num;
-       /* Signal completion event if the solicited bit is set. */
-       rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-                    (ohdr->bth[0] &
-                     cpu_to_be32(IB_BTH_SOLICITED)) != 0);
-       return;
-
-drop:
-       ibp->rvp.n_pkt_drops++;
-}
diff --git a/drivers/staging/rdma/hfi1/user_exp_rcv.c b/drivers/staging/rdma/hfi1/user_exp_rcv.c
deleted file mode 100644 (file)
index 1b640a3..0000000
+++ /dev/null
@@ -1,1050 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#include <asm/page.h>
-
-#include "user_exp_rcv.h"
-#include "trace.h"
-#include "mmu_rb.h"
-
-struct tid_group {
-       struct list_head list;
-       unsigned base;
-       u8 size;
-       u8 used;
-       u8 map;
-};
-
-struct tid_rb_node {
-       struct mmu_rb_node mmu;
-       unsigned long phys;
-       struct tid_group *grp;
-       u32 rcventry;
-       dma_addr_t dma_addr;
-       bool freed;
-       unsigned npages;
-       struct page *pages[0];
-};
-
-struct tid_pageset {
-       u16 idx;
-       u16 count;
-};
-
-#define EXP_TID_SET_EMPTY(set) (set.count == 0 && list_empty(&set.list))
-
-#define num_user_pages(vaddr, len)                                    \
-       (1 + (((((unsigned long)(vaddr) +                              \
-                (unsigned long)(len) - 1) & PAGE_MASK) -              \
-              ((unsigned long)vaddr & PAGE_MASK)) >> PAGE_SHIFT))
-
-static void unlock_exp_tids(struct hfi1_ctxtdata *, struct exp_tid_set *,
-                           struct rb_root *);
-static u32 find_phys_blocks(struct page **, unsigned, struct tid_pageset *);
-static int set_rcvarray_entry(struct file *, unsigned long, u32,
-                             struct tid_group *, struct page **, unsigned);
-static int mmu_rb_insert(struct rb_root *, struct mmu_rb_node *);
-static void mmu_rb_remove(struct rb_root *, struct mmu_rb_node *,
-                         struct mm_struct *);
-static int mmu_rb_invalidate(struct rb_root *, struct mmu_rb_node *);
-static int program_rcvarray(struct file *, unsigned long, struct tid_group *,
-                           struct tid_pageset *, unsigned, u16, struct page **,
-                           u32 *, unsigned *, unsigned *);
-static int unprogram_rcvarray(struct file *, u32, struct tid_group **);
-static void clear_tid_node(struct hfi1_filedata *, u16, struct tid_rb_node *);
-
-static struct mmu_rb_ops tid_rb_ops = {
-       .insert = mmu_rb_insert,
-       .remove = mmu_rb_remove,
-       .invalidate = mmu_rb_invalidate
-};
-
-static inline u32 rcventry2tidinfo(u32 rcventry)
-{
-       u32 pair = rcventry & ~0x1;
-
-       return EXP_TID_SET(IDX, pair >> 1) |
-               EXP_TID_SET(CTRL, 1 << (rcventry - pair));
-}
-
-static inline void exp_tid_group_init(struct exp_tid_set *set)
-{
-       INIT_LIST_HEAD(&set->list);
-       set->count = 0;
-}
-
-static inline void tid_group_remove(struct tid_group *grp,
-                                   struct exp_tid_set *set)
-{
-       list_del_init(&grp->list);
-       set->count--;
-}
-
-static inline void tid_group_add_tail(struct tid_group *grp,
-                                     struct exp_tid_set *set)
-{
-       list_add_tail(&grp->list, &set->list);
-       set->count++;
-}
-
-static inline struct tid_group *tid_group_pop(struct exp_tid_set *set)
-{
-       struct tid_group *grp =
-               list_first_entry(&set->list, struct tid_group, list);
-       list_del_init(&grp->list);
-       set->count--;
-       return grp;
-}
-
-static inline void tid_group_move(struct tid_group *group,
-                                 struct exp_tid_set *s1,
-                                 struct exp_tid_set *s2)
-{
-       tid_group_remove(group, s1);
-       tid_group_add_tail(group, s2);
-}
-
-/*
- * Initialize context and file private data needed for Expected
- * receive caching. This needs to be done after the context has
- * been configured with the eager/expected RcvEntry counts.
- */
-int hfi1_user_exp_rcv_init(struct file *fp)
-{
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct hfi1_devdata *dd = uctxt->dd;
-       unsigned tidbase;
-       int i, ret = 0;
-
-       spin_lock_init(&fd->tid_lock);
-       spin_lock_init(&fd->invalid_lock);
-       fd->tid_rb_root = RB_ROOT;
-
-       if (!uctxt->subctxt_cnt || !fd->subctxt) {
-               exp_tid_group_init(&uctxt->tid_group_list);
-               exp_tid_group_init(&uctxt->tid_used_list);
-               exp_tid_group_init(&uctxt->tid_full_list);
-
-               tidbase = uctxt->expected_base;
-               for (i = 0; i < uctxt->expected_count /
-                            dd->rcv_entries.group_size; i++) {
-                       struct tid_group *grp;
-
-                       grp = kzalloc(sizeof(*grp), GFP_KERNEL);
-                       if (!grp) {
-                               /*
-                                * If we fail here, the groups already
-                                * allocated will be freed by the close
-                                * call.
-                                */
-                               ret = -ENOMEM;
-                               goto done;
-                       }
-                       grp->size = dd->rcv_entries.group_size;
-                       grp->base = tidbase;
-                       tid_group_add_tail(grp, &uctxt->tid_group_list);
-                       tidbase += dd->rcv_entries.group_size;
-               }
-       }
-
-       fd->entry_to_rb = kcalloc(uctxt->expected_count,
-                                    sizeof(struct rb_node *),
-                                    GFP_KERNEL);
-       if (!fd->entry_to_rb)
-               return -ENOMEM;
-
-       if (!HFI1_CAP_IS_USET(TID_UNMAP)) {
-               fd->invalid_tid_idx = 0;
-               fd->invalid_tids = kzalloc(uctxt->expected_count *
-                                          sizeof(u32), GFP_KERNEL);
-               if (!fd->invalid_tids) {
-                       ret = -ENOMEM;
-                       goto done;
-               }
-
-               /*
-                * Register MMU notifier callbacks. If the registration
-                * fails, continue but turn off the TID caching for
-                * all user contexts.
-                */
-               ret = hfi1_mmu_rb_register(&fd->tid_rb_root, &tid_rb_ops);
-               if (ret) {
-                       dd_dev_info(dd,
-                                   "Failed MMU notifier registration %d\n",
-                                   ret);
-                       HFI1_CAP_USET(TID_UNMAP);
-                       ret = 0;
-               }
-       }
-
-       /*
-        * PSM does not have a good way to separate, count, and
-        * effectively enforce a limit on RcvArray entries used by
-        * subctxts (when context sharing is used) when TID caching
-        * is enabled. To help with that, we calculate a per-process
-        * RcvArray entry share and enforce that.
-        * If TID caching is not in use, PSM deals with usage on its
-        * own. In that case, we allow any subctxt to take all of the
-        * entries.
-        *
-        * Make sure that we set the tid counts only after successful
-        * init.
-        */
-       spin_lock(&fd->tid_lock);
-       if (uctxt->subctxt_cnt && !HFI1_CAP_IS_USET(TID_UNMAP)) {
-               u16 remainder;
-
-               fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
-               remainder = uctxt->expected_count % uctxt->subctxt_cnt;
-               if (remainder && fd->subctxt < remainder)
-                       fd->tid_limit++;
-       } else {
-               fd->tid_limit = uctxt->expected_count;
-       }
-       spin_unlock(&fd->tid_lock);
-done:
-       return ret;
-}
-
-int hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
-{
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct tid_group *grp, *gptr;
-
-       if (!test_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags))
-               return 0;
-       /*
-        * The notifier would have been removed when the process'es mm
-        * was freed.
-        */
-       if (!HFI1_CAP_IS_USET(TID_UNMAP))
-               hfi1_mmu_rb_unregister(&fd->tid_rb_root);
-
-       kfree(fd->invalid_tids);
-
-       if (!uctxt->cnt) {
-               if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
-                       unlock_exp_tids(uctxt, &uctxt->tid_full_list,
-                                       &fd->tid_rb_root);
-               if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
-                       unlock_exp_tids(uctxt, &uctxt->tid_used_list,
-                                       &fd->tid_rb_root);
-               list_for_each_entry_safe(grp, gptr, &uctxt->tid_group_list.list,
-                                        list) {
-                       list_del_init(&grp->list);
-                       kfree(grp);
-               }
-               hfi1_clear_tids(uctxt);
-       }
-
-       kfree(fd->entry_to_rb);
-       return 0;
-}
-
-/*
- * Write an "empty" RcvArray entry.
- * This function exists so the TID registaration code can use it
- * to write to unused/unneeded entries and still take advantage
- * of the WC performance improvements. The HFI will ignore this
- * write to the RcvArray entry.
- */
-static inline void rcv_array_wc_fill(struct hfi1_devdata *dd, u32 index)
-{
-       /*
-        * Doing the WC fill writes only makes sense if the device is
-        * present and the RcvArray has been mapped as WC memory.
-        */
-       if ((dd->flags & HFI1_PRESENT) && dd->rcvarray_wc)
-               writeq(0, dd->rcvarray_wc + (index * 8));
-}
-
-/*
- * RcvArray entry allocation for Expected Receives is done by the
- * following algorithm:
- *
- * The context keeps 3 lists of groups of RcvArray entries:
- *   1. List of empty groups - tid_group_list
- *      This list is created during user context creation and
- *      contains elements which describe sets (of 8) of empty
- *      RcvArray entries.
- *   2. List of partially used groups - tid_used_list
- *      This list contains sets of RcvArray entries which are
- *      not completely used up. Another mapping request could
- *      use some of all of the remaining entries.
- *   3. List of full groups - tid_full_list
- *      This is the list where sets that are completely used
- *      up go.
- *
- * An attempt to optimize the usage of RcvArray entries is
- * made by finding all sets of physically contiguous pages in a
- * user's buffer.
- * These physically contiguous sets are further split into
- * sizes supported by the receive engine of the HFI. The
- * resulting sets of pages are stored in struct tid_pageset,
- * which describes the sets as:
- *    * .count - number of pages in this set
- *    * .idx - starting index into struct page ** array
- *                    of this set
- *
- * From this point on, the algorithm deals with the page sets
- * described above. The number of pagesets is divided by the
- * RcvArray group size to produce the number of full groups
- * needed.
- *
- * Groups from the 3 lists are manipulated using the following
- * rules:
- *   1. For each set of 8 pagesets, a complete group from
- *      tid_group_list is taken, programmed, and moved to
- *      the tid_full_list list.
- *   2. For all remaining pagesets:
- *      2.1 If the tid_used_list is empty and the tid_group_list
- *          is empty, stop processing pageset and return only
- *          what has been programmed up to this point.
- *      2.2 If the tid_used_list is empty and the tid_group_list
- *          is not empty, move a group from tid_group_list to
- *          tid_used_list.
- *      2.3 For each group is tid_used_group, program as much as
- *          can fit into the group. If the group becomes fully
- *          used, move it to tid_full_list.
- */
-int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo)
-{
-       int ret = 0, need_group = 0, pinned;
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct hfi1_devdata *dd = uctxt->dd;
-       unsigned npages, ngroups, pageidx = 0, pageset_count, npagesets,
-               tididx = 0, mapped, mapped_pages = 0;
-       unsigned long vaddr = tinfo->vaddr;
-       struct page **pages = NULL;
-       u32 *tidlist = NULL;
-       struct tid_pageset *pagesets = NULL;
-
-       /* Get the number of pages the user buffer spans */
-       npages = num_user_pages(vaddr, tinfo->length);
-       if (!npages)
-               return -EINVAL;
-
-       if (npages > uctxt->expected_count) {
-               dd_dev_err(dd, "Expected buffer too big\n");
-               return -EINVAL;
-       }
-
-       /* Verify that access is OK for the user buffer */
-       if (!access_ok(VERIFY_WRITE, (void __user *)vaddr,
-                      npages * PAGE_SIZE)) {
-               dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n",
-                          (void *)vaddr, npages);
-               return -EFAULT;
-       }
-
-       pagesets = kcalloc(uctxt->expected_count, sizeof(*pagesets),
-                          GFP_KERNEL);
-       if (!pagesets)
-               return -ENOMEM;
-
-       /* Allocate the array of struct page pointers needed for pinning */
-       pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
-       if (!pages) {
-               ret = -ENOMEM;
-               goto bail;
-       }
-
-       /*
-        * Pin all the pages of the user buffer. If we can't pin all the
-        * pages, accept the amount pinned so far and program only that.
-        * User space knows how to deal with partially programmed buffers.
-        */
-       if (!hfi1_can_pin_pages(dd, fd->tid_n_pinned, npages)) {
-               ret = -ENOMEM;
-               goto bail;
-       }
-
-       pinned = hfi1_acquire_user_pages(vaddr, npages, true, pages);
-       if (pinned <= 0) {
-               ret = pinned;
-               goto bail;
-       }
-       fd->tid_n_pinned += npages;
-
-       /* Find sets of physically contiguous pages */
-       npagesets = find_phys_blocks(pages, pinned, pagesets);
-
-       /*
-        * We don't need to access this under a lock since tid_used is per
-        * process and the same process cannot be in hfi1_user_exp_rcv_clear()
-        * and hfi1_user_exp_rcv_setup() at the same time.
-        */
-       spin_lock(&fd->tid_lock);
-       if (fd->tid_used + npagesets > fd->tid_limit)
-               pageset_count = fd->tid_limit - fd->tid_used;
-       else
-               pageset_count = npagesets;
-       spin_unlock(&fd->tid_lock);
-
-       if (!pageset_count)
-               goto bail;
-
-       ngroups = pageset_count / dd->rcv_entries.group_size;
-       tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
-       if (!tidlist) {
-               ret = -ENOMEM;
-               goto nomem;
-       }
-
-       tididx = 0;
-
-       /*
-        * From this point on, we are going to be using shared (between master
-        * and subcontexts) context resources. We need to take the lock.
-        */
-       mutex_lock(&uctxt->exp_lock);
-       /*
-        * The first step is to program the RcvArray entries which are complete
-        * groups.
-        */
-       while (ngroups && uctxt->tid_group_list.count) {
-               struct tid_group *grp =
-                       tid_group_pop(&uctxt->tid_group_list);
-
-               ret = program_rcvarray(fp, vaddr, grp, pagesets,
-                                      pageidx, dd->rcv_entries.group_size,
-                                      pages, tidlist, &tididx, &mapped);
-               /*
-                * If there was a failure to program the RcvArray
-                * entries for the entire group, reset the grp fields
-                * and add the grp back to the free group list.
-                */
-               if (ret <= 0) {
-                       tid_group_add_tail(grp, &uctxt->tid_group_list);
-                       hfi1_cdbg(TID,
-                                 "Failed to program RcvArray group %d", ret);
-                       goto unlock;
-               }
-
-               tid_group_add_tail(grp, &uctxt->tid_full_list);
-               ngroups--;
-               pageidx += ret;
-               mapped_pages += mapped;
-       }
-
-       while (pageidx < pageset_count) {
-               struct tid_group *grp, *ptr;
-               /*
-                * If we don't have any partially used tid groups, check
-                * if we have empty groups. If so, take one from there and
-                * put in the partially used list.
-                */
-               if (!uctxt->tid_used_list.count || need_group) {
-                       if (!uctxt->tid_group_list.count)
-                               goto unlock;
-
-                       grp = tid_group_pop(&uctxt->tid_group_list);
-                       tid_group_add_tail(grp, &uctxt->tid_used_list);
-                       need_group = 0;
-               }
-               /*
-                * There is an optimization opportunity here - instead of
-                * fitting as many page sets as we can, check for a group
-                * later on in the list that could fit all of them.
-                */
-               list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
-                                        list) {
-                       unsigned use = min_t(unsigned, pageset_count - pageidx,
-                                            grp->size - grp->used);
-
-                       ret = program_rcvarray(fp, vaddr, grp, pagesets,
-                                              pageidx, use, pages, tidlist,
-                                              &tididx, &mapped);
-                       if (ret < 0) {
-                               hfi1_cdbg(TID,
-                                         "Failed to program RcvArray entries %d",
-                                         ret);
-                               ret = -EFAULT;
-                               goto unlock;
-                       } else if (ret > 0) {
-                               if (grp->used == grp->size)
-                                       tid_group_move(grp,
-                                                      &uctxt->tid_used_list,
-                                                      &uctxt->tid_full_list);
-                               pageidx += ret;
-                               mapped_pages += mapped;
-                               need_group = 0;
-                               /* Check if we are done so we break out early */
-                               if (pageidx >= pageset_count)
-                                       break;
-                       } else if (WARN_ON(ret == 0)) {
-                               /*
-                                * If ret is 0, we did not program any entries
-                                * into this group, which can only happen if
-                                * we've screwed up the accounting somewhere.
-                                * Warn and try to continue.
-                                */
-                               need_group = 1;
-                       }
-               }
-       }
-unlock:
-       mutex_unlock(&uctxt->exp_lock);
-nomem:
-       hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
-                 mapped_pages, ret);
-       if (tididx) {
-               spin_lock(&fd->tid_lock);
-               fd->tid_used += tididx;
-               spin_unlock(&fd->tid_lock);
-               tinfo->tidcnt = tididx;
-               tinfo->length = mapped_pages * PAGE_SIZE;
-
-               if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist,
-                                tidlist, sizeof(tidlist[0]) * tididx)) {
-                       /*
-                        * On failure to copy to the user level, we need to undo
-                        * everything done so far so we don't leak resources.
-                        */
-                       tinfo->tidlist = (unsigned long)&tidlist;
-                       hfi1_user_exp_rcv_clear(fp, tinfo);
-                       tinfo->tidlist = 0;
-                       ret = -EFAULT;
-                       goto bail;
-               }
-       }
-
-       /*
-        * If not everything was mapped (due to insufficient RcvArray entries,
-        * for example), unpin all unmapped pages so we can pin them nex time.
-        */
-       if (mapped_pages != pinned) {
-               hfi1_release_user_pages(current->mm, &pages[mapped_pages],
-                                       pinned - mapped_pages,
-                                       false);
-               fd->tid_n_pinned -= pinned - mapped_pages;
-       }
-bail:
-       kfree(pagesets);
-       kfree(pages);
-       kfree(tidlist);
-       return ret > 0 ? 0 : ret;
-}
-
-int hfi1_user_exp_rcv_clear(struct file *fp, struct hfi1_tid_info *tinfo)
-{
-       int ret = 0;
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       u32 *tidinfo;
-       unsigned tididx;
-
-       tidinfo = kcalloc(tinfo->tidcnt, sizeof(*tidinfo), GFP_KERNEL);
-       if (!tidinfo)
-               return -ENOMEM;
-
-       if (copy_from_user(tidinfo, (void __user *)(unsigned long)
-                          tinfo->tidlist, sizeof(tidinfo[0]) *
-                          tinfo->tidcnt)) {
-               ret = -EFAULT;
-               goto done;
-       }
-
-       mutex_lock(&uctxt->exp_lock);
-       for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
-               ret = unprogram_rcvarray(fp, tidinfo[tididx], NULL);
-               if (ret) {
-                       hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
-                                 ret);
-                       break;
-               }
-       }
-       spin_lock(&fd->tid_lock);
-       fd->tid_used -= tididx;
-       spin_unlock(&fd->tid_lock);
-       tinfo->tidcnt = tididx;
-       mutex_unlock(&uctxt->exp_lock);
-done:
-       kfree(tidinfo);
-       return ret;
-}
-
-int hfi1_user_exp_rcv_invalid(struct file *fp, struct hfi1_tid_info *tinfo)
-{
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       unsigned long *ev = uctxt->dd->events +
-               (((uctxt->ctxt - uctxt->dd->first_user_ctxt) *
-                 HFI1_MAX_SHARED_CTXTS) + fd->subctxt);
-       u32 *array;
-       int ret = 0;
-
-       if (!fd->invalid_tids)
-               return -EINVAL;
-
-       /*
-        * copy_to_user() can sleep, which will leave the invalid_lock
-        * locked and cause the MMU notifier to be blocked on the lock
-        * for a long time.
-        * Copy the data to a local buffer so we can release the lock.
-        */
-       array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL);
-       if (!array)
-               return -EFAULT;
-
-       spin_lock(&fd->invalid_lock);
-       if (fd->invalid_tid_idx) {
-               memcpy(array, fd->invalid_tids, sizeof(*array) *
-                      fd->invalid_tid_idx);
-               memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) *
-                      fd->invalid_tid_idx);
-               tinfo->tidcnt = fd->invalid_tid_idx;
-               fd->invalid_tid_idx = 0;
-               /*
-                * Reset the user flag while still holding the lock.
-                * Otherwise, PSM can miss events.
-                */
-               clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
-       } else {
-               tinfo->tidcnt = 0;
-       }
-       spin_unlock(&fd->invalid_lock);
-
-       if (tinfo->tidcnt) {
-               if (copy_to_user((void __user *)tinfo->tidlist,
-                                array, sizeof(*array) * tinfo->tidcnt))
-                       ret = -EFAULT;
-       }
-       kfree(array);
-
-       return ret;
-}
-
-static u32 find_phys_blocks(struct page **pages, unsigned npages,
-                           struct tid_pageset *list)
-{
-       unsigned pagecount, pageidx, setcount = 0, i;
-       unsigned long pfn, this_pfn;
-
-       if (!npages)
-               return 0;
-
-       /*
-        * Look for sets of physically contiguous pages in the user buffer.
-        * This will allow us to optimize Expected RcvArray entry usage by
-        * using the bigger supported sizes.
-        */
-       pfn = page_to_pfn(pages[0]);
-       for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
-               this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;
-
-               /*
-                * If the pfn's are not sequential, pages are not physically
-                * contiguous.
-                */
-               if (this_pfn != ++pfn) {
-                       /*
-                        * At this point we have to loop over the set of
-                        * physically contiguous pages and break them down it
-                        * sizes supported by the HW.
-                        * There are two main constraints:
-                        *     1. The max buffer size is MAX_EXPECTED_BUFFER.
-                        *        If the total set size is bigger than that
-                        *        program only a MAX_EXPECTED_BUFFER chunk.
-                        *     2. The buffer size has to be a power of two. If
-                        *        it is not, round down to the closes power of
-                        *        2 and program that size.
-                        */
-                       while (pagecount) {
-                               int maxpages = pagecount;
-                               u32 bufsize = pagecount * PAGE_SIZE;
-
-                               if (bufsize > MAX_EXPECTED_BUFFER)
-                                       maxpages =
-                                               MAX_EXPECTED_BUFFER >>
-                                               PAGE_SHIFT;
-                               else if (!is_power_of_2(bufsize))
-                                       maxpages =
-                                               rounddown_pow_of_two(bufsize) >>
-                                               PAGE_SHIFT;
-
-                               list[setcount].idx = pageidx;
-                               list[setcount].count = maxpages;
-                               pagecount -= maxpages;
-                               pageidx += maxpages;
-                               setcount++;
-                       }
-                       pageidx = i;
-                       pagecount = 1;
-                       pfn = this_pfn;
-               } else {
-                       pagecount++;
-               }
-       }
-       return setcount;
-}
-
-/**
- * program_rcvarray() - program an RcvArray group with receive buffers
- * @fp: file pointer
- * @vaddr: starting user virtual address
- * @grp: RcvArray group
- * @sets: array of struct tid_pageset holding information on physically
- *        contiguous chunks from the user buffer
- * @start: starting index into sets array
- * @count: number of struct tid_pageset's to program
- * @pages: an array of struct page * for the user buffer
- * @tidlist: the array of u32 elements when the information about the
- *           programmed RcvArray entries is to be encoded.
- * @tididx: starting offset into tidlist
- * @pmapped: (output parameter) number of pages programmed into the RcvArray
- *           entries.
- *
- * This function will program up to 'count' number of RcvArray entries from the
- * group 'grp'. To make best use of write-combining writes, the function will
- * perform writes to the unused RcvArray entries which will be ignored by the
- * HW. Each RcvArray entry will be programmed with a physically contiguous
- * buffer chunk from the user's virtual buffer.
- *
- * Return:
- * -EINVAL if the requested count is larger than the size of the group,
- * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
- * number of RcvArray entries programmed.
- */
-static int program_rcvarray(struct file *fp, unsigned long vaddr,
-                           struct tid_group *grp,
-                           struct tid_pageset *sets,
-                           unsigned start, u16 count, struct page **pages,
-                           u32 *tidlist, unsigned *tididx, unsigned *pmapped)
-{
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct hfi1_devdata *dd = uctxt->dd;
-       u16 idx;
-       u32 tidinfo = 0, rcventry, useidx = 0;
-       int mapped = 0;
-
-       /* Count should never be larger than the group size */
-       if (count > grp->size)
-               return -EINVAL;
-
-       /* Find the first unused entry in the group */
-       for (idx = 0; idx < grp->size; idx++) {
-               if (!(grp->map & (1 << idx))) {
-                       useidx = idx;
-                       break;
-               }
-               rcv_array_wc_fill(dd, grp->base + idx);
-       }
-
-       idx = 0;
-       while (idx < count) {
-               u16 npages, pageidx, setidx = start + idx;
-               int ret = 0;
-
-               /*
-                * If this entry in the group is used, move to the next one.
-                * If we go past the end of the group, exit the loop.
-                */
-               if (useidx >= grp->size) {
-                       break;
-               } else if (grp->map & (1 << useidx)) {
-                       rcv_array_wc_fill(dd, grp->base + useidx);
-                       useidx++;
-                       continue;
-               }
-
-               rcventry = grp->base + useidx;
-               npages = sets[setidx].count;
-               pageidx = sets[setidx].idx;
-
-               ret = set_rcvarray_entry(fp, vaddr + (pageidx * PAGE_SIZE),
-                                        rcventry, grp, pages + pageidx,
-                                        npages);
-               if (ret)
-                       return ret;
-               mapped += npages;
-
-               tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) |
-                       EXP_TID_SET(LEN, npages);
-               tidlist[(*tididx)++] = tidinfo;
-               grp->used++;
-               grp->map |= 1 << useidx++;
-               idx++;
-       }
-
-       /* Fill the rest of the group with "blank" writes */
-       for (; useidx < grp->size; useidx++)
-               rcv_array_wc_fill(dd, grp->base + useidx);
-       *pmapped = mapped;
-       return idx;
-}
-
-static int set_rcvarray_entry(struct file *fp, unsigned long vaddr,
-                             u32 rcventry, struct tid_group *grp,
-                             struct page **pages, unsigned npages)
-{
-       int ret;
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct tid_rb_node *node;
-       struct hfi1_devdata *dd = uctxt->dd;
-       struct rb_root *root = &fd->tid_rb_root;
-       dma_addr_t phys;
-
-       /*
-        * Allocate the node first so we can handle a potential
-        * failure before we've programmed anything.
-        */
-       node = kzalloc(sizeof(*node) + (sizeof(struct page *) * npages),
-                      GFP_KERNEL);
-       if (!node)
-               return -ENOMEM;
-
-       phys = pci_map_single(dd->pcidev,
-                             __va(page_to_phys(pages[0])),
-                             npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
-       if (dma_mapping_error(&dd->pcidev->dev, phys)) {
-               dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
-                          phys);
-               kfree(node);
-               return -EFAULT;
-       }
-
-       node->mmu.addr = vaddr;
-       node->mmu.len = npages * PAGE_SIZE;
-       node->phys = page_to_phys(pages[0]);
-       node->npages = npages;
-       node->rcventry = rcventry;
-       node->dma_addr = phys;
-       node->grp = grp;
-       node->freed = false;
-       memcpy(node->pages, pages, sizeof(struct page *) * npages);
-
-       if (HFI1_CAP_IS_USET(TID_UNMAP))
-               ret = mmu_rb_insert(root, &node->mmu);
-       else
-               ret = hfi1_mmu_rb_insert(root, &node->mmu);
-
-       if (ret) {
-               hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
-                         node->rcventry, node->mmu.addr, node->phys, ret);
-               pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
-                                PCI_DMA_FROMDEVICE);
-               kfree(node);
-               return -EFAULT;
-       }
-       hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
-       trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
-                              node->mmu.addr, node->phys, phys);
-       return 0;
-}
-
-static int unprogram_rcvarray(struct file *fp, u32 tidinfo,
-                             struct tid_group **grp)
-{
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct hfi1_devdata *dd = uctxt->dd;
-       struct tid_rb_node *node;
-       u8 tidctrl = EXP_TID_GET(tidinfo, CTRL);
-       u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;
-
-       if (tididx >= uctxt->expected_count) {
-               dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
-                          tididx, uctxt->ctxt);
-               return -EINVAL;
-       }
-
-       if (tidctrl == 0x3)
-               return -EINVAL;
-
-       rcventry = tididx + (tidctrl - 1);
-
-       node = fd->entry_to_rb[rcventry];
-       if (!node || node->rcventry != (uctxt->expected_base + rcventry))
-               return -EBADF;
-       if (HFI1_CAP_IS_USET(TID_UNMAP))
-               mmu_rb_remove(&fd->tid_rb_root, &node->mmu, NULL);
-       else
-               hfi1_mmu_rb_remove(&fd->tid_rb_root, &node->mmu);
-
-       if (grp)
-               *grp = node->grp;
-       clear_tid_node(fd, fd->subctxt, node);
-       return 0;
-}
-
-static void clear_tid_node(struct hfi1_filedata *fd, u16 subctxt,
-                          struct tid_rb_node *node)
-{
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct hfi1_devdata *dd = uctxt->dd;
-
-       trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
-                                node->npages, node->mmu.addr, node->phys,
-                                node->dma_addr);
-
-       hfi1_put_tid(dd, node->rcventry, PT_INVALID, 0, 0);
-       /*
-        * Make sure device has seen the write before we unpin the
-        * pages.
-        */
-       flush_wc();
-
-       pci_unmap_single(dd->pcidev, node->dma_addr, node->mmu.len,
-                        PCI_DMA_FROMDEVICE);
-       hfi1_release_user_pages(current->mm, node->pages, node->npages, true);
-       fd->tid_n_pinned -= node->npages;
-
-       node->grp->used--;
-       node->grp->map &= ~(1 << (node->rcventry - node->grp->base));
-
-       if (node->grp->used == node->grp->size - 1)
-               tid_group_move(node->grp, &uctxt->tid_full_list,
-                              &uctxt->tid_used_list);
-       else if (!node->grp->used)
-               tid_group_move(node->grp, &uctxt->tid_used_list,
-                              &uctxt->tid_group_list);
-       kfree(node);
-}
-
-static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
-                           struct exp_tid_set *set, struct rb_root *root)
-{
-       struct tid_group *grp, *ptr;
-       struct hfi1_filedata *fd = container_of(root, struct hfi1_filedata,
-                                               tid_rb_root);
-       int i;
-
-       list_for_each_entry_safe(grp, ptr, &set->list, list) {
-               list_del_init(&grp->list);
-
-               for (i = 0; i < grp->size; i++) {
-                       if (grp->map & (1 << i)) {
-                               u16 rcventry = grp->base + i;
-                               struct tid_rb_node *node;
-
-                               node = fd->entry_to_rb[rcventry -
-                                                         uctxt->expected_base];
-                               if (!node || node->rcventry != rcventry)
-                                       continue;
-                               if (HFI1_CAP_IS_USET(TID_UNMAP))
-                                       mmu_rb_remove(&fd->tid_rb_root,
-                                                     &node->mmu, NULL);
-                               else
-                                       hfi1_mmu_rb_remove(&fd->tid_rb_root,
-                                                          &node->mmu);
-                               clear_tid_node(fd, -1, node);
-                       }
-               }
-       }
-}
-
-static int mmu_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode)
-{
-       struct hfi1_filedata *fdata =
-               container_of(root, struct hfi1_filedata, tid_rb_root);
-       struct hfi1_ctxtdata *uctxt = fdata->uctxt;
-       struct tid_rb_node *node =
-               container_of(mnode, struct tid_rb_node, mmu);
-
-       if (node->freed)
-               return 0;
-
-       trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt, node->mmu.addr,
-                                node->rcventry, node->npages, node->dma_addr);
-       node->freed = true;
-
-       spin_lock(&fdata->invalid_lock);
-       if (fdata->invalid_tid_idx < uctxt->expected_count) {
-               fdata->invalid_tids[fdata->invalid_tid_idx] =
-                       rcventry2tidinfo(node->rcventry - uctxt->expected_base);
-               fdata->invalid_tids[fdata->invalid_tid_idx] |=
-                       EXP_TID_SET(LEN, node->npages);
-               if (!fdata->invalid_tid_idx) {
-                       unsigned long *ev;
-
-                       /*
-                        * hfi1_set_uevent_bits() sets a user event flag
-                        * for all processes. Because calling into the
-                        * driver to process TID cache invalidations is
-                        * expensive and TID cache invalidations are
-                        * handled on a per-process basis, we can
-                        * optimize this to set the flag only for the
-                        * process in question.
-                        */
-                       ev = uctxt->dd->events +
-                               (((uctxt->ctxt - uctxt->dd->first_user_ctxt) *
-                                 HFI1_MAX_SHARED_CTXTS) + fdata->subctxt);
-                       set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
-               }
-               fdata->invalid_tid_idx++;
-       }
-       spin_unlock(&fdata->invalid_lock);
-       return 0;
-}
-
-static int mmu_rb_insert(struct rb_root *root, struct mmu_rb_node *node)
-{
-       struct hfi1_filedata *fdata =
-               container_of(root, struct hfi1_filedata, tid_rb_root);
-       struct tid_rb_node *tnode =
-               container_of(node, struct tid_rb_node, mmu);
-       u32 base = fdata->uctxt->expected_base;
-
-       fdata->entry_to_rb[tnode->rcventry - base] = tnode;
-       return 0;
-}
-
-static void mmu_rb_remove(struct rb_root *root, struct mmu_rb_node *node,
-                         struct mm_struct *mm)
-{
-       struct hfi1_filedata *fdata =
-               container_of(root, struct hfi1_filedata, tid_rb_root);
-       struct tid_rb_node *tnode =
-               container_of(node, struct tid_rb_node, mmu);
-       u32 base = fdata->uctxt->expected_base;
-
-       fdata->entry_to_rb[tnode->rcventry - base] = NULL;
-}
diff --git a/drivers/staging/rdma/hfi1/user_exp_rcv.h b/drivers/staging/rdma/hfi1/user_exp_rcv.h
deleted file mode 100644 (file)
index 9bc8d9f..0000000
+++ /dev/null
@@ -1,79 +0,0 @@
-#ifndef _HFI1_USER_EXP_RCV_H
-#define _HFI1_USER_EXP_RCV_H
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "hfi.h"
-
-#define EXP_TID_TIDLEN_MASK   0x7FFULL
-#define EXP_TID_TIDLEN_SHIFT  0
-#define EXP_TID_TIDCTRL_MASK  0x3ULL
-#define EXP_TID_TIDCTRL_SHIFT 20
-#define EXP_TID_TIDIDX_MASK   0x3FFULL
-#define EXP_TID_TIDIDX_SHIFT  22
-#define EXP_TID_GET(tid, field)        \
-       (((tid) >> EXP_TID_TID##field##_SHIFT) & EXP_TID_TID##field##_MASK)
-
-#define EXP_TID_SET(field, value)                      \
-       (((value) & EXP_TID_TID##field##_MASK) <<       \
-        EXP_TID_TID##field##_SHIFT)
-#define EXP_TID_CLEAR(tid, field) ({                                   \
-               (tid) &= ~(EXP_TID_TID##field##_MASK <<                 \
-                          EXP_TID_TID##field##_SHIFT);                 \
-               })
-#define EXP_TID_RESET(tid, field, value) do {                          \
-               EXP_TID_CLEAR(tid, field);                              \
-               (tid) |= EXP_TID_SET(field, (value));                   \
-       } while (0)
-
-int hfi1_user_exp_rcv_init(struct file *);
-int hfi1_user_exp_rcv_free(struct hfi1_filedata *);
-int hfi1_user_exp_rcv_setup(struct file *, struct hfi1_tid_info *);
-int hfi1_user_exp_rcv_clear(struct file *, struct hfi1_tid_info *);
-int hfi1_user_exp_rcv_invalid(struct file *, struct hfi1_tid_info *);
-
-#endif /* _HFI1_USER_EXP_RCV_H */
diff --git a/drivers/staging/rdma/hfi1/user_pages.c b/drivers/staging/rdma/hfi1/user_pages.c
deleted file mode 100644 (file)
index 88e10b5..0000000
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/device.h>
-#include <linux/module.h>
-
-#include "hfi.h"
-
-static unsigned long cache_size = 256;
-module_param(cache_size, ulong, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(cache_size, "Send and receive side cache size limit (in MB)");
-
-/*
- * Determine whether the caller can pin pages.
- *
- * This function should be used in the implementation of buffer caches.
- * The cache implementation should call this function prior to attempting
- * to pin buffer pages in order to determine whether they should do so.
- * The function computes cache limits based on the configured ulimit and
- * cache size. Use of this function is especially important for caches
- * which are not limited in any other way (e.g. by HW resources) and, thus,
- * could keeping caching buffers.
- *
- */
-bool hfi1_can_pin_pages(struct hfi1_devdata *dd, u32 nlocked, u32 npages)
-{
-       unsigned long ulimit = rlimit(RLIMIT_MEMLOCK), pinned, cache_limit,
-               size = (cache_size * (1UL << 20)); /* convert to bytes */
-       unsigned usr_ctxts = dd->num_rcv_contexts - dd->first_user_ctxt;
-       bool can_lock = capable(CAP_IPC_LOCK);
-
-       /*
-        * Calculate per-cache size. The calculation below uses only a quarter
-        * of the available per-context limit. This leaves space for other
-        * pinning. Should we worry about shared ctxts?
-        */
-       cache_limit = (ulimit / usr_ctxts) / 4;
-
-       /* If ulimit isn't set to "unlimited" and is smaller than cache_size. */
-       if (ulimit != (-1UL) && size > cache_limit)
-               size = cache_limit;
-
-       /* Convert to number of pages */
-       size = DIV_ROUND_UP(size, PAGE_SIZE);
-
-       down_read(&current->mm->mmap_sem);
-       pinned = current->mm->pinned_vm;
-       up_read(&current->mm->mmap_sem);
-
-       /* First, check the absolute limit against all pinned pages. */
-       if (pinned + npages >= ulimit && !can_lock)
-               return false;
-
-       return ((nlocked + npages) <= size) || can_lock;
-}
-
-int hfi1_acquire_user_pages(unsigned long vaddr, size_t npages, bool writable,
-                           struct page **pages)
-{
-       int ret;
-
-       ret = get_user_pages_fast(vaddr, npages, writable, pages);
-       if (ret < 0)
-               return ret;
-
-       down_write(&current->mm->mmap_sem);
-       current->mm->pinned_vm += ret;
-       up_write(&current->mm->mmap_sem);
-
-       return ret;
-}
-
-void hfi1_release_user_pages(struct mm_struct *mm, struct page **p,
-                            size_t npages, bool dirty)
-{
-       size_t i;
-
-       for (i = 0; i < npages; i++) {
-               if (dirty)
-                       set_page_dirty_lock(p[i]);
-               put_page(p[i]);
-       }
-
-       if (mm) { /* during close after signal, mm can be NULL */
-               down_write(&mm->mmap_sem);
-               mm->pinned_vm -= npages;
-               up_write(&mm->mmap_sem);
-       }
-}
diff --git a/drivers/staging/rdma/hfi1/user_sdma.c b/drivers/staging/rdma/hfi1/user_sdma.c
deleted file mode 100644 (file)
index 0014c9c..0000000
+++ /dev/null
@@ -1,1623 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#include <linux/mm.h>
-#include <linux/types.h>
-#include <linux/device.h>
-#include <linux/dmapool.h>
-#include <linux/slab.h>
-#include <linux/list.h>
-#include <linux/highmem.h>
-#include <linux/io.h>
-#include <linux/uio.h>
-#include <linux/rbtree.h>
-#include <linux/spinlock.h>
-#include <linux/delay.h>
-#include <linux/kthread.h>
-#include <linux/mmu_context.h>
-#include <linux/module.h>
-#include <linux/vmalloc.h>
-
-#include "hfi.h"
-#include "sdma.h"
-#include "user_sdma.h"
-#include "verbs.h"  /* for the headers */
-#include "common.h" /* for struct hfi1_tid_info */
-#include "trace.h"
-#include "mmu_rb.h"
-
-static uint hfi1_sdma_comp_ring_size = 128;
-module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
-MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");
-
-/* The maximum number of Data io vectors per message/request */
-#define MAX_VECTORS_PER_REQ 8
-/*
- * Maximum number of packet to send from each message/request
- * before moving to the next one.
- */
-#define MAX_PKTS_PER_QUEUE 16
-
-#define num_pages(x) (1 + ((((x) - 1) & PAGE_MASK) >> PAGE_SHIFT))
-
-#define req_opcode(x) \
-       (((x) >> HFI1_SDMA_REQ_OPCODE_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK)
-#define req_version(x) \
-       (((x) >> HFI1_SDMA_REQ_VERSION_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK)
-#define req_iovcnt(x) \
-       (((x) >> HFI1_SDMA_REQ_IOVCNT_SHIFT) & HFI1_SDMA_REQ_IOVCNT_MASK)
-
-/* Number of BTH.PSN bits used for sequence number in expected rcvs */
-#define BTH_SEQ_MASK 0x7ffull
-
-/*
- * Define fields in the KDETH header so we can update the header
- * template.
- */
-#define KDETH_OFFSET_SHIFT        0
-#define KDETH_OFFSET_MASK         0x7fff
-#define KDETH_OM_SHIFT            15
-#define KDETH_OM_MASK             0x1
-#define KDETH_TID_SHIFT           16
-#define KDETH_TID_MASK            0x3ff
-#define KDETH_TIDCTRL_SHIFT       26
-#define KDETH_TIDCTRL_MASK        0x3
-#define KDETH_INTR_SHIFT          28
-#define KDETH_INTR_MASK           0x1
-#define KDETH_SH_SHIFT            29
-#define KDETH_SH_MASK             0x1
-#define KDETH_HCRC_UPPER_SHIFT    16
-#define KDETH_HCRC_UPPER_MASK     0xff
-#define KDETH_HCRC_LOWER_SHIFT    24
-#define KDETH_HCRC_LOWER_MASK     0xff
-
-#define PBC2LRH(x) ((((x) & 0xfff) << 2) - 4)
-#define LRH2PBC(x) ((((x) >> 2) + 1) & 0xfff)
-
-#define KDETH_GET(val, field)                                          \
-       (((le32_to_cpu((val))) >> KDETH_##field##_SHIFT) & KDETH_##field##_MASK)
-#define KDETH_SET(dw, field, val) do {                                 \
-               u32 dwval = le32_to_cpu(dw);                            \
-               dwval &= ~(KDETH_##field##_MASK << KDETH_##field##_SHIFT); \
-               dwval |= (((val) & KDETH_##field##_MASK) << \
-                         KDETH_##field##_SHIFT);                       \
-               dw = cpu_to_le32(dwval);                                \
-       } while (0)
-
-#define AHG_HEADER_SET(arr, idx, dw, bit, width, value)                        \
-       do {                                                            \
-               if ((idx) < ARRAY_SIZE((arr)))                          \
-                       (arr)[(idx++)] = sdma_build_ahg_descriptor(     \
-                               (__force u16)(value), (dw), (bit),      \
-                                                       (width));       \
-               else                                                    \
-                       return -ERANGE;                                 \
-       } while (0)
-
-/* KDETH OM multipliers and switch over point */
-#define KDETH_OM_SMALL     4
-#define KDETH_OM_LARGE     64
-#define KDETH_OM_MAX_SIZE  (1 << ((KDETH_OM_LARGE / KDETH_OM_SMALL) + 1))
-
-/* Last packet in the request */
-#define TXREQ_FLAGS_REQ_LAST_PKT BIT(0)
-
-#define SDMA_REQ_IN_USE     0
-#define SDMA_REQ_FOR_THREAD 1
-#define SDMA_REQ_SEND_DONE  2
-#define SDMA_REQ_HAVE_AHG   3
-#define SDMA_REQ_HAS_ERROR  4
-#define SDMA_REQ_DONE_ERROR 5
-
-#define SDMA_PKT_Q_INACTIVE BIT(0)
-#define SDMA_PKT_Q_ACTIVE   BIT(1)
-#define SDMA_PKT_Q_DEFERRED BIT(2)
-
-/*
- * Maximum retry attempts to submit a TX request
- * before putting the process to sleep.
- */
-#define MAX_DEFER_RETRY_COUNT 1
-
-static unsigned initial_pkt_count = 8;
-
-#define SDMA_IOWAIT_TIMEOUT 1000 /* in milliseconds */
-
-struct user_sdma_iovec {
-       struct list_head list;
-       struct iovec iov;
-       /* number of pages in this vector */
-       unsigned npages;
-       /* array of pinned pages for this vector */
-       struct page **pages;
-       /*
-        * offset into the virtual address space of the vector at
-        * which we last left off.
-        */
-       u64 offset;
-};
-
-#define SDMA_CACHE_NODE_EVICT BIT(0)
-
-struct sdma_mmu_node {
-       struct mmu_rb_node rb;
-       struct list_head list;
-       struct hfi1_user_sdma_pkt_q *pq;
-       atomic_t refcount;
-       struct page **pages;
-       unsigned npages;
-       unsigned long flags;
-};
-
-struct user_sdma_request {
-       struct sdma_req_info info;
-       struct hfi1_user_sdma_pkt_q *pq;
-       struct hfi1_user_sdma_comp_q *cq;
-       /* This is the original header from user space */
-       struct hfi1_pkt_header hdr;
-       /*
-        * Pointer to the SDMA engine for this request.
-        * Since different request could be on different VLs,
-        * each request will need it's own engine pointer.
-        */
-       struct sdma_engine *sde;
-       u8 ahg_idx;
-       u32 ahg[9];
-       /*
-        * KDETH.Offset (Eager) field
-        * We need to remember the initial value so the headers
-        * can be updated properly.
-        */
-       u32 koffset;
-       /*
-        * KDETH.OFFSET (TID) field
-        * The offset can cover multiple packets, depending on the
-        * size of the TID entry.
-        */
-       u32 tidoffset;
-       /*
-        * KDETH.OM
-        * Remember this because the header template always sets it
-        * to 0.
-        */
-       u8 omfactor;
-       /*
-        * We copy the iovs for this request (based on
-        * info.iovcnt). These are only the data vectors
-        */
-       unsigned data_iovs;
-       /* total length of the data in the request */
-       u32 data_len;
-       /* progress index moving along the iovs array */
-       unsigned iov_idx;
-       struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ];
-       /* number of elements copied to the tids array */
-       u16 n_tids;
-       /* TID array values copied from the tid_iov vector */
-       u32 *tids;
-       u16 tididx;
-       u32 sent;
-       u64 seqnum;
-       u64 seqcomp;
-       u64 seqsubmitted;
-       struct list_head txps;
-       unsigned long flags;
-       /* status of the last txreq completed */
-       int status;
-};
-
-/*
- * A single txreq could span up to 3 physical pages when the MTU
- * is sufficiently large (> 4K). Each of the IOV pointers also
- * needs it's own set of flags so the vector has been handled
- * independently of each other.
- */
-struct user_sdma_txreq {
-       /* Packet header for the txreq */
-       struct hfi1_pkt_header hdr;
-       struct sdma_txreq txreq;
-       struct list_head list;
-       struct user_sdma_request *req;
-       u16 flags;
-       unsigned busycount;
-       u64 seqnum;
-};
-
-#define SDMA_DBG(req, fmt, ...)                                     \
-       hfi1_cdbg(SDMA, "[%u:%u:%u:%u] " fmt, (req)->pq->dd->unit, \
-                (req)->pq->ctxt, (req)->pq->subctxt, (req)->info.comp_idx, \
-                ##__VA_ARGS__)
-#define SDMA_Q_DBG(pq, fmt, ...)                        \
-       hfi1_cdbg(SDMA, "[%u:%u:%u] " fmt, (pq)->dd->unit, (pq)->ctxt, \
-                (pq)->subctxt, ##__VA_ARGS__)
-
-static int user_sdma_send_pkts(struct user_sdma_request *, unsigned);
-static int num_user_pages(const struct iovec *);
-static void user_sdma_txreq_cb(struct sdma_txreq *, int);
-static inline void pq_update(struct hfi1_user_sdma_pkt_q *);
-static void user_sdma_free_request(struct user_sdma_request *, bool);
-static int pin_vector_pages(struct user_sdma_request *,
-                           struct user_sdma_iovec *);
-static void unpin_vector_pages(struct mm_struct *, struct page **, unsigned,
-                              unsigned);
-static int check_header_template(struct user_sdma_request *,
-                                struct hfi1_pkt_header *, u32, u32);
-static int set_txreq_header(struct user_sdma_request *,
-                           struct user_sdma_txreq *, u32);
-static int set_txreq_header_ahg(struct user_sdma_request *,
-                               struct user_sdma_txreq *, u32);
-static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *,
-                                 struct hfi1_user_sdma_comp_q *,
-                                 u16, enum hfi1_sdma_comp_state, int);
-static inline u32 set_pkt_bth_psn(__be32, u8, u32);
-static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);
-
-static int defer_packet_queue(
-       struct sdma_engine *,
-       struct iowait *,
-       struct sdma_txreq *,
-       unsigned seq);
-static void activate_packet_queue(struct iowait *, int);
-static bool sdma_rb_filter(struct mmu_rb_node *, unsigned long, unsigned long);
-static int sdma_rb_insert(struct rb_root *, struct mmu_rb_node *);
-static void sdma_rb_remove(struct rb_root *, struct mmu_rb_node *,
-                          struct mm_struct *);
-static int sdma_rb_invalidate(struct rb_root *, struct mmu_rb_node *);
-
-static struct mmu_rb_ops sdma_rb_ops = {
-       .filter = sdma_rb_filter,
-       .insert = sdma_rb_insert,
-       .remove = sdma_rb_remove,
-       .invalidate = sdma_rb_invalidate
-};
-
-static int defer_packet_queue(
-       struct sdma_engine *sde,
-       struct iowait *wait,
-       struct sdma_txreq *txreq,
-       unsigned seq)
-{
-       struct hfi1_user_sdma_pkt_q *pq =
-               container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
-       struct hfi1_ibdev *dev = &pq->dd->verbs_dev;
-       struct user_sdma_txreq *tx =
-               container_of(txreq, struct user_sdma_txreq, txreq);
-
-       if (sdma_progress(sde, seq, txreq)) {
-               if (tx->busycount++ < MAX_DEFER_RETRY_COUNT)
-                       goto eagain;
-       }
-       /*
-        * We are assuming that if the list is enqueued somewhere, it
-        * is to the dmawait list since that is the only place where
-        * it is supposed to be enqueued.
-        */
-       xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
-       write_seqlock(&dev->iowait_lock);
-       if (list_empty(&pq->busy.list))
-               list_add_tail(&pq->busy.list, &sde->dmawait);
-       write_sequnlock(&dev->iowait_lock);
-       return -EBUSY;
-eagain:
-       return -EAGAIN;
-}
-
-static void activate_packet_queue(struct iowait *wait, int reason)
-{
-       struct hfi1_user_sdma_pkt_q *pq =
-               container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
-       xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
-       wake_up(&wait->wait_dma);
-};
-
-static void sdma_kmem_cache_ctor(void *obj)
-{
-       struct user_sdma_txreq *tx = obj;
-
-       memset(tx, 0, sizeof(*tx));
-}
-
-int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp)
-{
-       struct hfi1_filedata *fd;
-       int ret = 0;
-       unsigned memsize;
-       char buf[64];
-       struct hfi1_devdata *dd;
-       struct hfi1_user_sdma_comp_q *cq;
-       struct hfi1_user_sdma_pkt_q *pq;
-       unsigned long flags;
-
-       if (!uctxt || !fp) {
-               ret = -EBADF;
-               goto done;
-       }
-
-       fd = fp->private_data;
-
-       if (!hfi1_sdma_comp_ring_size) {
-               ret = -EINVAL;
-               goto done;
-       }
-
-       dd = uctxt->dd;
-
-       pq = kzalloc(sizeof(*pq), GFP_KERNEL);
-       if (!pq)
-               goto pq_nomem;
-
-       memsize = sizeof(*pq->reqs) * hfi1_sdma_comp_ring_size;
-       pq->reqs = kzalloc(memsize, GFP_KERNEL);
-       if (!pq->reqs)
-               goto pq_reqs_nomem;
-
-       INIT_LIST_HEAD(&pq->list);
-       pq->dd = dd;
-       pq->ctxt = uctxt->ctxt;
-       pq->subctxt = fd->subctxt;
-       pq->n_max_reqs = hfi1_sdma_comp_ring_size;
-       pq->state = SDMA_PKT_Q_INACTIVE;
-       atomic_set(&pq->n_reqs, 0);
-       init_waitqueue_head(&pq->wait);
-       pq->sdma_rb_root = RB_ROOT;
-       INIT_LIST_HEAD(&pq->evict);
-       spin_lock_init(&pq->evict_lock);
-
-       iowait_init(&pq->busy, 0, NULL, defer_packet_queue,
-                   activate_packet_queue, NULL);
-       pq->reqidx = 0;
-       snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
-                fd->subctxt);
-       pq->txreq_cache = kmem_cache_create(buf,
-                              sizeof(struct user_sdma_txreq),
-                                           L1_CACHE_BYTES,
-                                           SLAB_HWCACHE_ALIGN,
-                                           sdma_kmem_cache_ctor);
-       if (!pq->txreq_cache) {
-               dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
-                          uctxt->ctxt);
-               goto pq_txreq_nomem;
-       }
-       fd->pq = pq;
-       cq = kzalloc(sizeof(*cq), GFP_KERNEL);
-       if (!cq)
-               goto cq_nomem;
-
-       memsize = PAGE_ALIGN(sizeof(*cq->comps) * hfi1_sdma_comp_ring_size);
-       cq->comps = vmalloc_user(memsize);
-       if (!cq->comps)
-               goto cq_comps_nomem;
-
-       cq->nentries = hfi1_sdma_comp_ring_size;
-       fd->cq = cq;
-
-       ret = hfi1_mmu_rb_register(&pq->sdma_rb_root, &sdma_rb_ops);
-       if (ret) {
-               dd_dev_err(dd, "Failed to register with MMU %d", ret);
-               goto done;
-       }
-
-       spin_lock_irqsave(&uctxt->sdma_qlock, flags);
-       list_add(&pq->list, &uctxt->sdma_queues);
-       spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
-       goto done;
-
-cq_comps_nomem:
-       kfree(cq);
-cq_nomem:
-       kmem_cache_destroy(pq->txreq_cache);
-pq_txreq_nomem:
-       kfree(pq->reqs);
-pq_reqs_nomem:
-       kfree(pq);
-       fd->pq = NULL;
-pq_nomem:
-       ret = -ENOMEM;
-done:
-       return ret;
-}
-
-int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd)
-{
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct hfi1_user_sdma_pkt_q *pq;
-       unsigned long flags;
-
-       hfi1_cdbg(SDMA, "[%u:%u:%u] Freeing user SDMA queues", uctxt->dd->unit,
-                 uctxt->ctxt, fd->subctxt);
-       pq = fd->pq;
-       hfi1_mmu_rb_unregister(&pq->sdma_rb_root);
-       if (pq) {
-               spin_lock_irqsave(&uctxt->sdma_qlock, flags);
-               if (!list_empty(&pq->list))
-                       list_del_init(&pq->list);
-               spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
-               iowait_sdma_drain(&pq->busy);
-               /* Wait until all requests have been freed. */
-               wait_event_interruptible(
-                       pq->wait,
-                       (ACCESS_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE));
-               kfree(pq->reqs);
-               kmem_cache_destroy(pq->txreq_cache);
-               kfree(pq);
-               fd->pq = NULL;
-       }
-       if (fd->cq) {
-               vfree(fd->cq->comps);
-               kfree(fd->cq);
-               fd->cq = NULL;
-       }
-       return 0;
-}
-
-int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
-                                  unsigned long dim, unsigned long *count)
-{
-       int ret = 0, i = 0;
-       struct hfi1_filedata *fd = fp->private_data;
-       struct hfi1_ctxtdata *uctxt = fd->uctxt;
-       struct hfi1_user_sdma_pkt_q *pq = fd->pq;
-       struct hfi1_user_sdma_comp_q *cq = fd->cq;
-       struct hfi1_devdata *dd = pq->dd;
-       unsigned long idx = 0;
-       u8 pcount = initial_pkt_count;
-       struct sdma_req_info info;
-       struct user_sdma_request *req;
-       u8 opcode, sc, vl;
-
-       if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
-               hfi1_cdbg(
-                  SDMA,
-                  "[%u:%u:%u] First vector not big enough for header %lu/%lu",
-                  dd->unit, uctxt->ctxt, fd->subctxt,
-                  iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
-               return -EINVAL;
-       }
-       ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
-       if (ret) {
-               hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
-                         dd->unit, uctxt->ctxt, fd->subctxt, ret);
-               return -EFAULT;
-       }
-
-       trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
-                                    (u16 *)&info);
-       if (cq->comps[info.comp_idx].status == QUEUED ||
-           test_bit(SDMA_REQ_IN_USE, &pq->reqs[info.comp_idx].flags)) {
-               hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in QUEUED state",
-                         dd->unit, uctxt->ctxt, fd->subctxt,
-                         info.comp_idx);
-               return -EBADSLT;
-       }
-       if (!info.fragsize) {
-               hfi1_cdbg(SDMA,
-                         "[%u:%u:%u:%u] Request does not specify fragsize",
-                         dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
-               return -EINVAL;
-       }
-       /*
-        * We've done all the safety checks that we can up to this point,
-        * "allocate" the request entry.
-        */
-       hfi1_cdbg(SDMA, "[%u:%u:%u] Using req/comp entry %u\n", dd->unit,
-                 uctxt->ctxt, fd->subctxt, info.comp_idx);
-       req = pq->reqs + info.comp_idx;
-       memset(req, 0, sizeof(*req));
-       /* Mark the request as IN_USE before we start filling it in. */
-       set_bit(SDMA_REQ_IN_USE, &req->flags);
-       req->data_iovs = req_iovcnt(info.ctrl) - 1;
-       req->pq = pq;
-       req->cq = cq;
-       req->status = -1;
-       INIT_LIST_HEAD(&req->txps);
-
-       memcpy(&req->info, &info, sizeof(info));
-
-       if (req_opcode(info.ctrl) == EXPECTED)
-               req->data_iovs--;
-
-       if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
-               SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
-                        MAX_VECTORS_PER_REQ);
-               return -EINVAL;
-       }
-       /* Copy the header from the user buffer */
-       ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
-                            sizeof(req->hdr));
-       if (ret) {
-               SDMA_DBG(req, "Failed to copy header template (%d)", ret);
-               ret = -EFAULT;
-               goto free_req;
-       }
-
-       /* If Static rate control is not enabled, sanitize the header. */
-       if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
-               req->hdr.pbc[2] = 0;
-
-       /* Validate the opcode. Do not trust packets from user space blindly. */
-       opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
-       if ((opcode & USER_OPCODE_CHECK_MASK) !=
-            USER_OPCODE_CHECK_VAL) {
-               SDMA_DBG(req, "Invalid opcode (%d)", opcode);
-               ret = -EINVAL;
-               goto free_req;
-       }
-       /*
-        * Validate the vl. Do not trust packets from user space blindly.
-        * VL comes from PBC, SC comes from LRH, and the VL needs to
-        * match the SC look up.
-        */
-       vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
-       sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
-             (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
-       if (vl >= dd->pport->vls_operational ||
-           vl != sc_to_vlt(dd, sc)) {
-               SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
-               ret = -EINVAL;
-               goto free_req;
-       }
-
-       /* Checking P_KEY for requests from user-space */
-       if (egress_pkey_check(dd->pport, req->hdr.lrh, req->hdr.bth, sc,
-                             PKEY_CHECK_INVALID)) {
-               ret = -EINVAL;
-               goto free_req;
-       }
-
-       /*
-        * Also should check the BTH.lnh. If it says the next header is GRH then
-        * the RXE parsing will be off and will land in the middle of the KDETH
-        * or miss it entirely.
-        */
-       if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
-               SDMA_DBG(req, "User tried to pass in a GRH");
-               ret = -EINVAL;
-               goto free_req;
-       }
-
-       req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
-       /*
-        * Calculate the initial TID offset based on the values of
-        * KDETH.OFFSET and KDETH.OM that are passed in.
-        */
-       req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
-               (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
-                KDETH_OM_LARGE : KDETH_OM_SMALL);
-       SDMA_DBG(req, "Initial TID offset %u", req->tidoffset);
-       idx++;
-
-       /* Save all the IO vector structures */
-       while (i < req->data_iovs) {
-               INIT_LIST_HEAD(&req->iovs[i].list);
-               memcpy(&req->iovs[i].iov, iovec + idx++, sizeof(struct iovec));
-               ret = pin_vector_pages(req, &req->iovs[i]);
-               if (ret) {
-                       req->status = ret;
-                       goto free_req;
-               }
-               req->data_len += req->iovs[i++].iov.iov_len;
-       }
-       SDMA_DBG(req, "total data length %u", req->data_len);
-
-       if (pcount > req->info.npkts)
-               pcount = req->info.npkts;
-       /*
-        * Copy any TID info
-        * User space will provide the TID info only when the
-        * request type is EXPECTED. This is true even if there is
-        * only one packet in the request and the header is already
-        * setup. The reason for the singular TID case is that the
-        * driver needs to perform safety checks.
-        */
-       if (req_opcode(req->info.ctrl) == EXPECTED) {
-               u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
-
-               if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
-                       ret = -EINVAL;
-                       goto free_req;
-               }
-               req->tids = kcalloc(ntids, sizeof(*req->tids), GFP_KERNEL);
-               if (!req->tids) {
-                       ret = -ENOMEM;
-                       goto free_req;
-               }
-               /*
-                * We have to copy all of the tids because they may vary
-                * in size and, therefore, the TID count might not be
-                * equal to the pkt count. However, there is no way to
-                * tell at this point.
-                */
-               ret = copy_from_user(req->tids, iovec[idx].iov_base,
-                                    ntids * sizeof(*req->tids));
-               if (ret) {
-                       SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
-                                ntids, ret);
-                       ret = -EFAULT;
-                       goto free_req;
-               }
-               req->n_tids = ntids;
-               idx++;
-       }
-
-       /* Have to select the engine */
-       req->sde = sdma_select_engine_vl(dd,
-                                        (u32)(uctxt->ctxt + fd->subctxt),
-                                        vl);
-       if (!req->sde || !sdma_running(req->sde)) {
-               ret = -ECOMM;
-               goto free_req;
-       }
-
-       /* We don't need an AHG entry if the request contains only one packet */
-       if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG)) {
-               int ahg = sdma_ahg_alloc(req->sde);
-
-               if (likely(ahg >= 0)) {
-                       req->ahg_idx = (u8)ahg;
-                       set_bit(SDMA_REQ_HAVE_AHG, &req->flags);
-               }
-       }
-
-       set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
-       atomic_inc(&pq->n_reqs);
-       /* Send the first N packets in the request to buy us some time */
-       ret = user_sdma_send_pkts(req, pcount);
-       if (unlikely(ret < 0 && ret != -EBUSY)) {
-               req->status = ret;
-               goto free_req;
-       }
-
-       /*
-        * It is possible that the SDMA engine would have processed all the
-        * submitted packets by the time we get here. Therefore, only set
-        * packet queue state to ACTIVE if there are still uncompleted
-        * requests.
-        */
-       if (atomic_read(&pq->n_reqs))
-               xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
-
-       /*
-        * This is a somewhat blocking send implementation.
-        * The driver will block the caller until all packets of the
-        * request have been submitted to the SDMA engine. However, it
-        * will not wait for send completions.
-        */
-       while (!test_bit(SDMA_REQ_SEND_DONE, &req->flags)) {
-               ret = user_sdma_send_pkts(req, pcount);
-               if (ret < 0) {
-                       if (ret != -EBUSY) {
-                               req->status = ret;
-                               set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
-                               if (ACCESS_ONCE(req->seqcomp) ==
-                                   req->seqsubmitted - 1)
-                                       goto free_req;
-                               return ret;
-                       }
-                       wait_event_interruptible_timeout(
-                               pq->busy.wait_dma,
-                               (pq->state == SDMA_PKT_Q_ACTIVE),
-                               msecs_to_jiffies(
-                                       SDMA_IOWAIT_TIMEOUT));
-               }
-       }
-       *count += idx;
-       return 0;
-free_req:
-       user_sdma_free_request(req, true);
-       pq_update(pq);
-       set_comp_state(pq, cq, info.comp_idx, ERROR, req->status);
-       return ret;
-}
-
-static inline u32 compute_data_length(struct user_sdma_request *req,
-                                     struct user_sdma_txreq *tx)
-{
-       /*
-        * Determine the proper size of the packet data.
-        * The size of the data of the first packet is in the header
-        * template. However, it includes the header and ICRC, which need
-        * to be subtracted.
-        * The size of the remaining packets is the minimum of the frag
-        * size (MTU) or remaining data in the request.
-        */
-       u32 len;
-
-       if (!req->seqnum) {
-               len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
-                      (sizeof(tx->hdr) - 4));
-       } else if (req_opcode(req->info.ctrl) == EXPECTED) {
-               u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
-                       PAGE_SIZE;
-               /*
-                * Get the data length based on the remaining space in the
-                * TID pair.
-                */
-               len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
-               /* If we've filled up the TID pair, move to the next one. */
-               if (unlikely(!len) && ++req->tididx < req->n_tids &&
-                   req->tids[req->tididx]) {
-                       tidlen = EXP_TID_GET(req->tids[req->tididx],
-                                            LEN) * PAGE_SIZE;
-                       req->tidoffset = 0;
-                       len = min_t(u32, tidlen, req->info.fragsize);
-               }
-               /*
-                * Since the TID pairs map entire pages, make sure that we
-                * are not going to try to send more data that we have
-                * remaining.
-                */
-               len = min(len, req->data_len - req->sent);
-       } else {
-               len = min(req->data_len - req->sent, (u32)req->info.fragsize);
-       }
-       SDMA_DBG(req, "Data Length = %u", len);
-       return len;
-}
-
-static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
-{
-       /* (Size of complete header - size of PBC) + 4B ICRC + data length */
-       return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
-}
-
-static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
-{
-       int ret = 0;
-       unsigned npkts = 0;
-       struct user_sdma_txreq *tx = NULL;
-       struct hfi1_user_sdma_pkt_q *pq = NULL;
-       struct user_sdma_iovec *iovec = NULL;
-
-       if (!req->pq)
-               return -EINVAL;
-
-       pq = req->pq;
-
-       /* If tx completion has reported an error, we are done. */
-       if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) {
-               set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
-               return -EFAULT;
-       }
-
-       /*
-        * Check if we might have sent the entire request already
-        */
-       if (unlikely(req->seqnum == req->info.npkts)) {
-               if (!list_empty(&req->txps))
-                       goto dosend;
-               return ret;
-       }
-
-       if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
-               maxpkts = req->info.npkts - req->seqnum;
-
-       while (npkts < maxpkts) {
-               u32 datalen = 0, queued = 0, data_sent = 0;
-               u64 iov_offset = 0;
-
-               /*
-                * Check whether any of the completions have come back
-                * with errors. If so, we are not going to process any
-                * more packets from this request.
-                */
-               if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) {
-                       set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
-                       return -EFAULT;
-               }
-
-               tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
-               if (!tx)
-                       return -ENOMEM;
-
-               tx->flags = 0;
-               tx->req = req;
-               tx->busycount = 0;
-               INIT_LIST_HEAD(&tx->list);
-
-               if (req->seqnum == req->info.npkts - 1)
-                       tx->flags |= TXREQ_FLAGS_REQ_LAST_PKT;
-
-               /*
-                * Calculate the payload size - this is min of the fragment
-                * (MTU) size or the remaining bytes in the request but only
-                * if we have payload data.
-                */
-               if (req->data_len) {
-                       iovec = &req->iovs[req->iov_idx];
-                       if (ACCESS_ONCE(iovec->offset) == iovec->iov.iov_len) {
-                               if (++req->iov_idx == req->data_iovs) {
-                                       ret = -EFAULT;
-                                       goto free_txreq;
-                               }
-                               iovec = &req->iovs[req->iov_idx];
-                               WARN_ON(iovec->offset);
-                       }
-
-                       datalen = compute_data_length(req, tx);
-                       if (!datalen) {
-                               SDMA_DBG(req,
-                                        "Request has data but pkt len is 0");
-                               ret = -EFAULT;
-                               goto free_tx;
-                       }
-               }
-
-               if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags)) {
-                       if (!req->seqnum) {
-                               u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
-                               u32 lrhlen = get_lrh_len(req->hdr, datalen);
-                               /*
-                                * Copy the request header into the tx header
-                                * because the HW needs a cacheline-aligned
-                                * address.
-                                * This copy can be optimized out if the hdr
-                                * member of user_sdma_request were also
-                                * cacheline aligned.
-                                */
-                               memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
-                               if (PBC2LRH(pbclen) != lrhlen) {
-                                       pbclen = (pbclen & 0xf000) |
-                                               LRH2PBC(lrhlen);
-                                       tx->hdr.pbc[0] = cpu_to_le16(pbclen);
-                               }
-                               ret = sdma_txinit_ahg(&tx->txreq,
-                                                     SDMA_TXREQ_F_AHG_COPY,
-                                                     sizeof(tx->hdr) + datalen,
-                                                     req->ahg_idx, 0, NULL, 0,
-                                                     user_sdma_txreq_cb);
-                               if (ret)
-                                       goto free_tx;
-                               ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq,
-                                                       &tx->hdr,
-                                                       sizeof(tx->hdr));
-                               if (ret)
-                                       goto free_txreq;
-                       } else {
-                               int changes;
-
-                               changes = set_txreq_header_ahg(req, tx,
-                                                              datalen);
-                               if (changes < 0)
-                                       goto free_tx;
-                               sdma_txinit_ahg(&tx->txreq,
-                                               SDMA_TXREQ_F_USE_AHG,
-                                               datalen, req->ahg_idx, changes,
-                                               req->ahg, sizeof(req->hdr),
-                                               user_sdma_txreq_cb);
-                       }
-               } else {
-                       ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
-                                         datalen, user_sdma_txreq_cb);
-                       if (ret)
-                               goto free_tx;
-                       /*
-                        * Modify the header for this packet. This only needs
-                        * to be done if we are not going to use AHG. Otherwise,
-                        * the HW will do it based on the changes we gave it
-                        * during sdma_txinit_ahg().
-                        */
-                       ret = set_txreq_header(req, tx, datalen);
-                       if (ret)
-                               goto free_txreq;
-               }
-
-               /*
-                * If the request contains any data vectors, add up to
-                * fragsize bytes to the descriptor.
-                */
-               while (queued < datalen &&
-                      (req->sent + data_sent) < req->data_len) {
-                       unsigned long base, offset;
-                       unsigned pageidx, len;
-
-                       base = (unsigned long)iovec->iov.iov_base;
-                       offset = offset_in_page(base + iovec->offset +
-                                               iov_offset);
-                       pageidx = (((iovec->offset + iov_offset +
-                                    base) - (base & PAGE_MASK)) >> PAGE_SHIFT);
-                       len = offset + req->info.fragsize > PAGE_SIZE ?
-                               PAGE_SIZE - offset : req->info.fragsize;
-                       len = min((datalen - queued), len);
-                       ret = sdma_txadd_page(pq->dd, &tx->txreq,
-                                             iovec->pages[pageidx],
-                                             offset, len);
-                       if (ret) {
-                               SDMA_DBG(req, "SDMA txreq add page failed %d\n",
-                                        ret);
-                               goto free_txreq;
-                       }
-                       iov_offset += len;
-                       queued += len;
-                       data_sent += len;
-                       if (unlikely(queued < datalen &&
-                                    pageidx == iovec->npages &&
-                                    req->iov_idx < req->data_iovs - 1)) {
-                               iovec->offset += iov_offset;
-                               iovec = &req->iovs[++req->iov_idx];
-                               iov_offset = 0;
-                       }
-               }
-               /*
-                * The txreq was submitted successfully so we can update
-                * the counters.
-                */
-               req->koffset += datalen;
-               if (req_opcode(req->info.ctrl) == EXPECTED)
-                       req->tidoffset += datalen;
-               req->sent += data_sent;
-               if (req->data_len)
-                       iovec->offset += iov_offset;
-               list_add_tail(&tx->txreq.list, &req->txps);
-               /*
-                * It is important to increment this here as it is used to
-                * generate the BTH.PSN and, therefore, can't be bulk-updated
-                * outside of the loop.
-                */
-               tx->seqnum = req->seqnum++;
-               npkts++;
-       }
-dosend:
-       ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps);
-       if (list_empty(&req->txps)) {
-               req->seqsubmitted = req->seqnum;
-               if (req->seqnum == req->info.npkts) {
-                       set_bit(SDMA_REQ_SEND_DONE, &req->flags);
-                       /*
-                        * The txreq has already been submitted to the HW queue
-                        * so we can free the AHG entry now. Corruption will not
-                        * happen due to the sequential manner in which
-                        * descriptors are processed.
-                        */
-                       if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags))
-                               sdma_ahg_free(req->sde, req->ahg_idx);
-               }
-       } else if (ret > 0) {
-               req->seqsubmitted += ret;
-               ret = 0;
-       }
-       return ret;
-
-free_txreq:
-       sdma_txclean(pq->dd, &tx->txreq);
-free_tx:
-       kmem_cache_free(pq->txreq_cache, tx);
-       return ret;
-}
-
-/*
- * How many pages in this iovec element?
- */
-static inline int num_user_pages(const struct iovec *iov)
-{
-       const unsigned long addr  = (unsigned long)iov->iov_base;
-       const unsigned long len   = iov->iov_len;
-       const unsigned long spage = addr & PAGE_MASK;
-       const unsigned long epage = (addr + len - 1) & PAGE_MASK;
-
-       return 1 + ((epage - spage) >> PAGE_SHIFT);
-}
-
-static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
-{
-       u32 cleared = 0;
-       struct sdma_mmu_node *node, *ptr;
-       struct list_head to_evict = LIST_HEAD_INIT(to_evict);
-
-       spin_lock(&pq->evict_lock);
-       list_for_each_entry_safe_reverse(node, ptr, &pq->evict, list) {
-               /* Make sure that no one is still using the node. */
-               if (!atomic_read(&node->refcount)) {
-                       set_bit(SDMA_CACHE_NODE_EVICT, &node->flags);
-                       list_del_init(&node->list);
-                       list_add(&node->list, &to_evict);
-                       cleared += node->npages;
-                       if (cleared >= npages)
-                               break;
-               }
-       }
-       spin_unlock(&pq->evict_lock);
-
-       list_for_each_entry_safe(node, ptr, &to_evict, list)
-               hfi1_mmu_rb_remove(&pq->sdma_rb_root, &node->rb);
-
-       return cleared;
-}
-
-static int pin_vector_pages(struct user_sdma_request *req,
-                           struct user_sdma_iovec *iovec) {
-       int ret = 0, pinned, npages, cleared;
-       struct page **pages;
-       struct hfi1_user_sdma_pkt_q *pq = req->pq;
-       struct sdma_mmu_node *node = NULL;
-       struct mmu_rb_node *rb_node;
-
-       rb_node = hfi1_mmu_rb_extract(&pq->sdma_rb_root,
-                                     (unsigned long)iovec->iov.iov_base,
-                                     iovec->iov.iov_len);
-       if (rb_node && !IS_ERR(rb_node))
-               node = container_of(rb_node, struct sdma_mmu_node, rb);
-       else
-               rb_node = NULL;
-
-       if (!node) {
-               node = kzalloc(sizeof(*node), GFP_KERNEL);
-               if (!node)
-                       return -ENOMEM;
-
-               node->rb.addr = (unsigned long)iovec->iov.iov_base;
-               node->pq = pq;
-               atomic_set(&node->refcount, 0);
-               INIT_LIST_HEAD(&node->list);
-       }
-
-       npages = num_user_pages(&iovec->iov);
-       if (node->npages < npages) {
-               pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
-               if (!pages) {
-                       SDMA_DBG(req, "Failed page array alloc");
-                       ret = -ENOMEM;
-                       goto bail;
-               }
-               memcpy(pages, node->pages, node->npages * sizeof(*pages));
-
-               npages -= node->npages;
-
-               /*
-                * If rb_node is NULL, it means that this is brand new node
-                * and, therefore not on the eviction list.
-                * If, however, the rb_node is non-NULL, it means that the
-                * node is already in RB tree and, therefore on the eviction
-                * list (nodes are unconditionally inserted in the eviction
-                * list). In that case, we have to remove the node prior to
-                * calling the eviction function in order to prevent it from
-                * freeing this node.
-                */
-               if (rb_node) {
-                       spin_lock(&pq->evict_lock);
-                       list_del_init(&node->list);
-                       spin_unlock(&pq->evict_lock);
-               }
-retry:
-               if (!hfi1_can_pin_pages(pq->dd, pq->n_locked, npages)) {
-                       cleared = sdma_cache_evict(pq, npages);
-                       if (cleared >= npages)
-                               goto retry;
-               }
-               pinned = hfi1_acquire_user_pages(
-                       ((unsigned long)iovec->iov.iov_base +
-                        (node->npages * PAGE_SIZE)), npages, 0,
-                       pages + node->npages);
-               if (pinned < 0) {
-                       kfree(pages);
-                       ret = pinned;
-                       goto bail;
-               }
-               if (pinned != npages) {
-                       unpin_vector_pages(current->mm, pages, node->npages,
-                                          pinned);
-                       ret = -EFAULT;
-                       goto bail;
-               }
-               kfree(node->pages);
-               node->rb.len = iovec->iov.iov_len;
-               node->pages = pages;
-               node->npages += pinned;
-               npages = node->npages;
-               spin_lock(&pq->evict_lock);
-               list_add(&node->list, &pq->evict);
-               pq->n_locked += pinned;
-               spin_unlock(&pq->evict_lock);
-       }
-       iovec->pages = node->pages;
-       iovec->npages = npages;
-
-       ret = hfi1_mmu_rb_insert(&req->pq->sdma_rb_root, &node->rb);
-       if (ret) {
-               spin_lock(&pq->evict_lock);
-               if (!list_empty(&node->list))
-                       list_del(&node->list);
-               pq->n_locked -= node->npages;
-               spin_unlock(&pq->evict_lock);
-               goto bail;
-       }
-       return 0;
-bail:
-       if (rb_node)
-               unpin_vector_pages(current->mm, node->pages, 0, node->npages);
-       kfree(node);
-       return ret;
-}
-
-static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
-                              unsigned start, unsigned npages)
-{
-       hfi1_release_user_pages(mm, pages + start, npages, 0);
-       kfree(pages);
-}
-
-static int check_header_template(struct user_sdma_request *req,
-                                struct hfi1_pkt_header *hdr, u32 lrhlen,
-                                u32 datalen)
-{
-       /*
-        * Perform safety checks for any type of packet:
-        *    - transfer size is multiple of 64bytes
-        *    - packet length is multiple of 4bytes
-        *    - entire request length is multiple of 4bytes
-        *    - packet length is not larger than MTU size
-        *
-        * These checks are only done for the first packet of the
-        * transfer since the header is "given" to us by user space.
-        * For the remainder of the packets we compute the values.
-        */
-       if (req->info.fragsize % PIO_BLOCK_SIZE ||
-           lrhlen & 0x3 || req->data_len & 0x3  ||
-           lrhlen > get_lrh_len(*hdr, req->info.fragsize))
-               return -EINVAL;
-
-       if (req_opcode(req->info.ctrl) == EXPECTED) {
-               /*
-                * The header is checked only on the first packet. Furthermore,
-                * we ensure that at least one TID entry is copied when the
-                * request is submitted. Therefore, we don't have to verify that
-                * tididx points to something sane.
-                */
-               u32 tidval = req->tids[req->tididx],
-                       tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
-                       tididx = EXP_TID_GET(tidval, IDX),
-                       tidctrl = EXP_TID_GET(tidval, CTRL),
-                       tidoff;
-               __le32 kval = hdr->kdeth.ver_tid_offset;
-
-               tidoff = KDETH_GET(kval, OFFSET) *
-                         (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
-                          KDETH_OM_LARGE : KDETH_OM_SMALL);
-               /*
-                * Expected receive packets have the following
-                * additional checks:
-                *     - offset is not larger than the TID size
-                *     - TIDCtrl values match between header and TID array
-                *     - TID indexes match between header and TID array
-                */
-               if ((tidoff + datalen > tidlen) ||
-                   KDETH_GET(kval, TIDCTRL) != tidctrl ||
-                   KDETH_GET(kval, TID) != tididx)
-                       return -EINVAL;
-       }
-       return 0;
-}
-
-/*
- * Correctly set the BTH.PSN field based on type of
- * transfer - eager packets can just increment the PSN but
- * expected packets encode generation and sequence in the
- * BTH.PSN field so just incrementing will result in errors.
- */
-static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
-{
-       u32 val = be32_to_cpu(bthpsn),
-               mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
-                       0xffffffull),
-               psn = val & mask;
-       if (expct)
-               psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK);
-       else
-               psn = psn + frags;
-       return psn & mask;
-}
-
-static int set_txreq_header(struct user_sdma_request *req,
-                           struct user_sdma_txreq *tx, u32 datalen)
-{
-       struct hfi1_user_sdma_pkt_q *pq = req->pq;
-       struct hfi1_pkt_header *hdr = &tx->hdr;
-       u16 pbclen;
-       int ret;
-       u32 tidval = 0, lrhlen = get_lrh_len(*hdr, datalen);
-
-       /* Copy the header template to the request before modification */
-       memcpy(hdr, &req->hdr, sizeof(*hdr));
-
-       /*
-        * Check if the PBC and LRH length are mismatched. If so
-        * adjust both in the header.
-        */
-       pbclen = le16_to_cpu(hdr->pbc[0]);
-       if (PBC2LRH(pbclen) != lrhlen) {
-               pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
-               hdr->pbc[0] = cpu_to_le16(pbclen);
-               hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
-               /*
-                * Third packet
-                * This is the first packet in the sequence that has
-                * a "static" size that can be used for the rest of
-                * the packets (besides the last one).
-                */
-               if (unlikely(req->seqnum == 2)) {
-                       /*
-                        * From this point on the lengths in both the
-                        * PBC and LRH are the same until the last
-                        * packet.
-                        * Adjust the template so we don't have to update
-                        * every packet
-                        */
-                       req->hdr.pbc[0] = hdr->pbc[0];
-                       req->hdr.lrh[2] = hdr->lrh[2];
-               }
-       }
-       /*
-        * We only have to modify the header if this is not the
-        * first packet in the request. Otherwise, we use the
-        * header given to us.
-        */
-       if (unlikely(!req->seqnum)) {
-               ret = check_header_template(req, hdr, lrhlen, datalen);
-               if (ret)
-                       return ret;
-               goto done;
-       }
-
-       hdr->bth[2] = cpu_to_be32(
-               set_pkt_bth_psn(hdr->bth[2],
-                               (req_opcode(req->info.ctrl) == EXPECTED),
-                               req->seqnum));
-
-       /* Set ACK request on last packet */
-       if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
-               hdr->bth[2] |= cpu_to_be32(1UL << 31);
-
-       /* Set the new offset */
-       hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
-       /* Expected packets have to fill in the new TID information */
-       if (req_opcode(req->info.ctrl) == EXPECTED) {
-               tidval = req->tids[req->tididx];
-               /*
-                * If the offset puts us at the end of the current TID,
-                * advance everything.
-                */
-               if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
-                                        PAGE_SIZE)) {
-                       req->tidoffset = 0;
-                       /*
-                        * Since we don't copy all the TIDs, all at once,
-                        * we have to check again.
-                        */
-                       if (++req->tididx > req->n_tids - 1 ||
-                           !req->tids[req->tididx]) {
-                               return -EINVAL;
-                       }
-                       tidval = req->tids[req->tididx];
-               }
-               req->omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
-                       KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE : KDETH_OM_SMALL;
-               /* Set KDETH.TIDCtrl based on value for this TID. */
-               KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
-                         EXP_TID_GET(tidval, CTRL));
-               /* Set KDETH.TID based on value for this TID */
-               KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
-                         EXP_TID_GET(tidval, IDX));
-               /* Clear KDETH.SH only on the last packet */
-               if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
-                       KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
-               /*
-                * Set the KDETH.OFFSET and KDETH.OM based on size of
-                * transfer.
-                */
-               SDMA_DBG(req, "TID offset %ubytes %uunits om%u",
-                        req->tidoffset, req->tidoffset / req->omfactor,
-                        !!(req->omfactor - KDETH_OM_SMALL));
-               KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
-                         req->tidoffset / req->omfactor);
-               KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
-                         !!(req->omfactor - KDETH_OM_SMALL));
-       }
-done:
-       trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
-                                   req->info.comp_idx, hdr, tidval);
-       return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
-}
-
-static int set_txreq_header_ahg(struct user_sdma_request *req,
-                               struct user_sdma_txreq *tx, u32 len)
-{
-       int diff = 0;
-       struct hfi1_user_sdma_pkt_q *pq = req->pq;
-       struct hfi1_pkt_header *hdr = &req->hdr;
-       u16 pbclen = le16_to_cpu(hdr->pbc[0]);
-       u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, len);
-
-       if (PBC2LRH(pbclen) != lrhlen) {
-               /* PBC.PbcLengthDWs */
-               AHG_HEADER_SET(req->ahg, diff, 0, 0, 12,
-                              cpu_to_le16(LRH2PBC(lrhlen)));
-               /* LRH.PktLen (we need the full 16 bits due to byte swap) */
-               AHG_HEADER_SET(req->ahg, diff, 3, 0, 16,
-                              cpu_to_be16(lrhlen >> 2));
-       }
-
-       /*
-        * Do the common updates
-        */
-       /* BTH.PSN and BTH.A */
-       val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
-               (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
-       if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
-               val32 |= 1UL << 31;
-       AHG_HEADER_SET(req->ahg, diff, 6, 0, 16, cpu_to_be16(val32 >> 16));
-       AHG_HEADER_SET(req->ahg, diff, 6, 16, 16, cpu_to_be16(val32 & 0xffff));
-       /* KDETH.Offset */
-       AHG_HEADER_SET(req->ahg, diff, 15, 0, 16,
-                      cpu_to_le16(req->koffset & 0xffff));
-       AHG_HEADER_SET(req->ahg, diff, 15, 16, 16,
-                      cpu_to_le16(req->koffset >> 16));
-       if (req_opcode(req->info.ctrl) == EXPECTED) {
-               __le16 val;
-
-               tidval = req->tids[req->tididx];
-
-               /*
-                * If the offset puts us at the end of the current TID,
-                * advance everything.
-                */
-               if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
-                                        PAGE_SIZE)) {
-                       req->tidoffset = 0;
-                       /*
-                        * Since we don't copy all the TIDs, all at once,
-                        * we have to check again.
-                        */
-                       if (++req->tididx > req->n_tids - 1 ||
-                           !req->tids[req->tididx]) {
-                               return -EINVAL;
-                       }
-                       tidval = req->tids[req->tididx];
-               }
-               req->omfactor = ((EXP_TID_GET(tidval, LEN) *
-                                 PAGE_SIZE) >=
-                                KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE :
-                       KDETH_OM_SMALL;
-               /* KDETH.OM and KDETH.OFFSET (TID) */
-               AHG_HEADER_SET(req->ahg, diff, 7, 0, 16,
-                              ((!!(req->omfactor - KDETH_OM_SMALL)) << 15 |
-                               ((req->tidoffset / req->omfactor) & 0x7fff)));
-               /* KDETH.TIDCtrl, KDETH.TID */
-               val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
-                                       (EXP_TID_GET(tidval, IDX) & 0x3ff));
-               /* Clear KDETH.SH on last packet */
-               if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT)) {
-                       val |= cpu_to_le16(KDETH_GET(hdr->kdeth.ver_tid_offset,
-                                                               INTR) >> 16);
-                       val &= cpu_to_le16(~(1U << 13));
-                       AHG_HEADER_SET(req->ahg, diff, 7, 16, 14, val);
-               } else {
-                       AHG_HEADER_SET(req->ahg, diff, 7, 16, 12, val);
-               }
-       }
-
-       trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
-                                       req->info.comp_idx, req->sde->this_idx,
-                                       req->ahg_idx, req->ahg, diff, tidval);
-       return diff;
-}
-
-/*
- * SDMA tx request completion callback. Called when the SDMA progress
- * state machine gets notification that the SDMA descriptors for this
- * tx request have been processed by the DMA engine. Called in
- * interrupt context.
- */
-static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
-{
-       struct user_sdma_txreq *tx =
-               container_of(txreq, struct user_sdma_txreq, txreq);
-       struct user_sdma_request *req;
-       struct hfi1_user_sdma_pkt_q *pq;
-       struct hfi1_user_sdma_comp_q *cq;
-       u16 idx;
-
-       if (!tx->req)
-               return;
-
-       req = tx->req;
-       pq = req->pq;
-       cq = req->cq;
-
-       if (status != SDMA_TXREQ_S_OK) {
-               SDMA_DBG(req, "SDMA completion with error %d",
-                        status);
-               set_bit(SDMA_REQ_HAS_ERROR, &req->flags);
-       }
-
-       req->seqcomp = tx->seqnum;
-       kmem_cache_free(pq->txreq_cache, tx);
-       tx = NULL;
-
-       idx = req->info.comp_idx;
-       if (req->status == -1 && status == SDMA_TXREQ_S_OK) {
-               if (req->seqcomp == req->info.npkts - 1) {
-                       req->status = 0;
-                       user_sdma_free_request(req, false);
-                       pq_update(pq);
-                       set_comp_state(pq, cq, idx, COMPLETE, 0);
-               }
-       } else {
-               if (status != SDMA_TXREQ_S_OK)
-                       req->status = status;
-               if (req->seqcomp == (ACCESS_ONCE(req->seqsubmitted) - 1) &&
-                   (test_bit(SDMA_REQ_SEND_DONE, &req->flags) ||
-                    test_bit(SDMA_REQ_DONE_ERROR, &req->flags))) {
-                       user_sdma_free_request(req, false);
-                       pq_update(pq);
-                       set_comp_state(pq, cq, idx, ERROR, req->status);
-               }
-       }
-}
-
-static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
-{
-       if (atomic_dec_and_test(&pq->n_reqs)) {
-               xchg(&pq->state, SDMA_PKT_Q_INACTIVE);
-               wake_up(&pq->wait);
-       }
-}
-
-static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
-{
-       if (!list_empty(&req->txps)) {
-               struct sdma_txreq *t, *p;
-
-               list_for_each_entry_safe(t, p, &req->txps, list) {
-                       struct user_sdma_txreq *tx =
-                               container_of(t, struct user_sdma_txreq, txreq);
-                       list_del_init(&t->list);
-                       sdma_txclean(req->pq->dd, t);
-                       kmem_cache_free(req->pq->txreq_cache, tx);
-               }
-       }
-       if (req->data_iovs) {
-               struct sdma_mmu_node *node;
-               struct mmu_rb_node *mnode;
-               int i;
-
-               for (i = 0; i < req->data_iovs; i++) {
-                       mnode = hfi1_mmu_rb_search(
-                               &req->pq->sdma_rb_root,
-                               (unsigned long)req->iovs[i].iov.iov_base,
-                               req->iovs[i].iov.iov_len);
-                       if (!mnode || IS_ERR(mnode))
-                               continue;
-
-                       node = container_of(mnode, struct sdma_mmu_node, rb);
-                       if (unpin)
-                               hfi1_mmu_rb_remove(&req->pq->sdma_rb_root,
-                                                  &node->rb);
-                       else
-                               atomic_dec(&node->refcount);
-               }
-       }
-       kfree(req->tids);
-       clear_bit(SDMA_REQ_IN_USE, &req->flags);
-}
-
-static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
-                                 struct hfi1_user_sdma_comp_q *cq,
-                                 u16 idx, enum hfi1_sdma_comp_state state,
-                                 int ret)
-{
-       hfi1_cdbg(SDMA, "[%u:%u:%u:%u] Setting completion status %u %d",
-                 pq->dd->unit, pq->ctxt, pq->subctxt, idx, state, ret);
-       cq->comps[idx].status = state;
-       if (state == ERROR)
-               cq->comps[idx].errcode = -ret;
-       trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
-                                       idx, state, ret);
-}
-
-static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
-                          unsigned long len)
-{
-       return (bool)(node->addr == addr);
-}
-
-static int sdma_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode)
-{
-       struct sdma_mmu_node *node =
-               container_of(mnode, struct sdma_mmu_node, rb);
-
-       atomic_inc(&node->refcount);
-       return 0;
-}
-
-static void sdma_rb_remove(struct rb_root *root, struct mmu_rb_node *mnode,
-                          struct mm_struct *mm)
-{
-       struct sdma_mmu_node *node =
-               container_of(mnode, struct sdma_mmu_node, rb);
-
-       spin_lock(&node->pq->evict_lock);
-       /*
-        * We've been called by the MMU notifier but this node has been
-        * scheduled for eviction. The eviction function will take care
-        * of freeing this node.
-        * We have to take the above lock first because we are racing
-        * against the setting of the bit in the eviction function.
-        */
-       if (mm && test_bit(SDMA_CACHE_NODE_EVICT, &node->flags)) {
-               spin_unlock(&node->pq->evict_lock);
-               return;
-       }
-
-       if (!list_empty(&node->list))
-               list_del(&node->list);
-       node->pq->n_locked -= node->npages;
-       spin_unlock(&node->pq->evict_lock);
-
-       /*
-        * If mm is set, we are being called by the MMU notifier and we
-        * should not pass a mm_struct to unpin_vector_page(). This is to
-        * prevent a deadlock when hfi1_release_user_pages() attempts to
-        * take the mmap_sem, which the MMU notifier has already taken.
-        */
-       unpin_vector_pages(mm ? NULL : current->mm, node->pages, 0,
-                          node->npages);
-       /*
-        * If called by the MMU notifier, we have to adjust the pinned
-        * page count ourselves.
-        */
-       if (mm)
-               mm->pinned_vm -= node->npages;
-       kfree(node);
-}
-
-static int sdma_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode)
-{
-       struct sdma_mmu_node *node =
-               container_of(mnode, struct sdma_mmu_node, rb);
-
-       if (!atomic_read(&node->refcount))
-               return 1;
-       return 0;
-}
diff --git a/drivers/staging/rdma/hfi1/user_sdma.h b/drivers/staging/rdma/hfi1/user_sdma.h
deleted file mode 100644 (file)
index b9240e3..0000000
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#include <linux/device.h>
-#include <linux/wait.h>
-
-#include "common.h"
-#include "iowait.h"
-#include "user_exp_rcv.h"
-
-extern uint extended_psn;
-
-struct hfi1_user_sdma_pkt_q {
-       struct list_head list;
-       unsigned ctxt;
-       unsigned subctxt;
-       u16 n_max_reqs;
-       atomic_t n_reqs;
-       u16 reqidx;
-       struct hfi1_devdata *dd;
-       struct kmem_cache *txreq_cache;
-       struct user_sdma_request *reqs;
-       struct iowait busy;
-       unsigned state;
-       wait_queue_head_t wait;
-       unsigned long unpinned;
-       struct rb_root sdma_rb_root;
-       u32 n_locked;
-       struct list_head evict;
-       spinlock_t evict_lock; /* protect evict and n_locked */
-};
-
-struct hfi1_user_sdma_comp_q {
-       u16 nentries;
-       struct hfi1_sdma_comp_entry *comps;
-};
-
-int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *, struct file *);
-int hfi1_user_sdma_free_queues(struct hfi1_filedata *);
-int hfi1_user_sdma_process_request(struct file *, struct iovec *, unsigned long,
-                                  unsigned long *);
diff --git a/drivers/staging/rdma/hfi1/verbs.c b/drivers/staging/rdma/hfi1/verbs.c
deleted file mode 100644 (file)
index 9cdc85f..0000000
+++ /dev/null
@@ -1,1764 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <rdma/ib_mad.h>
-#include <rdma/ib_user_verbs.h>
-#include <linux/io.h>
-#include <linux/module.h>
-#include <linux/utsname.h>
-#include <linux/rculist.h>
-#include <linux/mm.h>
-#include <linux/random.h>
-#include <linux/vmalloc.h>
-
-#include "hfi.h"
-#include "common.h"
-#include "device.h"
-#include "trace.h"
-#include "qp.h"
-#include "verbs_txreq.h"
-
-static unsigned int hfi1_lkey_table_size = 16;
-module_param_named(lkey_table_size, hfi1_lkey_table_size, uint,
-                  S_IRUGO);
-MODULE_PARM_DESC(lkey_table_size,
-                "LKEY table size in bits (2^n, 1 <= n <= 23)");
-
-static unsigned int hfi1_max_pds = 0xFFFF;
-module_param_named(max_pds, hfi1_max_pds, uint, S_IRUGO);
-MODULE_PARM_DESC(max_pds,
-                "Maximum number of protection domains to support");
-
-static unsigned int hfi1_max_ahs = 0xFFFF;
-module_param_named(max_ahs, hfi1_max_ahs, uint, S_IRUGO);
-MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support");
-
-unsigned int hfi1_max_cqes = 0x2FFFF;
-module_param_named(max_cqes, hfi1_max_cqes, uint, S_IRUGO);
-MODULE_PARM_DESC(max_cqes,
-                "Maximum number of completion queue entries to support");
-
-unsigned int hfi1_max_cqs = 0x1FFFF;
-module_param_named(max_cqs, hfi1_max_cqs, uint, S_IRUGO);
-MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support");
-
-unsigned int hfi1_max_qp_wrs = 0x3FFF;
-module_param_named(max_qp_wrs, hfi1_max_qp_wrs, uint, S_IRUGO);
-MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support");
-
-unsigned int hfi1_max_qps = 16384;
-module_param_named(max_qps, hfi1_max_qps, uint, S_IRUGO);
-MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support");
-
-unsigned int hfi1_max_sges = 0x60;
-module_param_named(max_sges, hfi1_max_sges, uint, S_IRUGO);
-MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support");
-
-unsigned int hfi1_max_mcast_grps = 16384;
-module_param_named(max_mcast_grps, hfi1_max_mcast_grps, uint, S_IRUGO);
-MODULE_PARM_DESC(max_mcast_grps,
-                "Maximum number of multicast groups to support");
-
-unsigned int hfi1_max_mcast_qp_attached = 16;
-module_param_named(max_mcast_qp_attached, hfi1_max_mcast_qp_attached,
-                  uint, S_IRUGO);
-MODULE_PARM_DESC(max_mcast_qp_attached,
-                "Maximum number of attached QPs to support");
-
-unsigned int hfi1_max_srqs = 1024;
-module_param_named(max_srqs, hfi1_max_srqs, uint, S_IRUGO);
-MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support");
-
-unsigned int hfi1_max_srq_sges = 128;
-module_param_named(max_srq_sges, hfi1_max_srq_sges, uint, S_IRUGO);
-MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support");
-
-unsigned int hfi1_max_srq_wrs = 0x1FFFF;
-module_param_named(max_srq_wrs, hfi1_max_srq_wrs, uint, S_IRUGO);
-MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support");
-
-unsigned short piothreshold = 256;
-module_param(piothreshold, ushort, S_IRUGO);
-MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio");
-
-#define COPY_CACHELESS 1
-#define COPY_ADAPTIVE  2
-static unsigned int sge_copy_mode;
-module_param(sge_copy_mode, uint, S_IRUGO);
-MODULE_PARM_DESC(sge_copy_mode,
-                "Verbs copy mode: 0 use memcpy, 1 use cacheless copy, 2 adapt based on WSS");
-
-static void verbs_sdma_complete(
-       struct sdma_txreq *cookie,
-       int status);
-
-static int pio_wait(struct rvt_qp *qp,
-                   struct send_context *sc,
-                   struct hfi1_pkt_state *ps,
-                   u32 flag);
-
-/* Length of buffer to create verbs txreq cache name */
-#define TXREQ_NAME_LEN 24
-
-static uint wss_threshold;
-module_param(wss_threshold, uint, S_IRUGO);
-MODULE_PARM_DESC(wss_threshold, "Percentage (1-100) of LLC to use as a threshold for a cacheless copy");
-static uint wss_clean_period = 256;
-module_param(wss_clean_period, uint, S_IRUGO);
-MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the page copy table is cleaned");
-
-/* memory working set size */
-struct hfi1_wss {
-       unsigned long *entries;
-       atomic_t total_count;
-       atomic_t clean_counter;
-       atomic_t clean_entry;
-
-       int threshold;
-       int num_entries;
-       long pages_mask;
-};
-
-static struct hfi1_wss wss;
-
-int hfi1_wss_init(void)
-{
-       long llc_size;
-       long llc_bits;
-       long table_size;
-       long table_bits;
-
-       /* check for a valid percent range - default to 80 if none or invalid */
-       if (wss_threshold < 1 || wss_threshold > 100)
-               wss_threshold = 80;
-       /* reject a wildly large period */
-       if (wss_clean_period > 1000000)
-               wss_clean_period = 256;
-       /* reject a zero period */
-       if (wss_clean_period == 0)
-               wss_clean_period = 1;
-
-       /*
-        * Calculate the table size - the next power of 2 larger than the
-        * LLC size.  LLC size is in KiB.
-        */
-       llc_size = wss_llc_size() * 1024;
-       table_size = roundup_pow_of_two(llc_size);
-
-       /* one bit per page in rounded up table */
-       llc_bits = llc_size / PAGE_SIZE;
-       table_bits = table_size / PAGE_SIZE;
-       wss.pages_mask = table_bits - 1;
-       wss.num_entries = table_bits / BITS_PER_LONG;
-
-       wss.threshold = (llc_bits * wss_threshold) / 100;
-       if (wss.threshold == 0)
-               wss.threshold = 1;
-
-       atomic_set(&wss.clean_counter, wss_clean_period);
-
-       wss.entries = kcalloc(wss.num_entries, sizeof(*wss.entries),
-                             GFP_KERNEL);
-       if (!wss.entries) {
-               hfi1_wss_exit();
-               return -ENOMEM;
-       }
-
-       return 0;
-}
-
-void hfi1_wss_exit(void)
-{
-       /* coded to handle partially initialized and repeat callers */
-       kfree(wss.entries);
-       wss.entries = NULL;
-}
-
-/*
- * Advance the clean counter.  When the clean period has expired,
- * clean an entry.
- *
- * This is implemented in atomics to avoid locking.  Because multiple
- * variables are involved, it can be racy which can lead to slightly
- * inaccurate information.  Since this is only a heuristic, this is
- * OK.  Any innaccuracies will clean themselves out as the counter
- * advances.  That said, it is unlikely the entry clean operation will
- * race - the next possible racer will not start until the next clean
- * period.
- *
- * The clean counter is implemented as a decrement to zero.  When zero
- * is reached an entry is cleaned.
- */
-static void wss_advance_clean_counter(void)
-{
-       int entry;
-       int weight;
-       unsigned long bits;
-
-       /* become the cleaner if we decrement the counter to zero */
-       if (atomic_dec_and_test(&wss.clean_counter)) {
-               /*
-                * Set, not add, the clean period.  This avoids an issue
-                * where the counter could decrement below the clean period.
-                * Doing a set can result in lost decrements, slowing the
-                * clean advance.  Since this a heuristic, this possible
-                * slowdown is OK.
-                *
-                * An alternative is to loop, advancing the counter by a
-                * clean period until the result is > 0. However, this could
-                * lead to several threads keeping another in the clean loop.
-                * This could be mitigated by limiting the number of times
-                * we stay in the loop.
-                */
-               atomic_set(&wss.clean_counter, wss_clean_period);
-
-               /*
-                * Uniquely grab the entry to clean and move to next.
-                * The current entry is always the lower bits of
-                * wss.clean_entry.  The table size, wss.num_entries,
-                * is always a power-of-2.
-                */
-               entry = (atomic_inc_return(&wss.clean_entry) - 1)
-                       & (wss.num_entries - 1);
-
-               /* clear the entry and count the bits */
-               bits = xchg(&wss.entries[entry], 0);
-               weight = hweight64((u64)bits);
-               /* only adjust the contended total count if needed */
-               if (weight)
-                       atomic_sub(weight, &wss.total_count);
-       }
-}
-
-/*
- * Insert the given address into the working set array.
- */
-static void wss_insert(void *address)
-{
-       u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss.pages_mask;
-       u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */
-       u32 nr = page & (BITS_PER_LONG - 1);
-
-       if (!test_and_set_bit(nr, &wss.entries[entry]))
-               atomic_inc(&wss.total_count);
-
-       wss_advance_clean_counter();
-}
-
-/*
- * Is the working set larger than the threshold?
- */
-static inline int wss_exceeds_threshold(void)
-{
-       return atomic_read(&wss.total_count) >= wss.threshold;
-}
-
-/*
- * Translate ib_wr_opcode into ib_wc_opcode.
- */
-const enum ib_wc_opcode ib_hfi1_wc_opcode[] = {
-       [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
-       [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
-       [IB_WR_SEND] = IB_WC_SEND,
-       [IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
-       [IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
-       [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
-       [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD
-};
-
-/*
- * Length of header by opcode, 0 --> not supported
- */
-const u8 hdr_len_by_opcode[256] = {
-       /* RC */
-       [IB_OPCODE_RC_SEND_FIRST]                     = 12 + 8,
-       [IB_OPCODE_RC_SEND_MIDDLE]                    = 12 + 8,
-       [IB_OPCODE_RC_SEND_LAST]                      = 12 + 8,
-       [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE]       = 12 + 8 + 4,
-       [IB_OPCODE_RC_SEND_ONLY]                      = 12 + 8,
-       [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 4,
-       [IB_OPCODE_RC_RDMA_WRITE_FIRST]               = 12 + 8 + 16,
-       [IB_OPCODE_RC_RDMA_WRITE_MIDDLE]              = 12 + 8,
-       [IB_OPCODE_RC_RDMA_WRITE_LAST]                = 12 + 8,
-       [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
-       [IB_OPCODE_RC_RDMA_WRITE_ONLY]                = 12 + 8 + 16,
-       [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
-       [IB_OPCODE_RC_RDMA_READ_REQUEST]              = 12 + 8 + 16,
-       [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST]       = 12 + 8 + 4,
-       [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE]      = 12 + 8,
-       [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST]        = 12 + 8 + 4,
-       [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY]        = 12 + 8 + 4,
-       [IB_OPCODE_RC_ACKNOWLEDGE]                    = 12 + 8 + 4,
-       [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE]             = 12 + 8 + 4,
-       [IB_OPCODE_RC_COMPARE_SWAP]                   = 12 + 8 + 28,
-       [IB_OPCODE_RC_FETCH_ADD]                      = 12 + 8 + 28,
-       /* UC */
-       [IB_OPCODE_UC_SEND_FIRST]                     = 12 + 8,
-       [IB_OPCODE_UC_SEND_MIDDLE]                    = 12 + 8,
-       [IB_OPCODE_UC_SEND_LAST]                      = 12 + 8,
-       [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE]       = 12 + 8 + 4,
-       [IB_OPCODE_UC_SEND_ONLY]                      = 12 + 8,
-       [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 4,
-       [IB_OPCODE_UC_RDMA_WRITE_FIRST]               = 12 + 8 + 16,
-       [IB_OPCODE_UC_RDMA_WRITE_MIDDLE]              = 12 + 8,
-       [IB_OPCODE_UC_RDMA_WRITE_LAST]                = 12 + 8,
-       [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
-       [IB_OPCODE_UC_RDMA_WRITE_ONLY]                = 12 + 8 + 16,
-       [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
-       /* UD */
-       [IB_OPCODE_UD_SEND_ONLY]                      = 12 + 8 + 8,
-       [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 12
-};
-
-static const opcode_handler opcode_handler_tbl[256] = {
-       /* RC */
-       [IB_OPCODE_RC_SEND_FIRST]                     = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_SEND_MIDDLE]                    = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_SEND_LAST]                      = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE]       = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_SEND_ONLY]                      = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_RDMA_WRITE_FIRST]               = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_RDMA_WRITE_MIDDLE]              = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_RDMA_WRITE_LAST]                = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_RDMA_WRITE_ONLY]                = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_RDMA_READ_REQUEST]              = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST]       = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE]      = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST]        = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY]        = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_ACKNOWLEDGE]                    = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE]             = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_COMPARE_SWAP]                   = &hfi1_rc_rcv,
-       [IB_OPCODE_RC_FETCH_ADD]                      = &hfi1_rc_rcv,
-       /* UC */
-       [IB_OPCODE_UC_SEND_FIRST]                     = &hfi1_uc_rcv,
-       [IB_OPCODE_UC_SEND_MIDDLE]                    = &hfi1_uc_rcv,
-       [IB_OPCODE_UC_SEND_LAST]                      = &hfi1_uc_rcv,
-       [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE]       = &hfi1_uc_rcv,
-       [IB_OPCODE_UC_SEND_ONLY]                      = &hfi1_uc_rcv,
-       [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_uc_rcv,
-       [IB_OPCODE_UC_RDMA_WRITE_FIRST]               = &hfi1_uc_rcv,
-       [IB_OPCODE_UC_RDMA_WRITE_MIDDLE]              = &hfi1_uc_rcv,
-       [IB_OPCODE_UC_RDMA_WRITE_LAST]                = &hfi1_uc_rcv,
-       [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv,
-       [IB_OPCODE_UC_RDMA_WRITE_ONLY]                = &hfi1_uc_rcv,
-       [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv,
-       /* UD */
-       [IB_OPCODE_UD_SEND_ONLY]                      = &hfi1_ud_rcv,
-       [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_ud_rcv,
-       /* CNP */
-       [IB_OPCODE_CNP]                               = &hfi1_cnp_rcv
-};
-
-/*
- * System image GUID.
- */
-__be64 ib_hfi1_sys_image_guid;
-
-/**
- * hfi1_copy_sge - copy data to SGE memory
- * @ss: the SGE state
- * @data: the data to copy
- * @length: the length of the data
- * @copy_last: do a separate copy of the last 8 bytes
- */
-void hfi1_copy_sge(
-       struct rvt_sge_state *ss,
-       void *data, u32 length,
-       int release,
-       int copy_last)
-{
-       struct rvt_sge *sge = &ss->sge;
-       int in_last = 0;
-       int i;
-       int cacheless_copy = 0;
-
-       if (sge_copy_mode == COPY_CACHELESS) {
-               cacheless_copy = length >= PAGE_SIZE;
-       } else if (sge_copy_mode == COPY_ADAPTIVE) {
-               if (length >= PAGE_SIZE) {
-                       /*
-                        * NOTE: this *assumes*:
-                        * o The first vaddr is the dest.
-                        * o If multiple pages, then vaddr is sequential.
-                        */
-                       wss_insert(sge->vaddr);
-                       if (length >= (2 * PAGE_SIZE))
-                               wss_insert(sge->vaddr + PAGE_SIZE);
-
-                       cacheless_copy = wss_exceeds_threshold();
-               } else {
-                       wss_advance_clean_counter();
-               }
-       }
-       if (copy_last) {
-               if (length > 8) {
-                       length -= 8;
-               } else {
-                       copy_last = 0;
-                       in_last = 1;
-               }
-       }
-
-again:
-       while (length) {
-               u32 len = sge->length;
-
-               if (len > length)
-                       len = length;
-               if (len > sge->sge_length)
-                       len = sge->sge_length;
-               WARN_ON_ONCE(len == 0);
-               if (unlikely(in_last)) {
-                       /* enforce byte transfer ordering */
-                       for (i = 0; i < len; i++)
-                               ((u8 *)sge->vaddr)[i] = ((u8 *)data)[i];
-               } else if (cacheless_copy) {
-                       cacheless_memcpy(sge->vaddr, data, len);
-               } else {
-                       memcpy(sge->vaddr, data, len);
-               }
-               sge->vaddr += len;
-               sge->length -= len;
-               sge->sge_length -= len;
-               if (sge->sge_length == 0) {
-                       if (release)
-                               rvt_put_mr(sge->mr);
-                       if (--ss->num_sge)
-                               *sge = *ss->sg_list++;
-               } else if (sge->length == 0 && sge->mr->lkey) {
-                       if (++sge->n >= RVT_SEGSZ) {
-                               if (++sge->m >= sge->mr->mapsz)
-                                       break;
-                               sge->n = 0;
-                       }
-                       sge->vaddr =
-                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
-                       sge->length =
-                               sge->mr->map[sge->m]->segs[sge->n].length;
-               }
-               data += len;
-               length -= len;
-       }
-
-       if (copy_last) {
-               copy_last = 0;
-               in_last = 1;
-               length = 8;
-               goto again;
-       }
-}
-
-/**
- * hfi1_skip_sge - skip over SGE memory
- * @ss: the SGE state
- * @length: the number of bytes to skip
- */
-void hfi1_skip_sge(struct rvt_sge_state *ss, u32 length, int release)
-{
-       struct rvt_sge *sge = &ss->sge;
-
-       while (length) {
-               u32 len = sge->length;
-
-               if (len > length)
-                       len = length;
-               if (len > sge->sge_length)
-                       len = sge->sge_length;
-               WARN_ON_ONCE(len == 0);
-               sge->vaddr += len;
-               sge->length -= len;
-               sge->sge_length -= len;
-               if (sge->sge_length == 0) {
-                       if (release)
-                               rvt_put_mr(sge->mr);
-                       if (--ss->num_sge)
-                               *sge = *ss->sg_list++;
-               } else if (sge->length == 0 && sge->mr->lkey) {
-                       if (++sge->n >= RVT_SEGSZ) {
-                               if (++sge->m >= sge->mr->mapsz)
-                                       break;
-                               sge->n = 0;
-                       }
-                       sge->vaddr =
-                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
-                       sge->length =
-                               sge->mr->map[sge->m]->segs[sge->n].length;
-               }
-               length -= len;
-       }
-}
-
-/*
- * Make sure the QP is ready and able to accept the given opcode.
- */
-static inline int qp_ok(int opcode, struct hfi1_packet *packet)
-{
-       struct hfi1_ibport *ibp;
-
-       if (!(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK))
-               goto dropit;
-       if (((opcode & RVT_OPCODE_QP_MASK) == packet->qp->allowed_ops) ||
-           (opcode == IB_OPCODE_CNP))
-               return 1;
-dropit:
-       ibp = &packet->rcd->ppd->ibport_data;
-       ibp->rvp.n_pkt_drops++;
-       return 0;
-}
-
-/**
- * hfi1_ib_rcv - process an incoming packet
- * @packet: data packet information
- *
- * This is called to process an incoming packet at interrupt level.
- *
- * Tlen is the length of the header + data + CRC in bytes.
- */
-void hfi1_ib_rcv(struct hfi1_packet *packet)
-{
-       struct hfi1_ctxtdata *rcd = packet->rcd;
-       struct hfi1_ib_header *hdr = packet->hdr;
-       u32 tlen = packet->tlen;
-       struct hfi1_pportdata *ppd = rcd->ppd;
-       struct hfi1_ibport *ibp = &ppd->ibport_data;
-       struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
-       unsigned long flags;
-       u32 qp_num;
-       int lnh;
-       u8 opcode;
-       u16 lid;
-
-       /* Check for GRH */
-       lnh = be16_to_cpu(hdr->lrh[0]) & 3;
-       if (lnh == HFI1_LRH_BTH) {
-               packet->ohdr = &hdr->u.oth;
-       } else if (lnh == HFI1_LRH_GRH) {
-               u32 vtf;
-
-               packet->ohdr = &hdr->u.l.oth;
-               if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR)
-                       goto drop;
-               vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow);
-               if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
-                       goto drop;
-               packet->rcv_flags |= HFI1_HAS_GRH;
-       } else {
-               goto drop;
-       }
-
-       trace_input_ibhdr(rcd->dd, hdr);
-
-       opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24);
-       inc_opstats(tlen, &rcd->opstats->stats[opcode]);
-
-       /* Get the destination QP number. */
-       qp_num = be32_to_cpu(packet->ohdr->bth[1]) & RVT_QPN_MASK;
-       lid = be16_to_cpu(hdr->lrh[1]);
-       if (unlikely((lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) &&
-                    (lid != be16_to_cpu(IB_LID_PERMISSIVE)))) {
-               struct rvt_mcast *mcast;
-               struct rvt_mcast_qp *p;
-
-               if (lnh != HFI1_LRH_GRH)
-                       goto drop;
-               mcast = rvt_mcast_find(&ibp->rvp, &hdr->u.l.grh.dgid);
-               if (!mcast)
-                       goto drop;
-               list_for_each_entry_rcu(p, &mcast->qp_list, list) {
-                       packet->qp = p->qp;
-                       spin_lock_irqsave(&packet->qp->r_lock, flags);
-                       if (likely((qp_ok(opcode, packet))))
-                               opcode_handler_tbl[opcode](packet);
-                       spin_unlock_irqrestore(&packet->qp->r_lock, flags);
-               }
-               /*
-                * Notify rvt_multicast_detach() if it is waiting for us
-                * to finish.
-                */
-               if (atomic_dec_return(&mcast->refcount) <= 1)
-                       wake_up(&mcast->wait);
-       } else {
-               rcu_read_lock();
-               packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
-               if (!packet->qp) {
-                       rcu_read_unlock();
-                       goto drop;
-               }
-               spin_lock_irqsave(&packet->qp->r_lock, flags);
-               if (likely((qp_ok(opcode, packet))))
-                       opcode_handler_tbl[opcode](packet);
-               spin_unlock_irqrestore(&packet->qp->r_lock, flags);
-               rcu_read_unlock();
-       }
-       return;
-
-drop:
-       ibp->rvp.n_pkt_drops++;
-}
-
-/*
- * This is called from a timer to check for QPs
- * which need kernel memory in order to send a packet.
- */
-static void mem_timer(unsigned long data)
-{
-       struct hfi1_ibdev *dev = (struct hfi1_ibdev *)data;
-       struct list_head *list = &dev->memwait;
-       struct rvt_qp *qp = NULL;
-       struct iowait *wait;
-       unsigned long flags;
-       struct hfi1_qp_priv *priv;
-
-       write_seqlock_irqsave(&dev->iowait_lock, flags);
-       if (!list_empty(list)) {
-               wait = list_first_entry(list, struct iowait, list);
-               qp = iowait_to_qp(wait);
-               priv = qp->priv;
-               list_del_init(&priv->s_iowait.list);
-               /* refcount held until actual wake up */
-               if (!list_empty(list))
-                       mod_timer(&dev->mem_timer, jiffies + 1);
-       }
-       write_sequnlock_irqrestore(&dev->iowait_lock, flags);
-
-       if (qp)
-               hfi1_qp_wakeup(qp, RVT_S_WAIT_KMEM);
-}
-
-void update_sge(struct rvt_sge_state *ss, u32 length)
-{
-       struct rvt_sge *sge = &ss->sge;
-
-       sge->vaddr += length;
-       sge->length -= length;
-       sge->sge_length -= length;
-       if (sge->sge_length == 0) {
-               if (--ss->num_sge)
-                       *sge = *ss->sg_list++;
-       } else if (sge->length == 0 && sge->mr->lkey) {
-               if (++sge->n >= RVT_SEGSZ) {
-                       if (++sge->m >= sge->mr->mapsz)
-                               return;
-                       sge->n = 0;
-               }
-               sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
-               sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
-       }
-}
-
-/*
- * This is called with progress side lock held.
- */
-/* New API */
-static void verbs_sdma_complete(
-       struct sdma_txreq *cookie,
-       int status)
-{
-       struct verbs_txreq *tx =
-               container_of(cookie, struct verbs_txreq, txreq);
-       struct rvt_qp *qp = tx->qp;
-
-       spin_lock(&qp->s_lock);
-       if (tx->wqe) {
-               hfi1_send_complete(qp, tx->wqe, IB_WC_SUCCESS);
-       } else if (qp->ibqp.qp_type == IB_QPT_RC) {
-               struct hfi1_ib_header *hdr;
-
-               hdr = &tx->phdr.hdr;
-               hfi1_rc_send_complete(qp, hdr);
-       }
-       spin_unlock(&qp->s_lock);
-
-       hfi1_put_txreq(tx);
-}
-
-/*
- * Queue the packet's txreq on the QP and put the QP on the device memwait
- * list.
- *
- * Return: -EBUSY when the txreq was queued (RVT_S_BUSY is cleared), or 0
- * when the QP state does not have RVT_PROCESS_RECV_OK set and nothing was
- * queued.
- */
-static int wait_kmem(struct hfi1_ibdev *dev,
-                    struct rvt_qp *qp,
-                    struct hfi1_pkt_state *ps)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-       unsigned long irq_flags;
-       int rc = 0;
-
-       spin_lock_irqsave(&qp->s_lock, irq_flags);
-       if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
-               goto unlock;
-
-       write_seqlock(&dev->iowait_lock);
-       list_add_tail(&ps->s_txreq->txreq.list, &priv->s_iowait.tx_head);
-       if (list_empty(&priv->s_iowait.list)) {
-               /* the first QP to wait (re)arms the memory timer */
-               if (list_empty(&dev->memwait))
-                       mod_timer(&dev->mem_timer, jiffies + 1);
-               qp->s_flags |= RVT_S_WAIT_KMEM;
-               list_add_tail(&priv->s_iowait.list, &dev->memwait);
-               trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM);
-               /* reference taken while the QP sits on the wait list */
-               atomic_inc(&qp->refcount);
-       }
-       write_sequnlock(&dev->iowait_lock);
-       qp->s_flags &= ~RVT_S_BUSY;
-       rc = -EBUSY;
-
-unlock:
-       spin_unlock_irqrestore(&qp->s_lock, irq_flags);
-
-       return rc;
-}
-
-/*
- * Add the payload described by @ss to @tx, one sdma_txadd_kvaddr() call
- * per chunk.
- *
- * Each chunk is the smallest of the bytes still to send, the current
- * segment length and the remaining SGE length.
- *
- * If an add fails, the sge cursor in @ss is put back to the state it had
- * on entry and the error is returned.  Returns 0 when @length bytes have
- * been added.
- */
-static noinline int build_verbs_ulp_payload(
-       struct sdma_engine *sde,
-       struct rvt_sge_state *ss,
-       u32 length,
-       struct verbs_txreq *tx)
-{
-       /* snapshot of the cursor, used only on the failure path */
-       struct rvt_sge *saved_list = ss->sg_list;
-       struct rvt_sge saved_sge = ss->sge;
-       u8 saved_num = ss->num_sge;
-       int ret = 0;
-
-       while (length) {
-               u32 chunk = ss->sge.length;
-
-               if (chunk > length)
-                       chunk = length;
-               if (chunk > ss->sge.sge_length)
-                       chunk = ss->sge.sge_length;
-               WARN_ON_ONCE(chunk == 0);
-
-               ret = sdma_txadd_kvaddr(sde->dd, &tx->txreq,
-                                       ss->sge.vaddr, chunk);
-               if (ret) {
-                       /* unwind cursor */
-                       ss->sge = saved_sge;
-                       ss->num_sge = saved_num;
-                       ss->sg_list = saved_list;
-                       return ret;
-               }
-
-               update_sge(ss, chunk);
-               length -= chunk;
-       }
-
-       return ret;
-}
-
-/*
- * Build the DMA descriptors needed to send the header and @length bytes
- * of payload.
- *
- * NOTE: DMA mapping is held in the tx until completed in the ring or
- *       the tx desc is freed without having been submitted to the ring
- *
- * When @ahdr carries AHG entries the txreq is initialised with them and
- * no header descriptor is added; otherwise the PBC is stored in the PIO
- * header and the header is added as the first descriptor.
- *
- * Returns 0 on success or the first error reported by a helper.
- */
-/* New API */
-static int build_verbs_tx_desc(
-       struct sdma_engine *sde,
-       struct rvt_sge_state *ss,
-       u32 length,
-       struct verbs_txreq *tx,
-       struct ahg_ib_header *ahdr,
-       u64 pbc)
-{
-       struct hfi1_pio_header *phdr = &tx->phdr;
-       u16 hdrbytes = tx->hdr_dwords << 2;
-       int ret;
-
-       if (ahdr->ahgcount) {
-               ret = sdma_txinit_ahg(&tx->txreq, ahdr->tx_flags, length,
-                                     ahdr->ahgidx, ahdr->ahgcount,
-                                     ahdr->ahgdesc, hdrbytes,
-                                     verbs_sdma_complete);
-               if (ret)
-                       return ret;
-       } else {
-               ret = sdma_txinit_ahg(&tx->txreq, ahdr->tx_flags,
-                                     hdrbytes + length, ahdr->ahgidx,
-                                     0, NULL, 0, verbs_sdma_complete);
-               if (ret)
-                       return ret;
-
-               phdr->pbc = cpu_to_le64(pbc);
-               ret = sdma_txadd_kvaddr(sde->dd, &tx->txreq, phdr, hdrbytes);
-               if (ret)
-                       return ret;
-       }
-
-       /* add the ulp payload - if any.  ss can be NULL for acks */
-       if (!ss)
-               return 0;
-
-       return build_verbs_ulp_payload(sde, ss, length, tx);
-}
-
-/*
- * Send the packet described by @ps through the SDMA engine.
- *
- * If the txreq has not been built yet, a PBC is created when @pbc is 0
- * and the descriptors are built.  A build failure sends the QP to
- * wait_kmem(); if the QP could not be queued there the txreq is freed and
- * ps->s_txreq is cleared.
- *
- * Returns the result of sdma_send_txreq(), except that -ECOMM is reported
- * as 0; on a build failure returns the result of wait_kmem().
- */
-int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
-                       u64 pbc)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-       struct ahg_ib_header *ahdr = priv->s_hdr;
-       u32 hdrwords = qp->s_hdrwords;
-       struct rvt_sge_state *ss = qp->s_cur_sge;
-       u32 len = qp->s_cur_size;
-       /* packet length in dwords, includes pbc */
-       u32 plen = hdrwords + ((len + 3) >> 2) + 2;
-       struct hfi1_ibdev *dev = ps->dev;
-       struct hfi1_pportdata *ppd = ps->ppd;
-       struct verbs_txreq *tx = ps->s_txreq;
-       u64 pbc_flags = 0;
-       u8 sc5 = priv->s_sc;
-       int ret;
-
-       if (!sdma_txreq_built(&tx->txreq)) {
-               if (likely(pbc == 0)) {
-                       /* No vl15 here */
-                       u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);
-
-                       /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
-                       pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
-                       pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps,
-                                        vl, plen);
-               }
-               tx->wqe = qp->s_wqe;
-               ret = build_verbs_tx_desc(tx->sde, ss, len, tx, ahdr, pbc);
-               if (unlikely(ret))
-                       goto bail_build;
-       }
-
-       ret =  sdma_send_txreq(tx->sde, &priv->s_iowait, &tx->txreq);
-       if (unlikely(ret < 0)) {
-               /* on -ECOMM the current one got "sent" */
-               return ret == -ECOMM ? 0 : ret;
-       }
-
-       trace_sdma_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
-                               &ps->s_txreq->phdr.hdr);
-       return ret;
-
-bail_build:
-       ret = wait_kmem(dev, qp, ps);
-       if (ret)
-               return ret;
-
-       /* free txreq - bad state */
-       hfi1_put_txreq(ps->s_txreq);
-       ps->s_txreq = NULL;
-       return 0;
-}
-
-/*
- * Queue the packet's txreq on the QP and put the QP on the send context's
- * piowait list, waiting for the condition named by @flag.
- *
- * @qp:   the QP that has to wait
- * @sc:   the send context whose piowait list is used
- * @ps:   packet state; ps->s_txreq is queued on the QP's tx_head list
- * @flag: RVT_S_WAIT_PIO or RVT_S_WAIT_PIO_DRAIN; OR-ed into qp->s_flags
- *
- * Return: -EBUSY when the txreq was queued (RVT_S_BUSY is cleared).
- * If we are now in the error state, return zero to flush the
- * send work request.
- */
-static int pio_wait(struct rvt_qp *qp,
-                   struct send_context *sc,
-                   struct hfi1_pkt_state *ps,
-                   u32 flag)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-       struct hfi1_devdata *dd = sc->dd;
-       struct hfi1_ibdev *dev = &dd->verbs_dev;
-       unsigned long flags;
-       int ret = 0;
-
-       /*
-        * Note that as soon as hfi1_sc_wantpiobuf_intr() is called and
-        * possibly before it returns, sc_piobufavail()
-        * could be called. Therefore, put QP on the I/O wait list before
-        * enabling the PIO avail interrupt.
-        */
-       spin_lock_irqsave(&qp->s_lock, flags);
-       if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
-               write_seqlock(&dev->iowait_lock);
-               list_add_tail(&ps->s_txreq->txreq.list,
-                             &priv->s_iowait.tx_head);
-               if (list_empty(&priv->s_iowait.list)) {
-                       int was_empty;
-
-                       /*
-                        * Count each sleep exactly once, under the counter
-                        * that matches the reason for waiting.
-                        */
-                       dev->n_piowait += !!(flag & RVT_S_WAIT_PIO);
-                       dev->n_piodrain += !!(flag & RVT_S_WAIT_PIO_DRAIN);
-                       qp->s_flags |= flag;
-                       was_empty = list_empty(&sc->piowait);
-                       list_add_tail(&priv->s_iowait.list, &sc->piowait);
-                       trace_hfi1_qpsleep(qp, RVT_S_WAIT_PIO);
-                       atomic_inc(&qp->refcount);
-                       /* counting: only call wantpiobuf_intr if first user */
-                       if (was_empty)
-                               hfi1_sc_wantpiobuf_intr(sc, 1);
-               }
-               write_sequnlock(&dev->iowait_lock);
-               qp->s_flags &= ~RVT_S_BUSY;
-               ret = -EBUSY;
-       }
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-       return ret;
-}
-
-/*
- * PIO buffer release callback used for RC and UC QPs.
- *
- * @arg:  the rvt_qp that was passed to sc_buffer_alloc()
- * @code: release code (not examined here)
- *
- * Drops one pending-PIO count on the QP's iowait and, when
- * iowait_pio_dec() reports true, wakes whoever waits for the drain.
- */
-static void verbs_pio_complete(void *arg, int code)
-{
-       struct rvt_qp *qp = arg;
-       struct hfi1_qp_priv *priv = qp->priv;
-
-       if (!iowait_pio_dec(&priv->s_iowait))
-               return;
-
-       iowait_drain_wakeup(&priv->s_iowait);
-}
-
-/*
- * Send the packet described by @ps through a PIO send buffer.
- *
- * @qp:  the QP to send on
- * @ps:  packet state; ps->s_txreq supplies the header and send context
- * @pbc: PBC to use, or 0 to have one created here
- *
- * If no PIO buffer is available and the link is not HLS_UP_ACTIVE the
- * request is completed with IB_WC_GENERAL_ERR; if the link is active the
- * QP is queued with pio_wait() and that result is returned.  Otherwise
- * the header and payload are copied into the buffer and the request is
- * completed.
- *
- * Returns 0 once the txreq has been released here, or the non-zero
- * result of pio_wait() when the txreq was queued for later.
- */
-int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
-                       u64 pbc)
-{
-       struct hfi1_qp_priv *priv = qp->priv;
-       u32 hdrwords = qp->s_hdrwords;
-       struct rvt_sge_state *ss = qp->s_cur_sge;
-       u32 len = qp->s_cur_size;
-       u32 dwords = (len + 3) >> 2;
-       u32 plen = hdrwords + dwords + 2; /* includes pbc */
-       struct hfi1_pportdata *ppd = ps->ppd;
-       u32 *hdr = (u32 *)&ps->s_txreq->phdr.hdr;
-       u64 pbc_flags = 0;
-       u8 sc5;
-       unsigned long flags = 0;
-       struct send_context *sc;
-       struct pio_buf *pbuf;
-       int wc_status = IB_WC_SUCCESS;
-       int ret = 0;
-       pio_release_cb cb = NULL;
-
-       /* only RC/UC use complete */
-       switch (qp->ibqp.qp_type) {
-       case IB_QPT_RC:
-       case IB_QPT_UC:
-               cb = verbs_pio_complete;
-               break;
-       default:
-               break;
-       }
-
-       /* vl15 special case taken care of in ud.c */
-       sc5 = priv->s_sc;
-       sc = ps->s_txreq->psc;
-
-       if (likely(pbc == 0)) {
-               u8 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);
-               /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
-               pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
-               pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
-       }
-       /* account for the buffer before it can possibly be released */
-       if (cb)
-               iowait_pio_inc(&priv->s_iowait);
-       pbuf = sc_buffer_alloc(sc, plen, cb, qp);
-       if (unlikely(!pbuf)) {
-               /* no buffer, so undo the pending-PIO accounting above */
-               if (cb)
-                       verbs_pio_complete(qp, 0);
-               if (ppd->host_link_state != HLS_UP_ACTIVE) {
-                       /*
-                        * If we have filled the PIO buffers to capacity and are
-                        * not in an active state this request is not going to
-                        * go out to so just complete it with an error or else a
-                        * ULP or the core may be stuck waiting.
-                        */
-                       hfi1_cdbg(
-                               PIO,
-                               "alloc failed. state not active, completing");
-                       wc_status = IB_WC_GENERAL_ERR;
-                       goto pio_bail;
-               } else {
-                       /*
-                        * This is a normal occurrence. The PIO buffs are full
-                        * up but we are still happily sending, well we could be
-                        * so lets continue to queue the request.
-                        */
-                       hfi1_cdbg(PIO, "alloc failed. state active, queuing");
-                       ret = pio_wait(qp, sc, ps, RVT_S_WAIT_PIO);
-                       if (!ret)
-                               /* txreq not queued - free */
-                               goto bail;
-                       /* tx consumed in wait */
-                       return ret;
-               }
-       }
-
-       if (len == 0) {
-               pio_copy(ppd->dd, pbuf, pbc, hdr, hdrwords);
-       } else if (ss) {
-               seg_pio_copy_start(pbuf, pbc, hdr, hdrwords * 4);
-               while (len) {
-                       void *addr = ss->sge.vaddr;
-                       u32 slen = ss->sge.length;
-
-                       if (slen > len)
-                               slen = len;
-                       /*
-                        * Never step past the end of the current SGE;
-                        * update_sge() subtracts slen from sge_length.
-                        */
-                       if (slen > ss->sge.sge_length)
-                               slen = ss->sge.sge_length;
-                       update_sge(ss, slen);
-                       seg_pio_copy_mid(pbuf, addr, slen);
-                       len -= slen;
-               }
-               seg_pio_copy_end(pbuf);
-       }
-
-       trace_pio_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
-                              &ps->s_txreq->phdr.hdr);
-
-pio_bail:
-       if (qp->s_wqe) {
-               spin_lock_irqsave(&qp->s_lock, flags);
-               hfi1_send_complete(qp, qp->s_wqe, wc_status);
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-       } else if (qp->ibqp.qp_type == IB_QPT_RC) {
-               spin_lock_irqsave(&qp->s_lock, flags);
-               hfi1_rc_send_complete(qp, &ps->s_txreq->phdr.hdr);
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-       }
-
-       ret = 0;
-
-bail:
-       hfi1_put_txreq(ps->s_txreq);
-       return ret;
-}
-
-/*
- * egress_pkey_matches_entry - return 1 if the pkey matches ent (ent
- * being an entry from the partition key table), return 0
- * otherwise. Use the matching criteria for egress partition keys
- * specified in the OPAv1 spec., section 9.1l.7.
- *
- * The low 15 bits must be equal.  In addition, when the membership bit
- * of @pkey is set, the membership bit of @ent must be set as well.
- */
-static inline int egress_pkey_matches_entry(u16 pkey, u16 ent)
-{
-       /* different partition: no match */
-       if ((pkey & PKEY_LOW_15_MASK) != (ent & PKEY_LOW_15_MASK))
-               return 0;
-
-       /* membership bit clear in pkey: the base match is enough */
-       if (!(pkey & PKEY_MEMBER_MASK))
-               return 1;
-
-       /* membership bit set in pkey: the table entry must have it too */
-       return !!(ent & PKEY_MEMBER_MASK);
-}
-
-/**
- * egress_pkey_check - check P_KEY of a packet
- * @ppd:    Physical IB port data
- * @lrh: Local route header
- * @bth: Base transport header
- * @sc5:    SC for packet
- * @s_pkey_index: It will be used for look up optimization for kernel contexts
- * only. If it is negative value, then it means user contexts is calling this
- * function.
- *
- * The check is skipped (success) unless HFI1_PART_ENFORCE_OUT is set in
- * ppd->part_enforce.  The pkey is taken from the low 16 bits of bth[0].
- * On a kernel-context failure the xmit constraint error counter is
- * incremented and, if no error info is latched yet, the SLID (lrh[3]) and
- * pkey are recorded.
- *
- * Return: 0 on success, otherwise, 1
- */
-int egress_pkey_check(struct hfi1_pportdata *ppd, __be16 *lrh, __be32 *bth,
-                     u8 sc5, int8_t s_pkey_index)
-{
-       const int from_user_ctxt = (s_pkey_index < 0);
-       struct hfi1_devdata *dd;
-       u16 pkey;
-       int idx;
-
-       if (!(ppd->part_enforce & HFI1_PART_ENFORCE_OUT))
-               return 0;
-
-       pkey = (u16)be32_to_cpu(bth[0]);
-
-       /* If SC15, pkey[0:14] must be 0x7fff */
-       if (sc5 == 0xf && (pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK)
-               goto reject;
-
-       /* Is the pkey = 0x0, or 0x8000? */
-       if (!(pkey & PKEY_LOW_15_MASK))
-               goto reject;
-
-       /*
-        * For the kernel contexts only, if a qp is passed into the function,
-        * the most likely matching pkey has index qp->s_pkey_index
-        */
-       if (!from_user_ctxt &&
-           egress_pkey_matches_entry(pkey, ppd->pkeys[s_pkey_index]))
-               return 0;
-
-       /* fall back to scanning the whole table */
-       for (idx = 0; idx < MAX_PKEY_VALUES; idx++)
-               if (egress_pkey_matches_entry(pkey, ppd->pkeys[idx]))
-                       return 0;
-
-reject:
-       /*
-        * For the user-context mechanism, the P_KEY check would only happen
-        * once per SDMA request, not once per packet.  Therefore, there's no
-        * need to increment the counter for the user-context mechanism.
-        */
-       if (from_user_ctxt)
-               return 1;
-
-       incr_cntr64(&ppd->port_xmit_constraint_errors);
-       dd = ppd->dd;
-       /* only the first error is latched until the status bit is cleared */
-       if (!(dd->err_info_xmit_constraint.status & OPA_EI_STATUS_SMASK)) {
-               u16 slid = be16_to_cpu(lrh[3]);
-
-               dd->err_info_xmit_constraint.status |= OPA_EI_STATUS_SMASK;
-               dd->err_info_xmit_constraint.slid = slid;
-               dd->err_info_xmit_constraint.pkey = pkey;
-       }
-       return 1;
-}
-
-/**
- * get_send_routine - choose an egress routine
- * @qp: the QP the packet is sent on
- * @tx: the verbs txreq holding the packet header
- *
- * Choose an egress routine based on QP type
- * and size.
- *
- * PIO is used when the device has no send DMA and always for SMI QPs.
- * RC and UC QPs use PIO only when piothreshold is non-zero, the current
- * packet is no larger than min(piothreshold, pmtu), the opcode is in the
- * per-type opcode mask, no SDMA is pending on the QP and the txreq has
- * not already been built.  Everything else uses SDMA.
- */
-static inline send_routine get_send_routine(struct rvt_qp *qp,
-                                           struct verbs_txreq *tx)
-{
-       struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
-       struct hfi1_qp_priv *priv = qp->priv;
-       struct hfi1_ib_header *h = &tx->phdr.hdr;
-       unsigned long pio_opcodes;
-
-       if (unlikely(!(dd->flags & HFI1_HAS_SEND_DMA)))
-               return dd->process_pio_send;
-
-       switch (qp->ibqp.qp_type) {
-       case IB_QPT_SMI:
-               return dd->process_pio_send;
-       case IB_QPT_RC:
-               pio_opcodes = rc_only_opcode;
-               break;
-       case IB_QPT_UC:
-               pio_opcodes = uc_only_opcode;
-               break;
-       default:
-               /* GSI, UD and any other type always go through SDMA */
-               return dd->process_dma_send;
-       }
-
-       if (piothreshold &&
-           qp->s_cur_size <= min(piothreshold, qp->pmtu) &&
-           (BIT(get_opcode(h) & 0x1f) & pio_opcodes) &&
-           iowait_sdma_pending(&priv->s_iowait) == 0 &&
-           !sdma_txreq_built(&tx->txreq))
-               return dd->process_pio_send;
-
-       return dd->process_dma_send;
-}
-
-/**
- * hfi1_verbs_send - send a packet
- * @qp: the QP to send on
- * @ps: the state of the packet to send
- *
- * Picks the egress routine, runs the egress P_KEY check on the packet
- * header and then hands the packet to the chosen routine.  A packet that
- * would go out by SDMA while PIO sends are still pending on the QP is
- * queued with pio_wait() instead.
- *
- * Return: -EINVAL when the P_KEY check fails, otherwise the result of
- * pio_wait() or of the selected send routine.
- */
-int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
-{
-       struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
-       struct hfi1_qp_priv *priv = qp->priv;
-       struct hfi1_ib_header *hdr = &ps->s_txreq->phdr.hdr;
-       struct hfi1_other_headers *ohdr;
-       send_routine sr;
-       u8 lnh;
-
-       /* locate the pkey within the headers */
-       lnh = be16_to_cpu(hdr->lrh[0]) & 3;
-       ohdr = (lnh == HFI1_LRH_GRH) ? &hdr->u.l.oth : &hdr->u.oth;
-
-       sr = get_send_routine(qp, ps->s_txreq);
-       if (unlikely(egress_pkey_check(dd->pport, hdr->lrh, ohdr->bth,
-                                      priv->s_sc, qp->s_pkey_index))) {
-               unsigned long flags;
-
-               /*
-                * The value we are returning here does not get propagated to
-                * the verbs caller. Thus we need to complete the request with
-                * error otherwise the caller could be sitting waiting on the
-                * completion event. Only do this for PIO. SDMA has its own
-                * mechanism for handling the errors. So for SDMA we can just
-                * return.
-                */
-               if (sr != dd->process_pio_send)
-                       return -EINVAL;
-
-               hfi1_cdbg(PIO, "%s() Failed. Completing with err",
-                         __func__);
-               spin_lock_irqsave(&qp->s_lock, flags);
-               hfi1_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-               return -EINVAL;
-       }
-
-       /* let outstanding PIO sends drain before switching to SDMA */
-       if (sr == dd->process_dma_send && iowait_pio_pending(&priv->s_iowait))
-               return pio_wait(qp, ps->s_txreq->psc, ps,
-                               RVT_S_WAIT_PIO_DRAIN);
-
-       return sr(qp, ps, 0);
-}
-
-/**
- * hfi1_fill_device_attr - Fill in rvt dev info device attributes.
- * @dd: the device data structure
- *
- * Zeroes rdi->dparms.props and then sets the capability flags, identity
- * fields and resource limits from the device data and the hfi1_max_*
- * settings.
- */
-static void hfi1_fill_device_attr(struct hfi1_devdata *dd)
-{
-       struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
-       struct ib_device_attr *props = &rdi->dparms.props;
-
-       memset(props, 0, sizeof(*props));
-
-       /* capabilities and identity */
-       props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
-                       IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
-                       IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
-                       IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE;
-       props->page_size_cap = PAGE_SIZE;
-       props->vendor_id = dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3;
-       props->vendor_part_id = dd->pcidev->device;
-       props->hw_ver = dd->minrev;
-       props->sys_image_guid = ib_hfi1_sys_image_guid;
-       props->atomic_cap = IB_ATOMIC_GLOB;
-
-       /* memory registration */
-       props->max_mr_size = ~0ULL;
-       props->max_mr = rdi->lkey_table.max;
-       props->max_fmr = rdi->lkey_table.max;
-       props->max_map_per_fmr = 32767;
-       props->max_pd = hfi1_max_pds;
-
-       /* queue pairs, completion queues and address handles */
-       props->max_qp = hfi1_max_qps;
-       props->max_qp_wr = hfi1_max_qp_wrs;
-       props->max_sge = hfi1_max_sges;
-       props->max_sge_rd = hfi1_max_sges;
-       props->max_cq = hfi1_max_cqs;
-       props->max_ah = hfi1_max_ahs;
-       props->max_cqe = hfi1_max_cqes;
-       props->max_qp_rd_atom = HFI1_MAX_RDMA_ATOMIC;
-       props->max_qp_init_rd_atom = 255;
-
-       /* shared receive queues */
-       props->max_srq = hfi1_max_srqs;
-       props->max_srq_wr = hfi1_max_srq_wrs;
-       props->max_srq_sge = hfi1_max_srq_sges;
-
-       /* partition keys and multicast */
-       props->max_pkeys = hfi1_get_npkeys(dd);
-       props->max_mcast_grp = hfi1_max_mcast_grps;
-       props->max_mcast_qp_attach = hfi1_max_mcast_qp_attached;
-       props->max_total_mcast_qp_attach =
-               props->max_mcast_qp_attach * props->max_mcast_grp;
-}
-
-/*
- * Convert a mask of OPA link speeds to the matching IB speed bits:
- * OPA_LINK_SPEED_25G selects IB_SPEED_EDR and OPA_LINK_SPEED_12_5G
- * selects IB_SPEED_FDR.  Any other bits in @in are ignored.
- */
-static inline u16 opa_speed_to_ib(u16 in)
-{
-       u16 ib_speed = 0;
-
-       if (in & OPA_LINK_SPEED_12_5G)
-               ib_speed |= IB_SPEED_FDR;
-       if (in & OPA_LINK_SPEED_25G)
-               ib_speed |= IB_SPEED_EDR;
-
-       return ib_speed;
-}
-
-/*
- * Convert a single OPA link width (no multiple flags) to an IB value.
- * A zero OPA link width means link down, which means the IB width value
- * is a don't care.
- *
- * 1X, 2X and 3X all map to IB_WIDTH_1X (2x and 3x don't exist in IB);
- * 4X and every other value map to IB_WIDTH_4X, our largest width.
- */
-static inline u16 opa_width_to_ib(u16 in)
-{
-       if (in == OPA_LINK_WIDTH_1X ||
-           in == OPA_LINK_WIDTH_2X ||
-           in == OPA_LINK_WIDTH_3X)
-               return IB_WIDTH_1X;
-
-       /* 4X, link down or unknown */
-       return IB_WIDTH_4X;
-}
-
-/*
- * Fill @props with the state of port @port_num (1-based; it is used as
- * dd->pport[port_num - 1] with no range check here).
- *
- * Always returns 0.
- */
-static int query_port(struct rvt_dev_info *rdi, u8 port_num,
-                     struct ib_port_attr *props)
-{
-       struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
-       struct hfi1_devdata *dd = dd_from_dev(verbs_dev);
-       struct hfi1_pportdata *ppd = &dd->pport[port_num - 1];
-
-       props->lid = ppd->lid;
-       props->lmc = ppd->lmc;
-       /* OPA logical states match IB logical states */
-       props->state = driver_lstate(ppd);
-       props->phys_state = hfi1_ibphys_portstate(ppd);
-       props->gid_tbl_len = HFI1_GUIDS_PER_PORT;
-       props->active_width = (u8)opa_width_to_ib(ppd->link_width_active);
-       /* see rate_show() in ib core/sysfs.c */
-       props->active_speed = (u8)opa_speed_to_ib(ppd->link_speed_active);
-       props->max_vl_num = ppd->vls_supported;
-
-       /* Once we are a "first class" citizen and have added the OPA MTUs to
-        * the core we can advertise the larger MTU enum to the ULPs, for now
-        * advertise only 4K.
-        *
-        * Those applications which are either OPA aware or pass the MTU enum
-        * from the Path Records to us will get the new 8k MTU.  Those that
-        * attempt to process the MTU enum may fail in various ways.
-        */
-       if (valid_ib_mtu(hfi1_max_mtu))
-               props->max_mtu = mtu_to_enum(hfi1_max_mtu, IB_MTU_4096);
-       else
-               props->max_mtu = mtu_to_enum(4096, IB_MTU_4096);
-
-       /* a port MTU that is not a valid IB MTU is reported as max_mtu */
-       if (valid_ib_mtu(ppd->ibmtu))
-               props->active_mtu = mtu_to_enum(ppd->ibmtu, IB_MTU_2048);
-       else
-               props->active_mtu = props->max_mtu;
-
-       return 0;
-}
-
-/*
- * ib_device modify_device hook.
- *
- * Only IB_DEVICE_MODIFY_SYS_IMAGE_GUID and IB_DEVICE_MODIFY_NODE_DESC are
- * accepted; any other mask bit yields -EOPNOTSUPP with nothing changed.
- * After each change every port is told about it through
- * hfi1_node_desc_chg() or hfi1_sys_guid_chg().
- *
- * Returns 0 on success.
- */
-static int modify_device(struct ib_device *device,
-                        int device_modify_mask,
-                        struct ib_device_modify *device_modify)
-{
-       struct hfi1_devdata *dd = dd_from_ibdev(device);
-       unsigned port;
-
-       if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID |
-                                  IB_DEVICE_MODIFY_NODE_DESC))
-               return -EOPNOTSUPP;
-
-       if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) {
-               memcpy(device->node_desc, device_modify->node_desc, 64);
-               for (port = 0; port < dd->num_pports; port++)
-                       hfi1_node_desc_chg(&dd->pport[port].ibport_data);
-       }
-
-       if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) {
-               ib_hfi1_sys_image_guid =
-                       cpu_to_be64(device_modify->sys_image_guid);
-               for (port = 0; port < dd->num_pports; port++)
-                       hfi1_sys_guid_chg(&dd->pport[port].ibport_data);
-       }
-
-       return 0;
-}
-
-/*
- * Take port @port_num (1-based) down: record
- * OPA_LINKDOWN_REASON_UNKNOWN as the link down reason and request the
- * HLS_DN_DOWNDEF link state.
- *
- * Returns the result of set_link_state().
- */
-static int shut_down_port(struct rvt_dev_info *rdi, u8 port_num)
-{
-       struct hfi1_devdata *dd = dd_from_dev(dev_from_rdi(rdi));
-       struct hfi1_pportdata *ppd = &dd->pport[port_num - 1];
-
-       set_link_down_reason(ppd, OPA_LINKDOWN_REASON_UNKNOWN, 0,
-                            OPA_LINKDOWN_REASON_UNKNOWN);
-
-       return set_link_state(ppd, HLS_DN_DOWNDEF);
-}
-
-/*
- * Return the GUID at @guid_index of a port, in big-endian form.
- *
- * @rdi:        rvt device info (not used here)
- * @rvp:        the rvt port embedded in the hfi1_ibport
- * @guid_index: 0 selects the port GUID (ppd->guid); 1 up to
- *              HFI1_GUIDS_PER_PORT - 1 select ibp->guids[guid_index - 1]
- * @guid:       where the result is stored
- *
- * Returns 0 on success, or -EINVAL when @guid_index is negative or not
- * below HFI1_GUIDS_PER_PORT; *@guid is left untouched in that case.
- */
-static int hfi1_get_guid_be(struct rvt_dev_info *rdi, struct rvt_ibport *rvp,
-                           int guid_index, __be64 *guid)
-{
-       struct hfi1_ibport *ibp = container_of(rvp, struct hfi1_ibport, rvp);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-
-       /* a negative index would read before the start of guids[] */
-       if (guid_index < 0 || guid_index >= HFI1_GUIDS_PER_PORT)
-               return -EINVAL;
-
-       if (guid_index == 0)
-               *guid = cpu_to_be64(ppd->guid);
-       else
-               *guid = ibp->guids[guid_index - 1];
-
-       return 0;
-}
-
-/*
- * convert ah port,sl to sc
- *
- * Looks up ah->sl in the SL-to-SC table of the port named by
- * ah->port_num.  The SL is not range-checked here.
- */
-u8 ah_to_sc(struct ib_device *ibdev, struct ib_ah_attr *ah)
-{
-       return to_iport(ibdev, ah->port_num)->sl_to_sc[ah->sl];
-}
-
-/*
- * Validate the address handle attributes before an AH is created.
- *
- * @ibdev:   the IB device
- * @ah_attr: the requested attributes; port_num and sl are examined
- *
- * Returns -EINVAL when the SL lies outside the port's SL-to-SC table, or
- * when the SC it maps to resolves to a VL that is greater than num_vls
- * and is not 0xf.  Returns 0 otherwise.
- */
-static int hfi1_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr)
-{
-       struct hfi1_ibport *ibp;
-       struct hfi1_pportdata *ppd;
-       struct hfi1_devdata *dd;
-       u8 sc5;
-
-       ibp = to_iport(ibdev, ah_attr->port_num);
-
-       /* never index the SL-to-SC table with an out-of-range SL */
-       if (ah_attr->sl >= ARRAY_SIZE(ibp->sl_to_sc))
-               return -EINVAL;
-
-       /* test the mapping for validity */
-       ppd = ppd_from_ibp(ibp);
-       sc5 = ibp->sl_to_sc[ah_attr->sl];
-       dd = dd_from_ppd(ppd);
-       /*
-        * NOTE(review): hfi1_notify_new_ah() treats only vl < num_vls (or
-        * 15) as usable, while vl == num_vls is accepted here - confirm
-        * which bound is intended.
-        */
-       if (sc_to_vlt(dd, sc5) > num_vls && sc_to_vlt(dd, sc5) != 0xf)
-               return -EINVAL;
-       return 0;
-}
-
-/*
- * Driver hook invoked while a new address handle is being set up.
- *
- * @ibdev:   the IB device
- * @ah_attr: the attributes the AH is being created with
- * @ah:      the AH under construction; only written to here
- *
- * Stores the VL that the SL maps to in ah->vl and, when that VL is below
- * num_vls or is 15, stores log2 of the VL's MTU in ah->log_pmtu.
- */
-static void hfi1_notify_new_ah(struct ib_device *ibdev,
-                              struct ib_ah_attr *ah_attr,
-                              struct rvt_ah *ah)
-{
-       struct hfi1_ibport *ibp;
-       struct hfi1_pportdata *ppd;
-       struct hfi1_devdata *dd;
-       u8 sc5;
-
-       /*
-        * Do not trust reading anything from rvt_ah at this point as it is not
-        * done being setup. We can however modify things which we need to set.
-        * The SL therefore comes from ah_attr, like the port number does.
-        */
-
-       ibp = to_iport(ibdev, ah_attr->port_num);
-       ppd = ppd_from_ibp(ibp);
-       sc5 = ibp->sl_to_sc[ah_attr->sl];
-       dd = dd_from_ppd(ppd);
-       ah->vl = sc_to_vlt(dd, sc5);
-       if (ah->vl < num_vls || ah->vl == 15)
-               ah->log_pmtu = ilog2(dd->vld[ah->vl].mtu);
-}
-
-/*
- * Create an address handle for @dlid on the port of @ibp, using the
- * protection domain of the port's QP0.
- *
- * All other attributes are zero.  QP0 is looked up under
- * rcu_read_lock().
- *
- * Returns the result of ib_create_ah(), or ERR_PTR(-EINVAL) when the
- * port has no QP0.
- */
-struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid)
-{
-       struct ib_ah_attr attr;
-       struct rvt_qp *qp0;
-       struct ib_ah *ah;
-
-       memset(&attr, 0, sizeof(attr));
-       attr.dlid = dlid;
-       attr.port_num = ppd_from_ibp(ibp)->port;
-
-       rcu_read_lock();
-       qp0 = rcu_dereference(ibp->rvp.qp[0]);
-       if (!qp0)
-               ah = ERR_PTR(-EINVAL);
-       else
-               ah = ib_create_ah(qp0->ibqp.pd, &attr);
-       rcu_read_unlock();
-
-       return ah;
-}
-
-/**
- * hfi1_get_npkeys - return the size of the PKEY table for context 0
- * @dd: the hfi1_ib device
- *
- * The value is the number of elements in the pkeys array of the first
- * port; no device state is read.
- */
-unsigned hfi1_get_npkeys(struct hfi1_devdata *dd)
-{
-       return ARRAY_SIZE(dd->pport->pkeys);
-}
-
-/*
- * Set the verbs state of one port to its defaults: identity SL<->SC
- * maps, default GID prefix, SM LID 0, the port capability flags, the
- * five PMA counter selections, and no QP0/QP1.
- */
-static void init_ibport(struct hfi1_pportdata *ppd)
-{
-       struct hfi1_ibport *ibp = &ppd->ibport_data;
-       struct rvt_ibport *rvp = &ibp->rvp;
-       size_t idx;
-
-       /* both maps start out as the identity, sized by sl_to_sc */
-       for (idx = 0; idx < ARRAY_SIZE(ibp->sl_to_sc); idx++) {
-               ibp->sl_to_sc[idx] = idx;
-               ibp->sc_to_sl[idx] = idx;
-       }
-
-       spin_lock_init(&rvp->lock);
-       /* Set the prefix to the default value (see ch. 4.1.1) */
-       rvp->gid_prefix = IB_DEFAULT_GID_PREFIX;
-       rvp->sm_lid = 0;
-       /* Below should only set bits defined in OPA PortInfo.CapabilityMask */
-       rvp->port_cap_flags = IB_PORT_AUTO_MIGR_SUP |
-               IB_PORT_CAP_MASK_NOTICE_SUP;
-
-       rvp->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
-       rvp->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
-       rvp->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
-       rvp->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
-       rvp->pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT;
-
-       RCU_INIT_POINTER(rvp->qp[0], NULL);
-       RCU_INIT_POINTER(rvp->qp[1], NULL);
-}
-
-/**
- * hfi1_register_ib_device - register our device with the infiniband core
- * @dd: the device data structure
- * Return 0 if successful, errno if unsuccessful.
- */
-int hfi1_register_ib_device(struct hfi1_devdata *dd)
-{
-       struct hfi1_ibdev *dev = &dd->verbs_dev;
-       struct ib_device *ibdev = &dev->rdi.ibdev;
-       struct hfi1_pportdata *ppd = dd->pport;
-       unsigned i;
-       int ret;
-       size_t lcpysz = IB_DEVICE_NAME_MAX;
-
-       for (i = 0; i < dd->num_pports; i++)
-               init_ibport(ppd + i);
-
-       /* Only need to initialize non-zero fields. */
-
-       setup_timer(&dev->mem_timer, mem_timer, (unsigned long)dev);
-
-       seqlock_init(&dev->iowait_lock);
-       INIT_LIST_HEAD(&dev->txwait);
-       INIT_LIST_HEAD(&dev->memwait);
-
-       ret = verbs_txreq_init(dev);
-       if (ret)
-               goto err_verbs_txreq;
-
-       /*
-        * The system image GUID is supposed to be the same for all
-        * HFIs in a single system but since there can be other
-        * device types in the system, we can't be sure this is unique.
-        */
-       if (!ib_hfi1_sys_image_guid)
-               ib_hfi1_sys_image_guid = cpu_to_be64(ppd->guid);
-       /* Build the name template "<class_name>_%d" in two bounded copies. */
-       lcpysz = strlcpy(ibdev->name, class_name(), lcpysz);
-       strlcpy(ibdev->name + lcpysz, "_%d", IB_DEVICE_NAME_MAX - lcpysz);
-       ibdev->owner = THIS_MODULE;
-       ibdev->node_guid = cpu_to_be64(ppd->guid);
-       ibdev->phys_port_cnt = dd->num_pports;
-       ibdev->dma_device = &dd->pcidev->dev;
-       ibdev->modify_device = modify_device;
-
-       /* keep process mad in the driver */
-       ibdev->process_mad = hfi1_process_mad;
-
-       /*
-        * strlcpy() always NUL-terminates; strncpy() would leave node_desc
-        * unterminated when the node name fills the whole buffer.
-        */
-       strlcpy(ibdev->node_desc, init_utsname()->nodename,
-               sizeof(ibdev->node_desc));
-
-       /*
-        * Fill in rvt info object.
-        */
-       dd->verbs_dev.rdi.driver_f.port_callback = hfi1_create_port_files;
-       dd->verbs_dev.rdi.driver_f.get_card_name = get_card_name;
-       dd->verbs_dev.rdi.driver_f.get_pci_dev = get_pci_dev;
-       dd->verbs_dev.rdi.driver_f.check_ah = hfi1_check_ah;
-       dd->verbs_dev.rdi.driver_f.notify_new_ah = hfi1_notify_new_ah;
-       dd->verbs_dev.rdi.driver_f.get_guid_be = hfi1_get_guid_be;
-       dd->verbs_dev.rdi.driver_f.query_port_state = query_port;
-       dd->verbs_dev.rdi.driver_f.shut_down_port = shut_down_port;
-       dd->verbs_dev.rdi.driver_f.cap_mask_chg = hfi1_cap_mask_chg;
-       /*
-        * Fill in rvt info device attributes.
-        */
-       hfi1_fill_device_attr(dd);
-
-       /* queue pair */
-       dd->verbs_dev.rdi.dparms.qp_table_size = hfi1_qp_table_size;
-       dd->verbs_dev.rdi.dparms.qpn_start = 0;
-       dd->verbs_dev.rdi.dparms.qpn_inc = 1;
-       dd->verbs_dev.rdi.dparms.qos_shift = dd->qos_shift;
-       /* reserved QPN window: 64K numbers starting at kdeth_qp << 16 */
-       dd->verbs_dev.rdi.dparms.qpn_res_start = kdeth_qp << 16;
-       dd->verbs_dev.rdi.dparms.qpn_res_end =
-               dd->verbs_dev.rdi.dparms.qpn_res_start + 65535;
-       dd->verbs_dev.rdi.dparms.max_rdma_atomic = HFI1_MAX_RDMA_ATOMIC;
-       dd->verbs_dev.rdi.dparms.psn_mask = PSN_MASK;
-       dd->verbs_dev.rdi.dparms.psn_shift = PSN_SHIFT;
-       dd->verbs_dev.rdi.dparms.psn_modify_mask = PSN_MODIFY_MASK;
-       dd->verbs_dev.rdi.dparms.core_cap_flags = RDMA_CORE_PORT_INTEL_OPA;
-       dd->verbs_dev.rdi.dparms.max_mad_size = OPA_MGMT_MAD_SIZE;
-
-       dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qp_priv_alloc;
-       dd->verbs_dev.rdi.driver_f.qp_priv_free = qp_priv_free;
-       dd->verbs_dev.rdi.driver_f.free_all_qps = free_all_qps;
-       dd->verbs_dev.rdi.driver_f.notify_qp_reset = notify_qp_reset;
-       dd->verbs_dev.rdi.driver_f.do_send = hfi1_do_send;
-       dd->verbs_dev.rdi.driver_f.schedule_send = hfi1_schedule_send;
-       dd->verbs_dev.rdi.driver_f.schedule_send_no_lock = _hfi1_schedule_send;
-       dd->verbs_dev.rdi.driver_f.get_pmtu_from_attr = get_pmtu_from_attr;
-       dd->verbs_dev.rdi.driver_f.notify_error_qp = notify_error_qp;
-       dd->verbs_dev.rdi.driver_f.flush_qp_waiters = flush_qp_waiters;
-       dd->verbs_dev.rdi.driver_f.stop_send_queue = stop_send_queue;
-       dd->verbs_dev.rdi.driver_f.quiesce_qp = quiesce_qp;
-       dd->verbs_dev.rdi.driver_f.mtu_from_qp = mtu_from_qp;
-       dd->verbs_dev.rdi.driver_f.mtu_to_path_mtu = mtu_to_path_mtu;
-       dd->verbs_dev.rdi.driver_f.check_modify_qp = hfi1_check_modify_qp;
-       dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp;
-       dd->verbs_dev.rdi.driver_f.check_send_wqe = hfi1_check_send_wqe;
-
-       /* completion queue */
-       snprintf(dd->verbs_dev.rdi.dparms.cq_name,
-                sizeof(dd->verbs_dev.rdi.dparms.cq_name),
-                "hfi1_cq%d", dd->unit);
-       dd->verbs_dev.rdi.dparms.node = dd->node;
-
-       /* misc settings */
-       dd->verbs_dev.rdi.flags = 0; /* Let rdmavt handle it all */
-       dd->verbs_dev.rdi.dparms.lkey_table_size = hfi1_lkey_table_size;
-       dd->verbs_dev.rdi.dparms.nports = dd->num_pports;
-       dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd);
-
-       ppd = dd->pport;
-       for (i = 0; i < dd->num_pports; i++, ppd++)
-               rvt_init_port(&dd->verbs_dev.rdi,
-                             &ppd->ibport_data.rvp,
-                             i,
-                             ppd->pkeys);
-
-       ret = rvt_register_device(&dd->verbs_dev.rdi);
-       if (ret)
-               goto err_verbs_txreq;
-
-       ret = hfi1_verbs_register_sysfs(dd);
-       if (ret)
-               goto err_class;
-
-       return ret;
-
-       /* Unwind in reverse order of setup; ret holds the failing errno. */
-err_class:
-       rvt_unregister_device(&dd->verbs_dev.rdi);
-err_verbs_txreq:
-       verbs_txreq_exit(dev);
-       dd_dev_err(dd, "cannot register verbs: %d!\n", -ret);
-       return ret;
-}
-
-void hfi1_unregister_ib_device(struct hfi1_devdata *dd)
-{
-       struct hfi1_ibdev *ibd = &dd->verbs_dev;
-
-       /* sysfs entries go first, then the rdmavt registration itself */
-       hfi1_verbs_unregister_sysfs(dd);
-       rvt_unregister_device(&ibd->rdi);
-
-       /* Report any waiter still queued on either list. */
-       if (!list_empty(&ibd->txwait))
-               dd_dev_err(dd, "txwait list not empty!\n");
-       if (!list_empty(&ibd->memwait))
-               dd_dev_err(dd, "memwait list not empty!\n");
-
-       del_timer_sync(&ibd->mem_timer);
-       verbs_txreq_exit(ibd);
-}
-
-void hfi1_cnp_rcv(struct hfi1_packet *packet)
-{
-       struct rvt_qp *qp = packet->qp;
-       struct hfi1_ib_header *hdr = packet->hdr;
-       struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-       bool sc4_set = has_sc4_bit(packet);
-       u32 rqpn = 0;
-       u16 rlid = 0;
-       u8 svc_type, sc5;
-
-       switch (qp->ibqp.qp_type) {
-       case IB_QPT_UC:
-       case IB_QPT_RC:
-               /* UC and RC report the QP's remote LID and remote QPN */
-               rlid = qp->remote_ah_attr.dlid;
-               rqpn = qp->remote_qpn;
-               svc_type = (qp->ibqp.qp_type == IB_QPT_UC) ?
-                          IB_CC_SVCTYPE_UC : IB_CC_SVCTYPE_RC;
-               break;
-       case IB_QPT_SMI:
-       case IB_QPT_GSI:
-       case IB_QPT_UD:
-               /* rlid and rqpn stay 0 for these QP types */
-               svc_type = IB_CC_SVCTYPE_UD;
-               break;
-       default:
-               /* any other QP type: count a drop and stop */
-               ibp->rvp.n_pkt_drops++;
-               return;
-       }
-
-       /* SC[3:0] comes from the top nibble of lrh[0]; bit 4 from sc4_set */
-       sc5 = ((be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf) | (sc4_set << 4);
-
-       process_becn(ppd, ibp->sc_to_sl[sc5], rlid, qp->ibqp.qp_num, rqpn,
-                    svc_type);
-}
diff --git a/drivers/staging/rdma/hfi1/verbs.h b/drivers/staging/rdma/hfi1/verbs.h
deleted file mode 100644 (file)
index 3ee2239..0000000
+++ /dev/null
@@ -1,530 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifndef HFI1_VERBS_H
-#define HFI1_VERBS_H
-
-#include <linux/types.h>
-#include <linux/seqlock.h>
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/kref.h>
-#include <linux/workqueue.h>
-#include <linux/kthread.h>
-#include <linux/completion.h>
-#include <linux/slab.h>
-#include <rdma/ib_pack.h>
-#include <rdma/ib_user_verbs.h>
-#include <rdma/ib_mad.h>
-#include <rdma/rdma_vt.h>
-#include <rdma/rdmavt_qp.h>
-#include <rdma/rdmavt_cq.h>
-
-struct hfi1_ctxtdata;
-struct hfi1_pportdata;
-struct hfi1_devdata;
-struct hfi1_packet;
-
-#include "iowait.h"
-
-#define HFI1_MAX_RDMA_ATOMIC     16
-#define HFI1_GUIDS_PER_PORT    5
-
-/*
- * Increment this value if any changes that break userspace ABI
- * compatibility are made.
- */
-#define HFI1_UVERBS_ABI_VERSION       2
-
-#define IB_SEQ_NAK     (3 << 29)
-
-/* AETH NAK opcode values */
-#define IB_RNR_NAK                      0x20
-#define IB_NAK_PSN_ERROR                0x60
-#define IB_NAK_INVALID_REQUEST          0x61
-#define IB_NAK_REMOTE_ACCESS_ERROR      0x62
-#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63
-#define IB_NAK_INVALID_RD_REQUEST       0x64
-
-/* IB Performance Manager status values */
-#define IB_PMA_SAMPLE_STATUS_DONE       0x00
-#define IB_PMA_SAMPLE_STATUS_STARTED    0x01
-#define IB_PMA_SAMPLE_STATUS_RUNNING    0x02
-
-/* Mandatory IB performance counter select values. */
-#define IB_PMA_PORT_XMIT_DATA   cpu_to_be16(0x0001)
-#define IB_PMA_PORT_RCV_DATA    cpu_to_be16(0x0002)
-#define IB_PMA_PORT_XMIT_PKTS   cpu_to_be16(0x0003)
-#define IB_PMA_PORT_RCV_PKTS    cpu_to_be16(0x0004)
-#define IB_PMA_PORT_XMIT_WAIT   cpu_to_be16(0x0005)
-
-#define HFI1_VENDOR_IPG                cpu_to_be16(0xFFA0)
-
-#define IB_BTH_REQ_ACK         BIT(31)
-#define IB_BTH_SOLICITED       BIT(23)
-#define IB_BTH_MIG_REQ         BIT(22)
-
-#define IB_GRH_VERSION         6
-#define IB_GRH_VERSION_MASK    0xF
-#define IB_GRH_VERSION_SHIFT   28
-#define IB_GRH_TCLASS_MASK     0xFF
-#define IB_GRH_TCLASS_SHIFT    20
-#define IB_GRH_FLOW_MASK       0xFFFFF
-#define IB_GRH_FLOW_SHIFT      0
-#define IB_GRH_NEXT_HDR                0x1B
-
-#define IB_DEFAULT_GID_PREFIX  cpu_to_be64(0xfe80000000000000ULL)
-
-/* flags passed by hfi1_ib_rcv() */
-enum {
-       HFI1_HAS_GRH = (1 << 0),
-};
-
-struct ib_reth {
-       __be64 vaddr;
-       __be32 rkey;
-       __be32 length;
-} __packed;
-
-struct ib_atomic_eth {
-       __be32 vaddr[2];        /* unaligned so access as 2 32-bit words */
-       __be32 rkey;
-       __be64 swap_data;
-       __be64 compare_data;
-} __packed;
-
-union ib_ehdrs {
-       struct {
-               __be32 deth[2];
-               __be32 imm_data;
-       } ud;
-       struct {
-               struct ib_reth reth;
-               __be32 imm_data;
-       } rc;
-       struct {
-               __be32 aeth;
-               __be32 atomic_ack_eth[2];
-       } at;
-       __be32 imm_data;
-       __be32 aeth;
-       struct ib_atomic_eth atomic_eth;
-}  __packed;
-
-struct hfi1_other_headers {
-       __be32 bth[3];
-       union ib_ehdrs u;
-} __packed;
-
-/*
- * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes
- * long (72 w/ imm_data).  Only the first 56 bytes of the IB header
- * will be in the eager header buffer.  The remaining 12 or 16 bytes
- * are in the data buffer.
- */
-struct hfi1_ib_header {
-       __be16 lrh[4];
-       union {
-               struct {
-                       struct ib_grh grh;
-                       struct hfi1_other_headers oth;
-               } l;
-               struct hfi1_other_headers oth;
-       } u;
-} __packed;
-
-struct ahg_ib_header {
-       struct sdma_engine *sde;
-       u32 ahgdesc[2];
-       u16 tx_flags;
-       u8 ahgcount;
-       u8 ahgidx;
-       struct hfi1_ib_header ibh;
-};
-
-struct hfi1_pio_header {
-       __le64 pbc;
-       struct hfi1_ib_header hdr;
-} __packed;
-
-/*
- * hfi1 specific data structures that will be hidden from rvt after the queue
- * pair is made common
- */
-struct hfi1_qp_priv {
-       struct ahg_ib_header *s_hdr;              /* next header to send */
-       struct sdma_engine *s_sde;                /* current sde */
-       struct send_context *s_sendcontext;       /* current sendcontext */
-       u8 s_sc;                                  /* SC[0..4] for next packet */
-       u8 r_adefered;                            /* number of acks deferred */
-       struct iowait s_iowait;                   /* embedded; see iowait_to_qp() */
-       struct timer_list s_rnr_timer;
-       struct rvt_qp *owner;                     /* returned by iowait_to_qp() */
-};
-
-/*
- * This structure holds commonly looked-up and computed values used while the
- * send engine makes progress.
- */
-struct hfi1_pkt_state {
-       struct hfi1_ibdev *dev;
-       struct hfi1_ibport *ibp;
-       struct hfi1_pportdata *ppd;
-       struct verbs_txreq *s_txreq;
-       unsigned long flags;
-};
-
-#define HFI1_PSN_CREDIT  16
-
-struct hfi1_opcode_stats {
-       u64 n_packets;          /* number of packets */
-       u64 n_bytes;            /* total number of bytes */
-};
-
-struct hfi1_opcode_stats_perctx {
-       struct hfi1_opcode_stats stats[256];
-};
-
-/* Account one packet of tlen bytes; compiles to nothing without debugfs. */
-static inline void inc_opstats(u32 tlen, struct hfi1_opcode_stats *stats)
-{
-#ifdef CONFIG_DEBUG_FS
-       stats->n_packets += 1;
-       stats->n_bytes += tlen;
-#endif
-}
-
-struct hfi1_ibport {
-       struct rvt_qp __rcu *qp[2];
-       struct rvt_ibport rvp;
-
-       __be64 guids[HFI1_GUIDS_PER_PORT        - 1];   /* writable GUIDs */
-
-       /* the first 16 entries are sl_to_vl for !OPA */
-       u8 sl_to_sc[32];
-       u8 sc_to_sl[32];
-};
-
-struct hfi1_ibdev {
-       struct rvt_dev_info rdi; /* Must be first */
-
-       /* QP numbers are shared by all IB ports */
-       /* protect wait lists */
-       seqlock_t iowait_lock;
-       struct list_head txwait;        /* list for wait verbs_txreq */
-       struct list_head memwait;       /* list for wait kernel memory */
-       struct list_head txreq_free;
-       struct kmem_cache *verbs_txreq_cache;
-       struct timer_list mem_timer;
-
-       u64 n_piowait;
-       u64 n_piodrain;
-       u64 n_txwait;
-       u64 n_kmem_wait;
-
-#ifdef CONFIG_DEBUG_FS
-       /* per HFI debugfs */
-       struct dentry *hfi1_ibdev_dbg;
-       /* per HFI symlinks to above */
-       struct dentry *hfi1_ibdev_link;
-#endif
-};
-
-static inline struct hfi1_ibdev *to_idev(struct ib_device *ibdev)
-{
-       /* ib_device sits inside rvt_dev_info, which sits inside hfi1_ibdev */
-       struct rvt_dev_info *rvt_info =
-               container_of(ibdev, struct rvt_dev_info, ibdev);
-
-       return container_of(rvt_info, struct hfi1_ibdev, rdi);
-}
-
-static inline struct rvt_qp *iowait_to_qp(struct  iowait *s_iowait)
-{
-       /* step from the embedded iowait to its hfi1_qp_priv, then to the QP */
-       struct hfi1_qp_priv *qpriv =
-               container_of(s_iowait, struct hfi1_qp_priv, s_iowait);
-
-       return qpriv->owner;
-}
-
-/*
- * Returns 1 when the QP may send, 0 otherwise: it must be neither busy
- * nor waiting for I/O, and must either have header words queued, have a
- * RC response pending, or not be waiting on any send condition.
- */
-static inline int hfi1_send_ok(struct rvt_qp *qp)
-{
-       if (qp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT_IO))
-               return 0;
-       if (qp->s_hdrwords)
-               return 1;
-       if (qp->s_flags & RVT_S_RESP_PENDING)
-               return 1;
-       return !(qp->s_flags & RVT_S_ANY_WAIT_SEND);
-}
-
-/*
- * This must be called with s_lock held.
- */
-void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl,
-                   u32 qp1, u32 qp2, u16 lid1, u16 lid2);
-void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num);
-void hfi1_sys_guid_chg(struct hfi1_ibport *ibp);
-void hfi1_node_desc_chg(struct hfi1_ibport *ibp);
-int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
-                    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
-                    const struct ib_mad_hdr *in_mad, size_t in_mad_size,
-                    struct ib_mad_hdr *out_mad, size_t *out_mad_size,
-                    u16 *out_mad_pkey_index);
-
-/*
- * The PSN_MASK and PSN_SHIFT allow for
- * 1) comparing two PSNs
- * 2) returning the PSN with any upper bits masked
- * 3) returning the difference between two PSNs
- *
- * The number of significant bits in the PSN must
- * necessarily be at least one bit less than
- * the container holding the PSN.
- */
-#ifndef CONFIG_HFI1_VERBS_31BIT_PSN
-#define PSN_MASK 0xFFFFFF
-#define PSN_SHIFT 8
-#else
-#define PSN_MASK 0x7FFFFFFF
-#define PSN_SHIFT 1
-#endif
-#define PSN_MODIFY_MASK 0xFFFFFF
-
-/*
- * Order two msn values by their lower 24 bits.
- * The result is negative, zero or positive, like a comparator.
- */
-static inline int cmp_msn(u32 a, u32 b)
-{
-       int diff = (int)a - (int)b;
-
-       /* shifting left by 8 discards the upper 8 bits of the difference */
-       return diff << 8;
-}
-
-/*
- * Order two PSNs.
- * The result is negative, zero or positive, like a comparator.
- */
-static inline int cmp_psn(u32 a, u32 b)
-{
-       int diff = (int)a - (int)b;
-
-       /* PSN_SHIFT pushes the bits above the PSN width out of the int */
-       return diff << PSN_SHIFT;
-}
-
-/*
- * Return masked PSN
- */
-static inline u32 mask_psn(u32 a)
-{
-       return a & PSN_MASK;
-}
-
-/*
- * Difference a - b between two PSNs, truncated to the PSN width.
- */
-static inline u32 delta_psn(u32 a, u32 b)
-{
-       int diff = ((int)a - (int)b) << PSN_SHIFT;
-
-       /* shift back down as a signed int, then convert to u32 on return */
-       return diff >> PSN_SHIFT;
-}
-
-struct verbs_txreq;
-void hfi1_put_txreq(struct verbs_txreq *tx);
-
-int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
-
-void hfi1_copy_sge(struct rvt_sge_state *ss, void *data, u32 length,
-                  int release, int copy_last);
-
-void hfi1_skip_sge(struct rvt_sge_state *ss, u32 length, int release);
-
-void hfi1_cnp_rcv(struct hfi1_packet *packet);
-
-void hfi1_uc_rcv(struct hfi1_packet *packet);
-
-void hfi1_rc_rcv(struct hfi1_packet *packet);
-
-void hfi1_rc_hdrerr(
-       struct hfi1_ctxtdata *rcd,
-       struct hfi1_ib_header *hdr,
-       u32 rcv_flags,
-       struct rvt_qp *qp);
-
-u8 ah_to_sc(struct ib_device *ibdev, struct ib_ah_attr *ah_attr);
-
-struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid);
-
-void hfi1_rc_rnr_retry(unsigned long arg);
-void hfi1_add_rnr_timer(struct rvt_qp *qp, u32 to);
-void hfi1_rc_timeout(unsigned long arg);
-void hfi1_del_timers_sync(struct rvt_qp *qp);
-void hfi1_stop_rc_timers(struct rvt_qp *qp);
-
-void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_ib_header *hdr);
-
-void hfi1_rc_error(struct rvt_qp *qp, enum ib_wc_status err);
-
-void hfi1_ud_rcv(struct hfi1_packet *packet);
-
-int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey);
-
-int hfi1_rvt_get_rwqe(struct rvt_qp *qp, int wr_id_only);
-
-void hfi1_migrate_qp(struct rvt_qp *qp);
-
-int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
-                        int attr_mask, struct ib_udata *udata);
-
-void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
-                   int attr_mask, struct ib_udata *udata);
-
-int hfi1_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe);
-
-extern const u32 rc_only_opcode;
-extern const u32 uc_only_opcode;
-
-static inline u8 get_opcode(struct hfi1_ib_header *h)
-{
-       u16 lnh = be16_to_cpu(h->lrh[0]) & 3;
-       struct hfi1_other_headers *ohdr;
-
-       /* IB_LNH_IBA_LOCAL selects u.oth; anything else selects u.l.oth */
-       ohdr = (lnh == IB_LNH_IBA_LOCAL) ? &h->u.oth : &h->u.l.oth;
-
-       /* the opcode is the top byte of bth[0] */
-       return be32_to_cpu(ohdr->bth[0]) >> 24;
-}
-
-int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr,
-                      int has_grh, struct rvt_qp *qp, u32 bth0);
-
-u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr,
-                 struct ib_global_route *grh, u32 hwords, u32 nwords);
-
-void hfi1_make_ruc_header(struct rvt_qp *qp, struct hfi1_other_headers *ohdr,
-                         u32 bth0, u32 bth2, int middle,
-                         struct hfi1_pkt_state *ps);
-
-void _hfi1_do_send(struct work_struct *work);
-
-void hfi1_do_send(struct rvt_qp *qp);
-
-void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
-                       enum ib_wc_status status);
-
-void hfi1_send_rc_ack(struct hfi1_ctxtdata *, struct rvt_qp *qp, int is_fecn);
-
-int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
-
-int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
-
-int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
-
-int hfi1_register_ib_device(struct hfi1_devdata *);
-
-void hfi1_unregister_ib_device(struct hfi1_devdata *);
-
-void hfi1_ib_rcv(struct hfi1_packet *packet);
-
-unsigned hfi1_get_npkeys(struct hfi1_devdata *);
-
-int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
-                       u64 pbc);
-
-int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
-                       u64 pbc);
-
-int hfi1_wss_init(void);
-void hfi1_wss_exit(void);
-
-/* platform specific: return the lowest level cache (llc) size, in KiB */
-static inline int wss_llc_size(void)
-{
-       /* assume that the boot CPU value is universal for all CPUs */
-       return boot_cpu_data.x86_cache_size;
-}
-
-/* platform specific: cacheless copy */
-static inline void cacheless_memcpy(void *dst, void *src, size_t n)
-{
-       /*
-        * Use the only available X64 cacheless copy.  Add a __user cast
-        * to quiet sparse.  The src argument is already in the kernel so
-        * there are no security issues.  The extra fault recovery machinery
-        * is not invoked.  The return value of the copy is ignored.
-        */
-       __copy_user_nocache(dst, (void __user *)src, n, 0);
-}
-
-extern const enum ib_wc_opcode ib_hfi1_wc_opcode[];
-
-extern const u8 hdr_len_by_opcode[];
-
-extern const int ib_rvt_state_ops[];
-
-extern __be64 ib_hfi1_sys_image_guid;    /* in network order */
-
-extern unsigned int hfi1_max_cqes;
-
-extern unsigned int hfi1_max_cqs;
-
-extern unsigned int hfi1_max_qp_wrs;
-
-extern unsigned int hfi1_max_qps;
-
-extern unsigned int hfi1_max_sges;
-
-extern unsigned int hfi1_max_mcast_grps;
-
-extern unsigned int hfi1_max_mcast_qp_attached;
-
-extern unsigned int hfi1_max_srqs;
-
-extern unsigned int hfi1_max_srq_sges;
-
-extern unsigned int hfi1_max_srq_wrs;
-
-extern unsigned short piothreshold;
-
-extern const u32 ib_hfi1_rnr_table[];
-
-#endif                          /* HFI1_VERBS_H */
diff --git a/drivers/staging/rdma/hfi1/verbs_txreq.c b/drivers/staging/rdma/hfi1/verbs_txreq.c
deleted file mode 100644 (file)
index bc95c41..0000000
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Copyright(c) 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "hfi.h"
-#include "verbs_txreq.h"
-#include "qp.h"
-#include "trace.h"
-
-#define TXREQ_LEN 24
-
-void hfi1_put_txreq(struct verbs_txreq *tx)
-{
-       struct hfi1_ibdev *dev;
-       struct rvt_qp *qp;
-       unsigned long flags;
-       unsigned int seq;
-       struct hfi1_qp_priv *priv;
-
-       qp = tx->qp; /* read now: tx is freed further down */
-       dev = to_idev(qp->ibqp.device);
-
-       if (tx->mr)
-               rvt_put_mr(tx->mr);
-
-       sdma_txclean(dd_from_dev(dev), &tx->txreq);
-
-       /* Free verbs_txreq and return to slab cache */
-       kmem_cache_free(dev->verbs_txreq_cache, tx);
-
-       do { /* unlocked peek at txwait, repeated if read_seqretry() asks */
-               seq = read_seqbegin(&dev->iowait_lock);
-               if (!list_empty(&dev->txwait)) {
-                       struct iowait *wait;
-
-                       write_seqlock_irqsave(&dev->iowait_lock, flags);
-                       wait = list_first_entry(&dev->txwait, struct iowait,
-                                               list);
-                       qp = iowait_to_qp(wait); /* qp now names the waiter */
-                       priv = qp->priv;
-                       list_del_init(&priv->s_iowait.list);
-                       /* refcount held until actual wake up */
-                       write_sequnlock_irqrestore(&dev->iowait_lock, flags);
-                       hfi1_qp_wakeup(qp, RVT_S_WAIT_TX);
-                       break; /* at most one waiter is woken per call */
-               }
-       } while (read_seqretry(&dev->iowait_lock, seq));
-}
-
-/*
- * Slow-path allocation of a verbs_txreq, done under qp->s_lock and the
- * device iowait_lock.
- *
- * Returns a valid request on success, or ERR_PTR(-EBUSY) when the QP
- * state does not allow it or the allocation fails.  On allocation
- * failure the QP is queued on dev->txwait (if not already queued) with
- * RVT_S_WAIT_TX set and RVT_S_BUSY cleared.  Never returns NULL.
- */
-struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev,
-                               struct rvt_qp *qp)
-{
-       struct verbs_txreq *tx = ERR_PTR(-EBUSY);
-       unsigned long flags;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-       write_seqlock(&dev->iowait_lock);
-       if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
-               struct hfi1_qp_priv *priv;
-
-               tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC);
-               if (tx)
-                       goto out;
-               /*
-                * The failed allocation left NULL in tx.  get_txreq() only
-                * tests IS_ERR(), so a NULL return would be dereferenced
-                * there; hand back the error pointer instead.
-                */
-               tx = ERR_PTR(-EBUSY);
-               priv = qp->priv;
-               if (list_empty(&priv->s_iowait.list)) {
-                       dev->n_txwait++;
-                       qp->s_flags |= RVT_S_WAIT_TX;
-                       list_add_tail(&priv->s_iowait.list, &dev->txwait);
-                       trace_hfi1_qpsleep(qp, RVT_S_WAIT_TX);
-                       /* reference taken for the txwait list entry */
-                       atomic_inc(&qp->refcount);
-               }
-               qp->s_flags &= ~RVT_S_BUSY;
-       }
-out:
-       write_sequnlock(&dev->iowait_lock);
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-       return tx;
-}
-
-/* Slab constructor: hand out verbs_txreq objects fully zeroed. */
-static void verbs_txreq_kmem_cache_ctor(void *obj)
-{
-       memset(obj, 0, sizeof(struct verbs_txreq));
-}
-
-/*
- * Create the per-unit verbs_txreq slab cache, named after dd->unit.
- * Returns 0 on success or -ENOMEM when the cache cannot be created.
- */
-int verbs_txreq_init(struct hfi1_ibdev *dev)
-{
-       struct hfi1_devdata *dd = dd_from_dev(dev);
-       char cache_name[TXREQ_LEN];
-
-       snprintf(cache_name, sizeof(cache_name), "hfi1_%u_vtxreq_cache",
-                dd->unit);
-       dev->verbs_txreq_cache =
-               kmem_cache_create(cache_name, sizeof(struct verbs_txreq), 0,
-                                 SLAB_HWCACHE_ALIGN,
-                                 verbs_txreq_kmem_cache_ctor);
-
-       return dev->verbs_txreq_cache ? 0 : -ENOMEM;
-}
-
-void verbs_txreq_exit(struct hfi1_ibdev *dev)
-{
-       kmem_cache_destroy(dev->verbs_txreq_cache); /* cache made by verbs_txreq_init() */
-       dev->verbs_txreq_cache = NULL; /* do not keep a pointer to the destroyed cache */
-}
diff --git a/drivers/staging/rdma/hfi1/verbs_txreq.h b/drivers/staging/rdma/hfi1/verbs_txreq.h
deleted file mode 100644 (file)
index 1cf69b2..0000000
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright(c) 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifndef HFI1_VERBS_TXREQ_H
-#define HFI1_VERBS_TXREQ_H
-
-#include <linux/types.h>
-#include <linux/slab.h>
-
-#include "verbs.h"
-#include "sdma_txreq.h"
-#include "iowait.h"
-
-struct verbs_txreq {
-       struct hfi1_pio_header  phdr;
-       struct sdma_txreq       txreq;
-       struct rvt_qp           *qp;
-       struct rvt_swqe         *wqe;
-       struct rvt_mregion      *mr;
-       struct rvt_sge_state    *ss;
-       struct sdma_engine     *sde;
-       struct send_context     *psc;
-       u16                     hdr_dwords;
-};
-
-struct hfi1_ibdev;
-struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev,
-                               struct rvt_qp *qp);
-
-/*
- * Allocate and pre-fill a verbs_txreq for qp.  If the direct slab
- * allocation fails, __get_txreq() is tried; an ERR_PTR from it is
- * returned to the caller unchanged.
- */
-static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev,
-                                           struct rvt_qp *qp)
-{
-       struct hfi1_qp_priv *qpriv = qp->priv;
-       struct verbs_txreq *req;
-
-       req = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC);
-       if (unlikely(!req)) {
-               /* slow path: retry with the locks held */
-               req = __get_txreq(dev, qp);
-               if (IS_ERR(req))
-                       return req;
-       }
-
-       /* zero num_desc so we can test if the sdma descriptors are there */
-       req->txreq.num_desc = 0;
-       req->psc = qpriv->s_sendcontext;
-       req->sde = qpriv->s_sde;
-       req->mr = NULL;
-       req->qp = qp;
-       return req;
-}
-
-static inline struct sdma_txreq *get_sdma_txreq(struct verbs_txreq *tx)
-{
-       return &tx->txreq;
-}
-
-/*
- * Return the verbs_txreq wrapping the sdma_txreq that iowait_get_txhead()
- * yields for this QP, or NULL when it yields none.
- */
-static inline struct verbs_txreq *get_waiting_verbs_txreq(struct rvt_qp *qp)
-{
-       struct hfi1_qp_priv *qpriv = qp->priv;
-       struct sdma_txreq *head = iowait_get_txhead(&qpriv->s_iowait);
-
-       if (!head)
-               return NULL;
-       return container_of(head, struct verbs_txreq, txreq);
-}
-
-void hfi1_put_txreq(struct verbs_txreq *tx);
-int verbs_txreq_init(struct hfi1_ibdev *dev);
-void verbs_txreq_exit(struct hfi1_ibdev *dev);
-
-#endif                         /* HFI1_VERBS_TXREQ_H */
index 0f3daae..b13419c 100644 (file)
@@ -103,6 +103,9 @@ enum {
        IB_OPCODE_ATOMIC_ACKNOWLEDGE                = 0x12,
        IB_OPCODE_COMPARE_SWAP                      = 0x13,
        IB_OPCODE_FETCH_ADD                         = 0x14,
+       /* opcode 0x15 is reserved */
+       IB_OPCODE_SEND_LAST_WITH_INVALIDATE         = 0x16,
+       IB_OPCODE_SEND_ONLY_WITH_INVALIDATE         = 0x17,
 
        /* real constants follow -- see comment about above IB_OPCODE()
           macro for more details */
@@ -129,6 +132,8 @@ enum {
        IB_OPCODE(RC, ATOMIC_ACKNOWLEDGE),
        IB_OPCODE(RC, COMPARE_SWAP),
        IB_OPCODE(RC, FETCH_ADD),
+       IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE),
+       IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE),
 
        /* UC */
        IB_OPCODE(UC, SEND_FIRST),
index d57ceee..16274e2 100644 (file)
@@ -149,15 +149,15 @@ struct rvt_driver_params {
        int qpn_res_end;
        int nports;
        int npkeys;
-       u8 qos_shift;
        char cq_name[RVT_CQN_MAX];
        int node;
-       int max_rdma_atomic;
        int psn_mask;
        int psn_shift;
        int psn_modify_mask;
        u32 core_cap_flags;
        u32 max_mad_size;
+       u8 qos_shift;
+       u8 max_rdma_atomic;
 };
 
 /* Protection domain */
@@ -425,6 +425,15 @@ static inline unsigned rvt_get_npkeys(struct rvt_dev_info *rdi)
        return rdi->dparms.npkeys;
 }
 
+/*
+ * Return max_rdma_atomic plus one, a count suitable for determining
+ * the size of the ack ring buffer in a QP.
+ */
+static inline unsigned int rvt_max_atomic(struct rvt_dev_info *rdi)
+{
+       return rdi->dparms.max_rdma_atomic + 1;
+}
+
 /*
  * Return the indexed PKEY from the port PKEY table.
  */
index 0e1ff2a..6d23b87 100644 (file)
@@ -211,8 +211,6 @@ struct rvt_mmap_info {
        unsigned size;
 };
 
-#define RVT_MAX_RDMA_ATOMIC    16
-
 /*
  * This structure holds the information that the send tasklet needs
  * to send a RDMA read response or atomic operation.
@@ -282,8 +280,7 @@ struct rvt_qp {
        atomic_t refcount ____cacheline_aligned_in_smp;
        wait_queue_head_t wait;
 
-       struct rvt_ack_entry s_ack_queue[RVT_MAX_RDMA_ATOMIC + 1]
-               ____cacheline_aligned_in_smp;
+       struct rvt_ack_entry *s_ack_queue;
        struct rvt_sge_state s_rdma_read_sge;
 
        spinlock_t r_lock ____cacheline_aligned_in_smp;      /* used for APM */
index a533cec..98bebf8 100644 (file)
@@ -66,7 +66,7 @@
  * The major version changes when data structures change in an incompatible
  * way. The driver must be the same for initialization to succeed.
  */
-#define HFI1_USER_SWMAJOR 5
+#define HFI1_USER_SWMAJOR 6
 
 /*
  * Minor version differences are always compatible
  * may not be implemented; the user code must deal with this if it
  * cares, or it must abort after initialization reports the difference.
  */
-#define HFI1_USER_SWMINOR 0
+#define HFI1_USER_SWMINOR 1
+
+/*
+ * We will encode the major/minor inside a single 32bit version number.
+ */
+#define HFI1_SWMAJOR_SHIFT 16
 
 /*
  * Set of HW and driver capability/feature bits.
 #define HFI1_RCVHDR_ENTSIZE_16   (1UL << 1)
 #define HFI1_RCVDHR_ENTSIZE_32   (1UL << 2)
 
-/*
- * If the unit is specified via open, HFI choice is fixed.  If port is
- * specified, it's also fixed.  Otherwise we try to spread contexts
- * across ports and HFIs, using different algorithms.  WITHIN is
- * the old default, prior to this mechanism.
- */
-#define HFI1_ALG_ACROSS 0 /* round robin contexts across HFIs, then
-                         * ports; this is the default */
-#define HFI1_ALG_WITHIN 1 /* use all contexts on an HFI (round robin
-                         * active ports within), then next HFI */
-#define HFI1_ALG_COUNT  2 /* number of algorithm choices */
-
-
 /* User commands. */
 #define HFI1_CMD_ASSIGN_CTXT     1     /* allocate HFI and context */
 #define HFI1_CMD_CTXT_INFO       2     /* find out what resources we got */
 #define HFI1_CMD_TID_UPDATE      4     /* update expected TID entries */
 #define HFI1_CMD_TID_FREE        5     /* free expected TID entries */
 #define HFI1_CMD_CREDIT_UPD      6     /* force an update of PIO credit */
-#define HFI1_CMD_SDMA_STATUS_UPD 7      /* force update of SDMA status ring */
 
 #define HFI1_CMD_RECV_CTRL       8     /* control receipt of packets */
 #define HFI1_CMD_POLL_TYPE       9     /* set the kind of polling we want */
 #define HFI1_CMD_SET_PKEY        11     /* set context's pkey */
 #define HFI1_CMD_CTXT_RESET      12     /* reset context's HW send context */
 #define HFI1_CMD_TID_INVAL_READ  13     /* read TID cache invalidations */
-/* separate EPROM commands from normal PSM commands */
-#define HFI1_CMD_EP_INFO         64      /* read EPROM device ID */
-#define HFI1_CMD_EP_ERASE_CHIP   65      /* erase whole EPROM */
-/* range 66-74 no longer used */
-#define HFI1_CMD_EP_ERASE_RANGE  75      /* erase EPROM range */
-#define HFI1_CMD_EP_READ_RANGE   76      /* read EPROM range */
-#define HFI1_CMD_EP_WRITE_RANGE  77      /* write EPROM range */
+#define HFI1_CMD_GET_VERS       14     /* get the version of the user cdev */
+
+/*
+ * User IOCTLs cannot go above 128; if they do, see common.h and change the
+ * base for the snoop ioctl.
+ */
+#define IB_IOCTL_MAGIC 0x1b /* See Documentation/ioctl/ioctl-number.txt */
+
+/*
+ * Make the ioctls occupy the 0xe0-0xef portion of the IB range
+ */
+#define __NUM(cmd) (HFI1_CMD_##cmd + 0xe0)
+
+struct hfi1_cmd;
+#define HFI1_IOCTL_ASSIGN_CTXT \
+       _IOWR(IB_IOCTL_MAGIC, __NUM(ASSIGN_CTXT), struct hfi1_user_info)
+#define HFI1_IOCTL_CTXT_INFO \
+       _IOW(IB_IOCTL_MAGIC, __NUM(CTXT_INFO), struct hfi1_ctxt_info)
+#define HFI1_IOCTL_USER_INFO \
+       _IOW(IB_IOCTL_MAGIC, __NUM(USER_INFO), struct hfi1_base_info)
+#define HFI1_IOCTL_TID_UPDATE \
+       _IOWR(IB_IOCTL_MAGIC, __NUM(TID_UPDATE), struct hfi1_tid_info)
+#define HFI1_IOCTL_TID_FREE \
+       _IOWR(IB_IOCTL_MAGIC, __NUM(TID_FREE), struct hfi1_tid_info)
+#define HFI1_IOCTL_CREDIT_UPD \
+       _IO(IB_IOCTL_MAGIC, __NUM(CREDIT_UPD))
+#define HFI1_IOCTL_RECV_CTRL \
+       _IOW(IB_IOCTL_MAGIC, __NUM(RECV_CTRL), int)
+#define HFI1_IOCTL_POLL_TYPE \
+       _IOW(IB_IOCTL_MAGIC, __NUM(POLL_TYPE), int)
+#define HFI1_IOCTL_ACK_EVENT \
+       _IOW(IB_IOCTL_MAGIC, __NUM(ACK_EVENT), unsigned long)
+#define HFI1_IOCTL_SET_PKEY \
+       _IOW(IB_IOCTL_MAGIC, __NUM(SET_PKEY), __u16)
+#define HFI1_IOCTL_CTXT_RESET \
+       _IO(IB_IOCTL_MAGIC, __NUM(CTXT_RESET))
+#define HFI1_IOCTL_TID_INVAL_READ \
+       _IOWR(IB_IOCTL_MAGIC, __NUM(TID_INVAL_READ), struct hfi1_tid_info)
+#define HFI1_IOCTL_GET_VERS \
+       _IOR(IB_IOCTL_MAGIC, __NUM(GET_VERS), int)
 
 #define _HFI1_EVENT_FROZEN_BIT         0
 #define _HFI1_EVENT_LINKDOWN_BIT       1
@@ -199,9 +223,7 @@ struct hfi1_user_info {
         * Should be set to HFI1_USER_SWVERSION.
         */
        __u32 userversion;
-       __u16 pad;
-       /* HFI selection algorithm, if unit has not selected */
-       __u16 hfi1_alg;
+       __u32 pad;
        /*
         * If two or more processes wish to share a context, each process
         * must set the subcontext_cnt and subcontext_id to the same
@@ -243,12 +265,6 @@ struct hfi1_tid_info {
        __u32 length;
 };
 
-struct hfi1_cmd {
-       __u32 type;        /* command type */
-       __u32 len;         /* length of struct pointed to by add */
-       __u64 addr;        /* pointer to user structure */
-};
-
 enum hfi1_sdma_comp_state {
        FREE = 0,
        QUEUED,